Octopus
phase.F90
Go to the documentation of this file.
1!! Copyright (C) 2009 X. Andrade
2!! Copyright (C) 2024 N. Tancogne-Dejean
3!!
4!! This program is free software; you can redistribute it and/or modify
5!! it under the terms of the GNU General Public License as published by
6!! the Free Software Foundation; either version 2, or (at your option)
7!! any later version.
8!!
9!! This program is distributed in the hope that it will be useful,
10!! but WITHOUT ANY WARRANTY; without even the implied warranty of
11!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12!! GNU General Public License for more details.
13!!
14!! You should have received a copy of the GNU General Public License
15!! along with this program; if not, write to the Free Software
16!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17!! 02110-1301, USA.
18!!
19
20#include "global.h"
21
22module phase_oct_m
23 use accel_oct_m
24 use batch_oct_m
27 use comm_oct_m
28 use debug_oct_m
32 use global_oct_m
33 use grid_oct_m
36 use math_oct_m
37 use mesh_oct_m
39 use mpi_oct_m
41 use space_oct_m
45 use types_oct_m
47
48 implicit none
49
50 private
51
52 public :: &
53 phase_t, &
55
  !> A container for the Bloch phase.
  !!
  !! phase(ip, ik) holds exp(-i k.x) on every mesh point (including boundary
  !! points) for each locally stored k-point; phase_corr holds the extra factor
  !! relating a boundary point to its periodic image (see phase_init_phases).
  type phase_t
    private
    !> Bloch phase exp(-i k.x), indexed (1:np_part, kpt%start:kpt%end)
    complex(real64), allocatable :: phase(:, :)
    !> phase correction on boundary points only, indexed (np+1:np_part, kpt%start:kpt%end)
    complex(real64), public, allocatable :: phase_corr(:,:)
    !> spiral-boundary phase for the two spin channels, indexed (1:np_part-sp, 1:2)
    complex(real64), allocatable :: phase_spiral(:,:)
    type(accel_mem_t) :: buff_phase                 !< device mirror of phase
    type(accel_mem_t) :: buff_phase_spiral          !< device mirror of phase_spiral
    type(accel_mem_t), public :: buff_phase_corr    !< device mirror of phase_corr
    integer :: buff_phase_qn_start                  !< first local k-point index stored in buff_phase
    !> spin expectation values per state and k-point; set externally, read by apply_phase_spiral
    real(real64), public, pointer :: spin(:,:,:) => null()
  contains
    procedure :: init => phase_init_phases                  !< build the phase arrays (and GPU buffers)
    procedure :: update => phase_update_phases              !< recompute phases for a uniform vector potential
    procedure :: end => phase_end                           !< release host arrays and device buffers
    procedure :: set_phase_corr => phase_set_phase_corr     !< multiply a batch by the phase (inner points)
    procedure :: unset_phase_corr => phase_unset_phase_corr !< remove the phase from a batch (inner points)
    procedure :: apply_to => phase_apply_batch              !< apply/remove the phase on a batch
    procedure :: apply_to_single => phase_apply_mf          !< apply/remove the phase on a single function
    procedure :: apply_phase_spiral => phase_phase_spiral   !< apply the spin-spiral boundary phase
    procedure :: is_allocated => phase_is_allocated         !< whether the phase array exists
    procedure :: copy_and_set_phase => phase_copy_and_set_phase !< copy a batch, applying phase + boundaries
  end type phase_t
123
124contains
125
126 ! ---------------------------------------------------------
  ! ---------------------------------------------------------
  !> Initialize the phase arrays and copy the data to the GPU.
  !!
  !! For every local k-point, phase(1:np_part, ik) = exp(-i k.x) is built on all
  !! mesh points, and phase_corr(np+1:np_part, ik) is the correction factor that
  !! maps a boundary point to its periodic image inside the simulation box.
  !! With spiral boundary conditions an additional two-channel phase_spiral
  !! array is built. Nothing is done for Gamma-point-only calculations.
  subroutine phase_init_phases(phase, gr, kpt, kpoints, d, space)
    class(phase_t), intent(inout) :: phase     !< phase container to fill
    class(mesh_t), intent(in) :: gr            !< mesh; spiral phases need the dynamic type to be grid_t
    type(distributed_t), intent(in) :: kpt     !< k-point distribution (local range kpt%start:kpt%end)
    type(kpoints_t), intent(in) :: kpoints     !< the k-point set
    type(states_elec_dim_t), intent(in) :: d   !< maps local ik to the global k-point index
    type(space_t), intent(in) :: space         !< spatial dimension / periodicity

    integer :: ip, ik, sp
    integer(int64) :: ip_inner_global
    real(real64) :: kpoint(space%dim), x_global(space%dim)

    push_sub(phase_init_phases)

    ! no e^ik phase needed for Gamma-point-only periodic calculations
    ! unless for velocity-gauge for lasers
    if (accel_is_enabled()) then
      phase%buff_phase_qn_start = kpt%start
    end if
    if(kpoints%gamma_only()) then
      pop_sub(phase_init_phases)
      return
    end if

    safe_allocate(phase%phase(1:gr%np_part, kpt%start:kpt%end))
    safe_allocate(phase%phase_corr(gr%np+1:gr%np_part, kpt%start:kpt%end))
    ! Default the boundary correction to one; the true values are set below.
    ! (ik is a sequential-loop index inside the parallel region and is
    ! predetermined private per the OpenMP rules.)
    !$omp parallel private(ip)
    do ik = kpt%start, kpt%end
      !$omp do
      do ip = gr%np + 1, gr%np_part
        phase%phase_corr(ip, ik) = m_one
      end do
      !$omp end do nowait
    end do
    !$omp end parallel

    ! Only when gr is a grid_t type, we can access gr%der
    select type(gr)
    class is(grid_t)
      if (gr%der%boundaries%spiralBC) then
        ! sp is the index of the last non-boundary point (ghosts included when
        ! domain-parallel); boundary points are sp+1:np_part
        sp = gr%np
        if (gr%parallel_in_domains) sp = gr%np + gr%pv%np_ghost

        ! We decided to allocate the array from 1:np_part-sp as this is less error prone when passing
        ! the array to other routines, or in particular creating a C-style pointer from phase_spiral(1,1).
        ! We will also update phase_corr and possible other similar arrays.

        safe_allocate(phase%phase_spiral(1:gr%np_part-sp, 1:2))

        ! loop over boundary points
        do ip = sp + 1, gr%np_part
          ! get corresponding inner point
          ip_inner_global = mesh_periodic_point(gr, space, ip)
          x_global = mesh_x_global(gr, ip_inner_global)
          ! spin-up and spin-down channels carry opposite spiral phases
          phase%phase_spiral(ip-sp, 1) = &
            exp(m_zi * sum((gr%x(1:space%dim, ip)-x_global(1:space%dim)) * gr%der%boundaries%spiral_q(1:space%dim)))
          phase%phase_spiral(ip-sp, 2) = &
            exp(-m_zi * sum((gr%x(1:space%dim, ip)-x_global(1:space%dim)) * gr%der%boundaries%spiral_q(1:space%dim)))
        end do

        if (accel_is_enabled()) then
          call accel_create_buffer(phase%buff_phase_spiral, accel_mem_read_only, type_cmplx, (gr%np_part-sp)*2)
          call accel_write_buffer(phase%buff_phase_spiral, gr%np_part-sp, 2, phase%phase_spiral)
        end if
      end if
    class default
      ! Do nothing
    end select

    kpoint(1:space%dim) = m_zero

    sp = gr%np
    if (gr%parallel_in_domains) sp = gr%np + gr%pv%np_ghost
    !$omp parallel private(ip, ip_inner_global, x_global, kpoint)
    do ik = kpt%start, kpt%end
      kpoint(1:space%dim) = kpoints%get_point(d%get_kpoint_index(ik))
      !$omp do
      do ip = 1, gr%np_part
        phase%phase(ip, ik) = exp(-m_zi * sum(gr%x(1:space%dim, ip) * kpoint(1:space%dim)))
      end do
      !$omp end do

      ! loop over boundary points
      !$omp do
      do ip = sp + 1, gr%np_part
        ! get corresponding inner point
        ip_inner_global = mesh_periodic_point(gr, space, ip)

        ! compute phase correction from global coordinate (opposite sign!)
        x_global = mesh_x_global(gr, ip_inner_global)
        phase%phase_corr(ip, ik) = phase%phase(ip, ik)* &
          exp(m_zi * sum(x_global(1:space%dim) * kpoint(1:space%dim)))
      end do
      !$omp end do nowait
    end do
    !$omp end parallel

    ! Mirror the freshly computed arrays on the device.
    if (accel_is_enabled()) then
      call accel_create_buffer(phase%buff_phase, accel_mem_read_write, type_cmplx, gr%np_part*kpt%nlocal)
      call accel_write_buffer(phase%buff_phase, gr%np_part, kpt%nlocal, phase%phase)
      call accel_create_buffer(phase%buff_phase_corr, accel_mem_read_write, type_cmplx, (gr%np_part - gr%np)*kpt%nlocal)
      call accel_write_buffer(phase%buff_phase_corr, gr%np_part - gr%np, kpt%nlocal, phase%phase_corr)
    end if

    pop_sub(phase_init_phases)
  end subroutine phase_init_phases
236
237 ! ----------------------------------------------------------------------------------
  ! ----------------------------------------------------------------------------------
  !> Update the phases for a uniform vector potential.
  !!
  !! Recomputes phase and phase_corr with the shifted wave vector
  !! k + A_uniform. Returns immediately when no uniform vector potential is
  !! allocated. On GPU runs the recomputation happens in the 'update_phases'
  !! kernel and the results are read back to the host arrays.
  subroutine phase_update_phases(phase, mesh, kpt, kpoints, d, space, uniform_vector_potential)
    class(phase_t), intent(inout) :: phase
    class(mesh_t), intent(in) :: mesh
    type(distributed_t), intent(in) :: kpt
    type(kpoints_t), intent(in) :: kpoints
    type(states_elec_dim_t), intent(in) :: d
    type(space_t), intent(in) :: space
    real(real64), allocatable, intent(in) :: uniform_vector_potential(:)  !< absent -> nothing to do

    integer :: ik, ip, sp
    integer(int64), dimension(2) :: np, gsize, bsize
    integer(int64) :: ip_inner_global
    real(real64) :: kpoint(space%dim)
    real(real64), allocatable :: x_global(:,:), kpt_vec_pot(:,:)
    type(accel_mem_t) :: buff_vec_pot, buff_x_global, buff_x
    type(accel_kernel_t), save :: kernel
    real(real64) :: tmp_sum

    if (.not. allocated(uniform_vector_potential)) return

    push_sub_with_profile(phase_update_phases)


    ! Lazily create the host arrays and device buffers, e.g. when the run
    ! started Gamma-only and phase_init_phases skipped the allocation.
    if (.not. allocated(phase%phase)) then
      safe_allocate(phase%phase(1:mesh%np_part, kpt%start:kpt%end))
      if (accel_is_enabled()) then
        call accel_create_buffer(phase%buff_phase, accel_mem_read_write, type_cmplx, &
          mesh%np_part*kpt%nlocal)
      end if
    end if

    if (.not. allocated(phase%phase_corr)) then
      safe_allocate(phase%phase_corr(mesh%np+1:mesh%np_part, kpt%start:kpt%end))
      if (accel_is_enabled()) then
        call accel_create_buffer(phase%buff_phase_corr, accel_mem_read_write, type_cmplx, &
          (mesh%np_part - mesh%np)*kpt%nlocal)
      end if
    end if

    ! TODO: We should not recompute this every time-step. We should store it.

    ! loop over boundary points
    sp = mesh%np
    ! skip ghost points
    if (mesh%parallel_in_domains) sp = mesh%np + mesh%pv%np_ghost

    safe_allocate(x_global(1:space%dim,(sp + 1):mesh%np_part))

    !$omp parallel do schedule(static) private(ip_inner_global)
    do ip = sp + 1, mesh%np_part
      ! get corresponding inner point
      ip_inner_global = mesh_periodic_point(mesh, space, ip)
      ! compute the difference between the global coordinate and the local coordinate
      x_global(:,ip) = mesh_x_global(mesh, ip_inner_global) - mesh%x(1:space%dim, ip)
    end do


    if (.not. accel_is_enabled()) then
      !$omp parallel private(ik, ip, kpoint, tmp_sum)
      do ik = kpt%start, kpt%end
        kpoint(1:space%dim) = kpoints%get_point(d%get_kpoint_index(ik))
        !We add the vector potential
        kpoint(1:space%dim) = kpoint(1:space%dim) + uniform_vector_potential(1:space%dim)

        ! phase = exp(-i (k+A).x), written via cos/sin of the real argument
        !$omp do schedule(static)
        do ip = 1, mesh%np_part
          tmp_sum = sum(mesh%x(1:space%dim, ip)*kpoint(1:space%dim))
          phase%phase(ip, ik) = cmplx(cos(tmp_sum), -sin(tmp_sum), real64)
        end do
        !$omp end do

        ! boundary correction = exp(+i (k+A).(x_global - x))
        !$omp do schedule(static)
        do ip = sp + 1, mesh%np_part
          tmp_sum = sum(x_global(1:space%dim, ip)*kpoint(1:space%dim))
          phase%phase_corr(ip, ik) = cmplx(cos(tmp_sum), sin(tmp_sum), real64)
        end do
        !$omp end do nowait
      end do
      !$omp end parallel

    else !accel_is enabled

      ! Upload k+A per local k-point, the mesh coordinates, and the boundary
      ! coordinate differences; the writes are asynchronous and are ordered
      ! before the kernel launch on the same queue.
      call accel_create_buffer(buff_vec_pot, accel_mem_read_only, type_float, space%dim*kpt%nlocal)
      safe_allocate(kpt_vec_pot(1:space%dim,kpt%start:kpt%end))
      do ik = kpt%start, kpt%end
        kpoint(1:space%dim) = kpoints%get_point(d%get_kpoint_index(ik))
        kpt_vec_pot(1:space%dim, ik) = kpoint(1:space%dim) + uniform_vector_potential(1:space%dim)
      end do
      call accel_write_buffer(buff_vec_pot, space%dim, kpt%nlocal, kpt_vec_pot, async=.true.)

      ! Note: this should be globally stored
      call accel_create_buffer(buff_x, accel_mem_read_only, type_float, space%dim*(mesh%np_part))
      call accel_write_buffer(buff_x, space%dim, mesh%np_part, mesh%x, async=.true.)

      call accel_create_buffer(buff_x_global, accel_mem_read_only, type_float, space%dim*(mesh%np_part-sp))
      call accel_write_buffer(buff_x_global, space%dim, mesh%np_part-sp, x_global(1:space%dim,(sp + 1):mesh%np_part), async=.true.)

      call accel_kernel_start_call(kernel, 'phase.cu', 'update_phases')

      call accel_set_kernel_arg(kernel, 0, space%dim)
      call accel_set_kernel_arg(kernel, 1, mesh%np)
      call accel_set_kernel_arg(kernel, 2, mesh%np_part)
      call accel_set_kernel_arg(kernel, 3, kpt%start)
      call accel_set_kernel_arg(kernel, 4, kpt%end)
      call accel_set_kernel_arg(kernel, 5, sp)
      call accel_set_kernel_arg(kernel, 6, buff_vec_pot)
      call accel_set_kernel_arg(kernel, 7, buff_x)
      call accel_set_kernel_arg(kernel, 8, buff_x_global)
      call accel_set_kernel_arg(kernel, 9, phase%buff_phase)
      call accel_set_kernel_arg(kernel, 10, phase%buff_phase_corr)

      ! Compute the grid size
      np = (/mesh%np_part, kpt%nlocal/)
      bsize = (/accel_kernel_block_size(kernel), 1/)
      call accel_grid_size(np, bsize, gsize)

      call accel_kernel_run(kernel, gsize, bsize)

      ! Read results back to the host copies; the second (synchronous) read
      ! also serves as the barrier for the first.
      call accel_read_buffer(phase%buff_phase, mesh%np_part, kpt%nlocal, phase%phase, async=.true.)
      call accel_read_buffer(phase%buff_phase_corr, mesh%np_part - mesh%np, kpt%nlocal, phase%phase_corr)

      call accel_free_buffer(buff_vec_pot)
      call accel_free_buffer(buff_x)
      call accel_free_buffer(buff_x_global)
      safe_deallocate_a(kpt_vec_pot)
    end if

    safe_deallocate_a(x_global)

    pop_sub_with_profile(phase_update_phases)
  end subroutine phase_update_phases
370
371 ! ----------------------------------------------------------------------------------
373 subroutine phase_end(phase)
374 class(phase_t), intent(inout) :: phase
375
376 push_sub(phase_end)
377
378 if (phase%is_allocated() .and. accel_is_enabled()) then
379 call accel_free_buffer(phase%buff_phase)
380 call accel_free_buffer(phase%buff_phase_corr)
381 end if
382
383 if (allocated(phase%phase_spiral) .and. accel_is_enabled()) then
384 call accel_free_buffer(phase%buff_phase_spiral)
385 end if
386
387 safe_deallocate_a(phase%phase)
388 safe_deallocate_a(phase%phase_corr)
389 safe_deallocate_a(phase%phase_spiral)
390
391 pop_sub(phase_end)
392 end subroutine phase_end
393
394 ! ----------------------------------------------------------------------------------
396 subroutine phase_accel_rebuild(phase, mesh, kpt)
397 class(phase_t), intent(inout) :: phase
398 class(mesh_t), intent(in) :: mesh
399 type(distributed_t), intent(in) :: kpt
400
401 integer :: nlocal
402
403 push_sub(phase_accel_rebuild)
404
405 if (.not. accel_is_enabled()) then
406 pop_sub(phase_accel_rebuild)
407 return
408 end if
409
410 phase%buff_phase_qn_start = kpt%start
411
412 call accel_detach_buffer(phase%buff_phase)
413 call accel_detach_buffer(phase%buff_phase_corr)
414 call accel_detach_buffer(phase%buff_phase_spiral)
415
416 if (allocated(phase%phase)) then
417 assert(size(phase%phase, 1) == mesh%np_part)
418 nlocal = ubound(phase%phase, dim=2) - lbound(phase%phase, dim=2) + 1
419 call accel_create_buffer(phase%buff_phase, accel_mem_read_write, type_cmplx, size(phase%phase, 1)*nlocal)
420 call accel_write_buffer(phase%buff_phase, size(phase%phase, 1), nlocal, phase%phase)
421 end if
422
423 if (allocated(phase%phase_corr)) then
424 assert(size(phase%phase_corr, 1) == mesh%np_part - mesh%np)
425 nlocal = ubound(phase%phase_corr, dim=2) - lbound(phase%phase_corr, dim=2) + 1
426 call accel_create_buffer(phase%buff_phase_corr, accel_mem_read_write, type_cmplx, &
427 size(phase%phase_corr, 1)*nlocal)
428 call accel_write_buffer(phase%buff_phase_corr, size(phase%phase_corr, 1), nlocal, phase%phase_corr)
429 end if
430
431 if (allocated(phase%phase_spiral)) then
432 call accel_create_buffer(phase%buff_phase_spiral, accel_mem_read_only, type_cmplx, &
433 size(phase%phase_spiral, 1)*size(phase%phase_spiral, 2))
434 call accel_write_buffer(phase%buff_phase_spiral, size(phase%phase_spiral, 1), &
435 size(phase%phase_spiral, 2), phase%phase_spiral)
436 end if
437
438 pop_sub(phase_accel_rebuild)
439 end subroutine phase_accel_rebuild
440
441 ! ----------------------------------------------------------------------------------
443 !
444 subroutine phase_set_phase_corr(phase, mesh, psib, async)
445 class(phase_t), intent(in) :: phase
446 class(mesh_t), intent(in) :: mesh
447 type(wfs_elec_t), intent(inout) :: psib
448 logical, optional, intent(in) :: async
449
450
451 logical :: phase_correction
452
453 push_sub(phase_set_phase_corr)
454
455 ! check if we only want a phase correction for the boundary points
456 phase_correction = phase%is_allocated()
457
458 !We apply the phase only to np points, and the phase for the np+1 to np_part points
459 !will be treated as a phase correction in the Hamiltonian
460 if (phase_correction) then
461 call phase%apply_to(mesh, mesh%np, .false., psib, async=async)
462 end if
463
464 pop_sub(phase_set_phase_corr)
465 end subroutine phase_set_phase_corr
466
467 ! ----------------------------------------------------------------------------------
469 !
470 subroutine phase_unset_phase_corr(phase, mesh, psib, async)
471 class(phase_t), intent(in) :: phase
472 class(mesh_t), intent(in) :: mesh
473 type(wfs_elec_t), intent(inout) :: psib
474 logical, optional, intent(in) :: async
475
476 logical :: phase_correction
477
478 push_sub(phase_unset_phase_corr)
479
480 ! check if we only want a phase correction for the boundary points
481 phase_correction = phase%is_allocated()
482
483 !We apply the phase only to np points, and the phase for the np+1 to np_part points
484 !will be treated as a phase correction in the Hamiltonian
485 if (phase_correction) then
486 call phase%apply_to(mesh, mesh%np, .true., psib, async=async)
487 end if
488
490 end subroutine phase_unset_phase_corr
492 ! ---------------------------------------------------------------------------------------
494 !
495 subroutine phase_apply_batch(this, mesh, np, conjugate, psib, src, async)
496 class(phase_t), intent(in) :: this
497 class(mesh_t), intent(in) :: mesh
498 integer, intent(in) :: np
499 logical, intent(in) :: conjugate
500 type(wfs_elec_t), target, intent(inout) :: psib
501 type(wfs_elec_t), optional, target, intent(in) :: src
502 logical, optional, intent(in) :: async
503
504 integer :: ip, ii, sp
505 type(wfs_elec_t), pointer :: src_
506 complex(real64) :: phase
507 integer(int64), dimension(3) :: gsizes, bsizes
508 type(accel_kernel_t), save :: ker_phase
509
510 push_sub(phase_apply_batch)
511 call profiling_in("PHASE_APPLY_BATCH")
512
513 call profiling_count_operations(6*np*psib%nst_linear)
514
515 assert(np <= mesh%np_part)
516 assert(psib%type() == type_cmplx)
517 assert(psib%ik >= lbound(this%phase, dim=2))
518 assert(psib%ik <= ubound(this%phase, dim=2))
519
520 src_ => psib
521 if (present(src)) src_ => src
522
523 assert(src_%has_phase .eqv. conjugate)
524 assert(src_%ik == psib%ik)
525 assert(src_%type() == type_cmplx)
526
527 ! We want to skip the ghost points for setting the phase
528 sp = min(np, mesh%np)
529 if (np > mesh%np .and. mesh%parallel_in_domains) sp = mesh%np + mesh%pv%np_ghost
530
531 select case (psib%status())
532 case (batch_packed)
533
534 if (conjugate) then
535
536 !$omp parallel private(ii, phase)
537 !$omp do
538 do ip = 1, min(mesh%np, np)
539 phase = conjg(this%phase(ip, psib%ik))
540 !$omp simd
541 do ii = 1, psib%nst_linear
542 psib%zff_pack(ii, ip) = phase*src_%zff_pack(ii, ip)
543 end do
544 end do
545 !$omp end do nowait
546
547 ! Boundary points, if requested
548 !$omp do
549 do ip = sp+1, np
550 phase = conjg(this%phase(ip, psib%ik))
551 !$omp simd
552 do ii = 1, psib%nst_linear
553 psib%zff_pack(ii, ip) = phase*src_%zff_pack(ii, ip)
554 end do
555 end do
556 !$omp end parallel
557
558 else
559
560 !$omp parallel private(ii, phase)
561 !$omp do
562 do ip = 1, min(mesh%np, np)
563 phase = this%phase(ip, psib%ik)
564 !$omp simd
565 do ii = 1, psib%nst_linear
566 psib%zff_pack(ii, ip) = phase*src_%zff_pack(ii, ip)
567 end do
568 end do
569 !$omp end do nowait
570
571 ! Boundary points, if requested
572 !$omp do
573 do ip = sp+1, np
574 phase = this%phase(ip, psib%ik)
575 !$omp simd
576 do ii = 1, psib%nst_linear
577 psib%zff_pack(ii, ip) = phase*src_%zff_pack(ii, ip)
578 end do
579 end do
580 !$omp end parallel
581
582 end if
583
584 case (batch_not_packed)
585
586 if (conjugate) then
587
588 !$omp parallel private(ii, ip)
589 do ii = 1, psib%nst_linear
590 !$omp do simd
591 do ip = 1, min(mesh%np, np)
592 psib%zff_linear(ip, ii) = conjg(this%phase(ip, psib%ik))*src_%zff_linear(ip, ii)
593 end do
594 !$omp end do simd nowait
595
596 ! Boundary points, if requested
597 !$omp do simd
598 do ip = sp+1, np
599 psib%zff_linear(ip, ii) = conjg(this%phase(ip, psib%ik))*src_%zff_linear(ip, ii)
600 end do
601 !$omp end do simd nowait
602 end do
603 !$omp end parallel
604
605 else
606 !$omp parallel private(ii, ip)
607 do ii = 1, psib%nst_linear
608 !$omp do simd
609 do ip = 1, min(mesh%np, np)
610 psib%zff_linear(ip, ii) = this%phase(ip, psib%ik)*src_%zff_linear(ip, ii)
611 end do
612 !$omp end do simd nowait
613
614 ! Boundary points, if requested
615 !$omp do simd
616 do ip = sp+1, np
617 psib%zff_linear(ip, ii) = this%phase(ip, psib%ik)*src_%zff_linear(ip, ii)
618 end do
619 !$omp end do simd nowait
620
621 end do
622 !$omp end parallel
623
624 end if
625
627 call accel_kernel_start_call(ker_phase, 'phase.cu', 'phase_hamiltonian')
628
629 if (conjugate) then
630 call accel_set_kernel_arg(ker_phase, 0, 1_4)
631 else
632 call accel_set_kernel_arg(ker_phase, 0, 0_4)
633 end if
634
635 call accel_set_kernel_arg(ker_phase, 1, (psib%ik - this%buff_phase_qn_start)*mesh%np_part)
636 call accel_set_kernel_arg(ker_phase, 2, np)
637 call accel_set_kernel_arg(ker_phase, 3, this%buff_phase)
638 call accel_set_kernel_arg(ker_phase, 4, src_%ff_device)
639 call accel_set_kernel_arg(ker_phase, 5, log2(int(src_%pack_size(1), int32)))
640 call accel_set_kernel_arg(ker_phase, 6, psib%ff_device)
641 call accel_set_kernel_arg(ker_phase, 7, log2(int(psib%pack_size(1), int32)))
642
643 ! Compute the grid (extend to another dimensions if the size of the problem is too big)
644 call accel_grid_size_extend_dim(int(np, int64), psib%pack_size(1), gsizes, bsizes, ker_phase)
645
646 call accel_kernel_run(ker_phase, gsizes, bsizes)
647
648 if(.not. optional_default(async, .false.)) call accel_finish()
649 end select
650
651 psib%has_phase = .not. conjugate
652
653 call profiling_out("PHASE_APPLY_BATCH")
654 pop_sub(phase_apply_batch)
655 end subroutine phase_apply_batch
656
662 !
663 subroutine phase_apply_mf(this, psi, np, dim, ik, conjugate)
664 class(phase_t), intent(in) :: this
665 complex(real64), intent(inout) :: psi(:, :)
666 integer, intent(in) :: np
667 integer, intent(in) :: dim
668 integer, intent(in) :: ik
669 logical, intent(in) :: conjugate
670
671 integer :: idim, ip
672
673 push_sub(phase_apply_mf)
674
675 assert(ik >= lbound(this%phase, dim=2))
676 assert(ik <= ubound(this%phase, dim=2))
677
678 call profiling_in("PHASE_APPLY_SINGLE")
679
680 if (conjugate) then
681 ! Apply the phase that contains both the k-point and vector-potential terms.
682 do idim = 1, dim
683 !$omp parallel do
684 do ip = 1, np
685 psi(ip, idim) = conjg(this%phase(ip, ik))*psi(ip, idim)
686 end do
687 !$omp end parallel do
688 end do
689 else
690 ! Apply the conjugate of (i.e. remove) the phase that contains both the k-point and vector-potential terms.
691 do idim = 1, dim
692 !$omp parallel do
693 do ip = 1, np
694 psi(ip, idim) = this%phase(ip, ik)*psi(ip, idim)
695 end do
696 !$omp end parallel do
697 end do
698 end if
699
700 call profiling_out("PHASE_APPLY_SINGLE")
701
702 pop_sub(phase_apply_mf)
703 end subroutine phase_apply_mf
704
705
706 ! ---------------------------------------------------------------------------------------
708 !
  ! ---------------------------------------------------------------------------------------
  !> Apply the spin-spiral boundary phase to a batch of spinor wave functions.
  !!
  !! For each pair of spinor components, the component selected by the sign of
  !! the stored spin expectation value this%spin(3, ist, ik) is multiplied by
  !! the corresponding channel of phase_spiral on the boundary points.
  subroutine phase_phase_spiral(this, der, psib)
    class(phase_t), intent(in) :: this
    type(derivatives_t), intent(in) :: der
    class(wfs_elec_t), intent(inout) :: psib

    integer :: ip, ii, sp
    integer, allocatable :: spin_label(:)
    type(accel_mem_t) :: spin_label_buffer
    integer(int64) :: bsize
    integer(int64), dimension(2) :: np, gsizes, bsizes

    push_sub(phase_phase_spiral)
    call profiling_in("PBC_PHASE_SPIRAL")

    call profiling_count_operations(6*(der%mesh%np_part-der%mesh%np)*psib%nst_linear)

    assert(der%boundaries%spiral)
    assert(psib%type() == type_cmplx)

    ! first boundary point follows sp; skip ghost points when domain-parallel
    sp = der%mesh%np
    if (der%mesh%parallel_in_domains) sp = der%mesh%np + der%mesh%pv%np_ghost


    select case (psib%status())
    case (batch_packed)

      !$omp parallel do private(ip, ii)
      do ip = sp + 1, der%mesh%np_part
        do ii = 1, psib%nst_linear, 2
          ! spin-up states get channel 1 applied to the second spinor
          ! component, spin-down states channel 2 on the first component
          if (this%spin(3,psib%linear_to_ist(ii), psib%ik)>0) then
            psib%zff_pack(ii+1, ip) = psib%zff_pack(ii+1, ip)*this%phase_spiral(ip-sp, 1)
          else
            psib%zff_pack(ii, ip) = psib%zff_pack(ii, ip)*this%phase_spiral(ip-sp, 2)
          end if
        end do
      end do
      !$omp end parallel do

    case (batch_not_packed)

      !$omp parallel private(ii, ip)
      do ii = 1, psib%nst_linear, 2
        if (this%spin(3,psib%linear_to_ist(ii), psib%ik)>0) then
          !$omp do
          do ip = sp + 1, der%mesh%np_part
            psib%zff_linear(ip, ii+1) = psib%zff_linear(ip, ii+1)*this%phase_spiral(ip-sp, 1)
          end do
          !$omp end do nowait
        else
          !$omp do
          do ip = sp + 1, der%mesh%np_part
            psib%zff_linear(ip, ii) = psib%zff_linear(ip, ii)*this%phase_spiral(ip-sp, 2)
          end do
          !$omp end do nowait
        end if
      end do
      !$omp end parallel

      ! NOTE(review): a "case (batch_device_packed)" header appears to be
      ! missing before the GPU path below (compare phase_apply_batch) -- as
      ! written, the kernel launch runs at the end of the batch_not_packed
      ! branch. Confirm against the repository version.

      assert(accel_is_enabled())

      ! generate array of offsets for access of psib and phase_spiral:
      ! TODO: Move this to the routine where spin(:,:,:) is generated
      !       and also move the buffer to the GPU at this point to
      !       avoid unnecessary latency here!

      safe_allocate(spin_label(1:psib%nst_linear))
      spin_label = 0
      do ii = 1, psib%nst_linear, 2
        if (this%spin(3, psib%linear_to_ist(ii), psib%ik) > 0) spin_label(ii)=1
      end do

      call accel_create_buffer(spin_label_buffer, accel_mem_read_only, type_integer, psib%nst_linear)
      call accel_write_buffer(spin_label_buffer, psib%nst_linear, spin_label)

      call accel_kernel_start_call(kernel_phase_spiral, 'phase_spiral.cu', 'phase_spiral_apply')

      ! NOTE(review): kernel arguments 0 and 1 are not visible in this view of
      ! the file (fold-collapsed lines); presumably they set the point range --
      ! verify against the repository before editing this call sequence.
      call accel_set_kernel_arg(kernel_phase_spiral, 2, der%mesh%np_part)
      call accel_set_kernel_arg(kernel_phase_spiral, 3, psib%ff_device)
      call accel_set_kernel_arg(kernel_phase_spiral, 4, log2(psib%pack_size(1)))
      call accel_set_kernel_arg(kernel_phase_spiral, 5, this%buff_phase_spiral)
      call accel_set_kernel_arg(kernel_phase_spiral, 6, spin_label_buffer)

      ! Compute the grid size
      bsize = accel_kernel_block_size(kernel_phase_spiral)/psib%pack_size(1)
      np = (/psib%pack_size(1)/2_int64, int(der%mesh%np_part - sp, int64)/)
      bsizes = (/psib%pack_size(1)/2, 2*bsize/)
      call accel_grid_size(np, bsizes, gsizes)

      ! NOTE(review): argument order (bsizes, gsizes) is reversed relative to
      ! every other accel_kernel_run call in this file, which passes
      ! (global size, local size) -- confirm whether this is intentional.
      call accel_kernel_run(kernel_phase_spiral, bsizes, gsizes)

      call accel_finish()
      call accel_free_buffer(spin_label_buffer)

      safe_deallocate_a(spin_label)

    end select

    call profiling_out("PBC_PHASE_SPIRAL")
    pop_sub(phase_phase_spiral)
  end subroutine phase_phase_spiral
814
815
816 ! ---------------------------------------------------------------------------------------
  !> Returns .true. once the k-point phase array has been built (i.e. after a
  !! successful phase_init_phases or phase_update_phases on a non-Gamma-only run).
  logical pure function phase_is_allocated(this)
    class(phase_t), intent(in) :: this

    phase_is_allocated = allocated(this%phase)
  end function phase_is_allocated
822
823 !----------------------------------------------------------
830 subroutine phase_copy_and_set_phase(phase, gr, kpt, psib, psib_with_phase)
831 class(phase_t), intent(in) :: phase
832 type(grid_t), intent(in) :: gr
833 type(distributed_t), intent(in) :: kpt
834 type(wfs_elec_t), intent(in) :: psib
835 type(wfs_elec_t), intent(out) :: psib_with_phase
836
837 integer :: k_offset, n_boundary_points
838
840
841 call psib%copy_to(psib_with_phase)
842 if (phase%is_allocated()) then
843 call phase%apply_to(gr, gr%np, conjugate = .false., psib = psib_with_phase, src = psib, async=.true.)
844 ! apply phase correction while setting boundary -> memory needs to be
845 ! accessed only once
846 k_offset = psib%ik - kpt%start
847 n_boundary_points = int(gr%np_part - gr%np)
848 call boundaries_set(gr%der%boundaries, gr, psib_with_phase, phase_correction = phase%phase_corr(:, psib%ik), &
849 buff_phase_corr = phase%buff_phase_corr, offset=k_offset*n_boundary_points, async=.true.)
850 else
851 call psib%copy_data_to(gr%np, psib_with_phase)
852 call boundaries_set(gr%der%boundaries, gr, psib_with_phase)
853 end if
854
855 call psib_with_phase%do_pack(copy = .true.)
856
858 end subroutine phase_copy_and_set_phase
859
860
861end module phase_oct_m
862
863!! Local Variables:
864!! mode: f90
865!! coding: utf-8
866!! End:
double exp(double __x) __attribute__((__nothrow__
double sin(double __x) __attribute__((__nothrow__
double cos(double __x) __attribute__((__nothrow__
integer function, public accel_kernel_block_size(kernel)
Definition: accel.F90:1188
subroutine, public accel_free_buffer(this, async)
Definition: accel.F90:1005
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1413
subroutine, public accel_finish()
Definition: accel.F90:1098
subroutine, public accel_detach_buffer(this)
Clear a buffer handle without freeing device memory.
Definition: accel.F90:1049
integer, parameter, public accel_mem_read_write
Definition: accel.F90:185
type(accel_kernel_t), target, save, public kernel_phase_spiral
Definition: accel.F90:275
pure logical function, public accel_is_enabled()
Definition: accel.F90:402
integer, parameter, public accel_mem_read_only
Definition: accel.F90:185
This module implements batches of mesh functions.
Definition: batch.F90:135
integer, parameter, public batch_not_packed
functions are stored in CPU memory, unpacked order
Definition: batch.F90:286
integer, parameter, public batch_device_packed
functions are stored in device memory in packed order
Definition: batch.F90:286
integer, parameter, public batch_packed
functions are stored in CPU memory, in transposed (packed) order
Definition: batch.F90:286
This module implements common operations on batches of mesh functions.
Definition: batch_ops.F90:118
Module implementing boundary conditions in Octopus.
Definition: boundaries.F90:124
This module calculates the derivatives (gradients, Laplacians, etc.) of a function.
real(real64), parameter, public m_zero
Definition: global.F90:191
complex(real64), parameter, public m_zi
Definition: global.F90:205
real(real64), parameter, public m_one
Definition: global.F90:192
This module implements the underlying real-space grid.
Definition: grid.F90:119
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
This module defines the meshes, which are used in Octopus.
Definition: mesh.F90:120
integer(int64) function, public mesh_periodic_point(mesh, space, ip)
This function returns the point inside the grid corresponding to a boundary point when PBCs are used....
Definition: mesh.F90:724
real(real64) function, dimension(1:mesh%box%dim), public mesh_x_global(mesh, ipg)
Given a global point index, this function returns the coordinates of the point.
Definition: mesh.F90:817
subroutine phase_phase_spiral(this, der, psib)
apply spiral phase
Definition: phase.F90:805
subroutine phase_unset_phase_corr(phase, mesh, psib, async)
unset the phase correction (if necessary)
Definition: phase.F90:566
subroutine, public phase_accel_rebuild(phase, mesh, kpt)
Rebuild phase accelerator buffers after an intrinsic copy.
Definition: phase.F90:492
subroutine phase_copy_and_set_phase(phase, gr, kpt, psib, psib_with_phase)
Copy a batch to another batch and apply the Bloch phase to it.
Definition: phase.F90:926
subroutine phase_init_phases(phase, gr, kpt, kpoints, d, space)
Initiliaze the phase arrays and copy to GPU the data.
Definition: phase.F90:224
subroutine phase_end(phase)
Releases the memory of the phase object.
Definition: phase.F90:469
subroutine phase_update_phases(phase, mesh, kpt, kpoints, d, space, uniform_vector_potential)
Update the phases.
Definition: phase.F90:335
logical pure function phase_is_allocated(this)
Definition: phase.F90:913
subroutine phase_apply_batch(this, mesh, np, conjugate, psib, src, async)
apply (remove) the phase to the wave functions before (after) applying the Hamiltonian
Definition: phase.F90:591
subroutine phase_set_phase_corr(phase, mesh, psib, async)
set the phase correction (if necessary)
Definition: phase.F90:540
subroutine phase_apply_mf(this, psi, np, dim, ik, conjugate)
apply (or remove) the phase to a wave function psi
Definition: phase.F90:759
subroutine, public profiling_out(label)
Increment out counter and sum up difference between entry and exit time.
Definition: profiling.F90:631
subroutine, public profiling_in(label, exclude)
Increment in counter and save entry time.
Definition: profiling.F90:554
This module handles spin dimensions of the states and the k-point distribution.
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_cmplx
Definition: types.F90:136
type(type_t), public type_integer
Definition: types.F90:137
class representing derivatives
Distribution of N instances over mpi_grpsize processes, for the local rank mpi_grprank....
Description of the grid, containing information on derivatives, stencil, and symmetries.
Definition: grid.F90:171
Describes mesh distribution to nodes.
Definition: mesh.F90:187
A container for the phase.
Definition: phase.F90:181
class for organizing spins and k-points
batches of electronic states
Definition: wfs_elec.F90:141
int true(void)