main/doxygen_doc/batch__ops_8F90_source.html

!! Copyright (C) 2008 X. Andrade

!!

!! This program is free software; you can redistribute it and/or modify

!! it under the terms of the GNU General Public License as published by

!! the Free Software Foundation; either version 2, or (at your option)

!! any later version.

!!

!! This program is distributed in the hope that it will be useful,

!! but WITHOUT ANY WARRANTY; without even the implied warranty of

!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

!! GNU General Public License for more details.

!!

!! You should have received a copy of the GNU General Public License

!! along with this program; if not, write to the Free Software

!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA

!! 02110-1301, USA.

!!


#include "global.h"


module batch_ops_oct_m

  use accel_oct_m

  use batch_oct_m

  use blas_oct_m

  use debug_oct_m

  use iso_c_binding

  use global_oct_m

  use lalg_basic_oct_m

  use math_oct_m

  use messages_oct_m

  use profiling_oct_m

  use types_oct_m


  implicit none


  private

  public ::                         &

    batch_set_zero,                 &

    batch_axpy,                     &

    batch_axpby,                    &

    batch_scal,                     &

    batch_scal2v,                   &

    batch_xpay,                     &

    batch_set_state,                &

    batch_get_state,                &

    batch_get_points,               &

    batch_set_points,               &

    batch_points_block_size,        &

    batch_pointwise_mul,            &

    batch_mul_mf,                   &

    batch_add_with_map,             &

    batch_copy_with_map,            &

    dbatch_axpy_function,           &

    zbatch_axpy_function,           &

    dbatch_ax_function_py,          &

    zbatch_ax_function_py,          &

    dbatch_copy_with_map_to_array,  &

    zbatch_copy_with_map_to_array,  &

    batch_split_complex


  !> Batchified version of the BLAS axpy routine.
  !! Resolves to real (d) or complex (z) specifics, each with a
  !! constant-coefficient and a vector-coefficient variant.
  interface batch_axpy

    module procedure dbatch_axpy_const

    module procedure zbatch_axpy_const

    module procedure dbatch_axpy_vec

    module procedure zbatch_axpy_vec

  end interface batch_axpy


  !> Scale a batch by a constant or by a vector (real and complex specifics).
  interface batch_scal

    module procedure dbatch_scal_const

    module procedure zbatch_scal_const

    module procedure dbatch_scal_vec

    module procedure zbatch_scal_vec

  end interface batch_scal


  !> Batchified axpby (real and complex specifics).
  interface batch_axpby

    module procedure dbatch_axpby

    module procedure zbatch_axpby

  end interface batch_axpby


  !> Batchified scale with optional conjugation (real and complex specifics).
  interface batch_scal2v

    module procedure dbatch_scal2v

    module procedure zbatch_scal2v

  end interface batch_scal2v


  !> Batchified xpay, with vector-coefficient and constant-coefficient
  !! variants for real and complex data.
  interface batch_xpay

    module procedure dbatch_xpay_vec

    module procedure zbatch_xpay_vec

    module procedure dbatch_xpay_const

    module procedure zbatch_xpay_const

  end interface batch_xpay


  !> Add with a map. The cpu specific takes the map as an integer array,
  !! the accel specific takes it as an accel_mem_t device buffer.
  interface batch_add_with_map

    module procedure batch_add_with_map_cpu

    module procedure batch_add_with_map_accel

  end interface batch_add_with_map


  !> Copy with a map. The cpu specific takes the map as an integer array,
  !! the accel specific takes it as an accel_mem_t device buffer.
  interface batch_copy_with_map

    module procedure batch_copy_with_map_cpu

    module procedure batch_copy_with_map_accel

  end interface batch_copy_with_map


  !> Set states of a batch. Three calling variants (suffix 1, 2, 3) exist,
  !! each for real and complex data.
  interface batch_set_state

    module procedure dbatch_set_state1

    module procedure zbatch_set_state1

    module procedure dbatch_set_state2

    module procedure zbatch_set_state2

    module procedure dbatch_set_state3

    module procedure zbatch_set_state3

  end interface batch_set_state


  !> Get states from a batch. Three calling variants (suffix 1, 2, 3) exist,
  !! each for real and complex data.
  interface batch_get_state

    module procedure dbatch_get_state1

    module procedure zbatch_get_state1

    module procedure dbatch_get_state2

    module procedure zbatch_get_state2

    module procedure dbatch_get_state3

    module procedure zbatch_get_state3

  end interface batch_get_state


  !> Get a range of points from a batch: real, complex, and a variant
  !! whose destination is an accel_mem_t device buffer.
  interface batch_get_points

    module procedure dbatch_get_points

    module procedure zbatch_get_points

    module procedure batch_get_points_accel

  end interface batch_get_points


  !> Set a range of points of a batch: real, complex, and a variant
  !! whose source is an accel_mem_t device buffer.
  interface batch_set_points

    module procedure dbatch_set_points

    module procedure zbatch_set_points

    module procedure batch_set_points_accel

  end interface batch_set_points


  !> Pointwise product of two batches; public name for batch_mul_cj,
  !! which optionally conjugates the second factor.
  interface batch_pointwise_mul

    module procedure batch_mul_cj

  end interface batch_pointwise_mul


  !> Batchified multiplication by a mesh function with optional conjugation
  !! (real and complex specifics).
  interface batch_mul_mf

    module procedure dbatch_mul_mf

    module procedure zbatch_mul_mf

  end interface batch_mul_mf


contains


  !--------------------------------------------------------------

  !> Set the contents of a batch to zero, for any of the three storage states.
  !!
  !! this  : batch to be cleared.
  !! np    : optional number of points to clear. When absent, the full
  !!         extent is used: the second pack dimension for packed and
  !!         device-packed batches, the upper bound of the first dimension
  !!         of the linear array for unpacked batches.
  !! async : optional flag, forwarded only to accel_set_buffer_to_zero in
  !!         the device-packed case.
  !!
  !! An unknown batch status triggers a fatal message.
  subroutine batch_set_zero(this, np, async)
    class(batch_t),     intent(inout) :: this
    integer, optional,  intent(in)    :: np
    logical, optional,  intent(in)    :: async

    integer :: ilin, jst, ipt, np_
    integer(int32) :: nst_pack

    push_sub(batch_set_zero)

    assert(not_in_openmp())

    call profiling_in("BATCH_SET_ZERO")

    select case (this%status())
    case (batch_not_packed)
      ! Unpacked storage: clear each linear state in turn, threading over points.
      if (.not. (this%type() == type_float)) then
        np_ = optional_default(np, ubound(this%zff_linear, dim=1))
        assert(np_ <= ubound(this%zff_linear, dim=1))
        do ilin = 1, this%nst_linear
          !$omp parallel do schedule(static)
          do ipt = 1, np_
            this%zff_linear(ipt, ilin) = m_z0
          end do
        end do
      else
        np_ = optional_default(np, ubound(this%dff_linear, dim=1))
        assert(np_ <= ubound(this%dff_linear, dim=1))
        do ilin = 1, this%nst_linear
          !$omp parallel do schedule(static)
          do ipt = 1, np_
            this%dff_linear(ipt, ilin) = m_zero
          end do
        end do
      end if

    case (batch_packed)
      ! Packed storage on the host: the first index runs over the pack
      ! entries, the second over points.
      np_ = optional_default(np, int(this%pack_size(2), int32))
      assert(np_ <= int(this%pack_size(2), int32))
      nst_pack = int(this%pack_size(1), int32)

      if (.not. (this%type() == type_float)) then
        !$omp parallel do private(jst) schedule(static)
        do ipt = 1, np_
          !$omp simd
          do jst = 1, nst_pack
            this%zff_pack(jst, ipt) = m_z0
          end do
        end do
      else
        !$omp parallel do private(jst) schedule(static)
        do ipt = 1, np_
          !$omp simd
          do jst = 1, nst_pack
            this%dff_pack(jst, ipt) = m_zero
          end do
        end do
      end if

    case (batch_device_packed)
      ! Device storage: clear the first pack_size(1)*np_ elements of the buffer.
      np_ = optional_default(np, int(this%pack_size(2), int32))
      assert(np_ <= int(this%pack_size(2), int32))
      call accel_set_buffer_to_zero(this%ff_device, this%type(), (int(this%pack_size(1), int32) * np_), async=async)

    case default
      message(1) = "batch_set_zero: unknown batch status."
      call messages_fatal(1)

    end select

    call profiling_out("BATCH_SET_ZERO")

    pop_sub(batch_set_zero)
  end subroutine batch_set_zero


  ! --------------------------------------------------------------

  !

  !> Extract the points sp..ep of a device-packed batch into the device
  !! buffer psi by running the get_points kernel from points.cu.
  !!
  !! this   : source batch; only the device-packed status is implemented.
  !! sp, ep : first and last point, passed unchanged to the kernel.
  !! psi    : destination device buffer.
  !! ldpsi1 : first leading dimension of psi; it is multiplied by the
  !!          number of real values per element before being passed on.
  !! ldpsi2 : second leading dimension of psi, passed unchanged.
  subroutine batch_get_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
    class(batch_t),      intent(in)    :: this
    integer,             intent(in)    :: sp
    integer,             intent(in)    :: ep
    type(accel_mem_t),   intent(inout) :: psi
    integer,             intent(in)    :: ldpsi1
    integer,             intent(in)    :: ldpsi2

    integer :: nreal, ilin, icomp, pos
    type(accel_kernel_t), save :: kernel
    integer, allocatable :: ist_map(:), idim_map(:)
    type(accel_mem_t) :: ist_buffer, idim_buffer

    push_sub(batch_get_points_accel)
    call profiling_in("GET_POINTS")

    select case (this%status())
    case (batch_device_packed)

      ! Size of one batch element measured in units of the size of type_float.
      nreal = types_get_size(this%type())/types_get_size(type_float)

      safe_allocate(ist_map(1:this%nst_linear*nreal))
      safe_allocate(idim_map(1:this%nst_linear*nreal))

      ! Build zero-based lookup tables with one entry per real component
      ! of every linear index.
      pos = 0
      do ilin = 1, this%nst_linear
        do icomp = 1, nreal
          pos = pos + 1
          ist_map(pos) = nreal*(this%linear_to_ist(ilin) - 1) + icomp - 1
          idim_map(pos) = this%linear_to_idim(ilin) - 1
        end do
      end do

      ! Upload both tables to temporary read-only device buffers.
      call accel_create_buffer(ist_buffer, accel_mem_read_only, type_integer, this%nst_linear*nreal)
      call accel_write_buffer(ist_buffer, this%nst_linear*nreal, ist_map)
      call accel_create_buffer(idim_buffer, accel_mem_read_only, type_integer, this%nst_linear*nreal)
      call accel_write_buffer(idim_buffer, this%nst_linear*nreal, idim_map)

      call accel_kernel_start_call(kernel, 'points.cu', 'get_points')

      call accel_set_kernel_arg(kernel, 0, sp)
      call accel_set_kernel_arg(kernel, 1, ep)
      call accel_set_kernel_arg(kernel, 2, ist_buffer)
      call accel_set_kernel_arg(kernel, 3, idim_buffer)
      call accel_set_kernel_arg(kernel, 4, this%nst_linear*nreal)
      call accel_set_kernel_arg(kernel, 5, this%ff_device)
      call accel_set_kernel_arg(kernel, 6, int(this%pack_size_real(1), int32))
      call accel_set_kernel_arg(kernel, 7, psi)
      call accel_set_kernel_arg(kernel, 8, ldpsi1*nreal)
      call accel_set_kernel_arg(kernel, 9, ldpsi2)

      call accel_kernel_run(kernel, [1_int64, int(ep - sp + 1, int64)], [this%pack_size_real(1), 1_int64])

      ! Release the temporary device buffers and the host tables.
      call accel_free_buffer(ist_buffer)
      call accel_free_buffer(idim_buffer)
      safe_deallocate_a(ist_map)
      safe_deallocate_a(idim_map)

    case (batch_not_packed, batch_packed)
      call messages_not_implemented('batch_get_points_accel for non-CL batches')

    end select

    call profiling_out("GET_POINTS")

    pop_sub(batch_get_points_accel)
  end subroutine batch_get_points_accel


  ! --------------------------------------------------------------

  !

  !> Write the points sp..ep of a device-packed batch from the device
  !! buffer psi by running the set_points kernel from points.cu.
  !!
  !! this   : destination batch; only the device-packed status is implemented.
  !! sp, ep : first and last point, passed unchanged to the kernel.
  !! psi    : source device buffer.
  !! ldpsi1 : first leading dimension of psi; it is multiplied by the
  !!          number of real values per element before being passed on.
  !! ldpsi2 : second leading dimension of psi, passed unchanged.
  subroutine batch_set_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
    class(batch_t),      intent(inout) :: this
    integer,             intent(in)    :: sp
    integer,             intent(in)    :: ep
    type(accel_mem_t),   intent(in)    :: psi
    integer,             intent(in)    :: ldpsi1
    integer,             intent(in)    :: ldpsi2

    integer :: nreal, ilin, icomp, pos
    type(accel_kernel_t), save :: kernel
    integer, allocatable :: ist_map(:), idim_map(:)
    type(accel_mem_t) :: ist_buffer, idim_buffer

    push_sub(batch_set_points_accel)
    call profiling_in("SET_POINTS")

    select case (this%status())
    case (batch_device_packed)

      ! Size of one batch element measured in units of the size of type_float.
      nreal = types_get_size(this%type())/types_get_size(type_float)

      safe_allocate(ist_map(1:this%nst_linear*nreal))
      safe_allocate(idim_map(1:this%nst_linear*nreal))

      ! Build zero-based lookup tables with one entry per real component
      ! of every linear index.
      pos = 0
      do ilin = 1, this%nst_linear
        do icomp = 1, nreal
          pos = pos + 1
          ist_map(pos) = nreal*(this%linear_to_ist(ilin) - 1) + icomp - 1
          idim_map(pos) = this%linear_to_idim(ilin) - 1
        end do
      end do

      ! Upload both tables to temporary read-only device buffers.
      call accel_create_buffer(ist_buffer, accel_mem_read_only, type_integer, this%nst_linear*nreal)
      call accel_write_buffer(ist_buffer, this%nst_linear*nreal, ist_map)
      call accel_create_buffer(idim_buffer, accel_mem_read_only, type_integer, this%nst_linear*nreal)
      call accel_write_buffer(idim_buffer, this%nst_linear*nreal, idim_map)

      call accel_kernel_start_call(kernel, 'points.cu', 'set_points')

      ! Note the argument order: psi and its leading dimensions come
      ! before the batch buffer here.
      call accel_set_kernel_arg(kernel, 0, sp)
      call accel_set_kernel_arg(kernel, 1, ep)
      call accel_set_kernel_arg(kernel, 2, ist_buffer)
      call accel_set_kernel_arg(kernel, 3, idim_buffer)
      call accel_set_kernel_arg(kernel, 4, this%nst_linear*nreal)
      call accel_set_kernel_arg(kernel, 5, psi)
      call accel_set_kernel_arg(kernel, 6, ldpsi1*nreal)
      call accel_set_kernel_arg(kernel, 7, ldpsi2)
      call accel_set_kernel_arg(kernel, 8, this%ff_device)
      call accel_set_kernel_arg(kernel, 9, int(this%pack_size_real(1), int32))

      call accel_kernel_run(kernel, [1_int64, int(ep - sp + 1, int64)], [this%pack_size_real(1), 1_int64])

      ! Release the temporary device buffers and the host tables.
      call accel_free_buffer(ist_buffer)
      call accel_free_buffer(idim_buffer)
      safe_deallocate_a(ist_map)
      safe_deallocate_a(idim_map)

    case (batch_not_packed, batch_packed)
      call messages_not_implemented('batch_set_points_accel for non-CL batches')

    end select

    call profiling_out("SET_POINTS")

    pop_sub(batch_set_points_accel)
  end subroutine batch_set_points_accel


  ! -------------------------

  !

  !> Return the fixed block size reported by this module for point-wise
  !! batch operations. The value is a compile-time constant.
  pure integer function batch_points_block_size() result(block_size)
    integer, parameter :: points_per_block = 61440

    block_size = points_per_block
  end function batch_points_block_size


! -------------------------

  !> Pointwise product of two batches over the first np points:
  !! zz = xx * conjg(yy), or zz = xx * yy when conjugation is switched off.
  !!
  !! np           : number of points to process.
  !! xx, yy       : input batches; compatibility of xx with yy and zz is
  !!                checked on entry.
  !! zz           : output batch.
  !! conjugate_yy : optional; defaults to true. It only has an effect for
  !!                complex batches, real batches are simply multiplied.
  !!
  !! The storage state is taken from xx. A status other than the three
  !! handled below falls through the select without doing any work.
  subroutine batch_mul_cj(np, xx, yy, zz, conjugate_yy)
    integer,           intent(in)    :: np
    class(batch_t),    intent(in)    :: xx
    class(batch_t),    intent(in)    :: yy
    class(batch_t),    intent(inout) :: zz
    logical, optional, intent(in)    :: conjugate_yy

    integer :: ist, ipt
    logical :: use_conjg
    integer(int64), dimension(3) :: grid_dims, block_dims
    type(accel_kernel_t), save, target :: kernel_zmul_conj
    type(accel_kernel_t), save, target :: kernel_zmul
    type(accel_kernel_t), save, target :: kernel_dmul
    type(accel_kernel_t), pointer :: kernel
    push_sub(batch_mul_cj)

    call xx%check_compatibility_with(yy)
    call xx%check_compatibility_with(zz)

    use_conjg = optional_default(conjugate_yy, .true.)

    select case (xx%status())
    case (batch_not_packed)
      ! Unpacked host storage: one parallel region, work shared over points
      ! for each linear state.
      if (xx%type() == type_cmplx) then
        if (use_conjg) then
          !$omp parallel private(ist, ipt)
          do ist = 1, xx%nst_linear
            !$omp do schedule(static)
            do ipt = 1, np
              zz%zff_linear(ipt, ist) = xx%zff_linear(ipt, ist)*conjg(yy%zff_linear(ipt, ist))
            end do
            !$omp end do
          end do
          !$omp end parallel
        else
          !$omp parallel private(ist, ipt)
          do ist = 1, xx%nst_linear
            !$omp do schedule(static)
            do ipt = 1, np
              zz%zff_linear(ipt, ist) = xx%zff_linear(ipt, ist)*yy%zff_linear(ipt, ist)
            end do
            !$omp end do
          end do
          !$omp end parallel
        end if
      else
        !$omp parallel private(ist, ipt)
        do ist = 1, xx%nst_linear
          !$omp do schedule(static)
          do ipt = 1, np
            zz%dff_linear(ipt, ist) = xx%dff_linear(ipt, ist)*yy%dff_linear(ipt, ist)
          end do
          !$omp end do
        end do
        !$omp end parallel
      end if

    case (batch_packed)
      ! Packed host storage: threads over points, simd over states.
      if (xx%type() == type_cmplx) then
        if (use_conjg) then
          !$omp parallel do private(ist)
          do ipt = 1, np
            !$omp simd
            do ist = 1, xx%nst_linear
              zz%zff_pack(ist, ipt) = xx%zff_pack(ist, ipt)*conjg(yy%zff_pack(ist, ipt))
            end do
          end do
        else
          !$omp parallel do private(ist)
          do ipt = 1, np
            !$omp simd
            do ist = 1, xx%nst_linear
              zz%zff_pack(ist, ipt) = xx%zff_pack(ist, ipt)*yy%zff_pack(ist, ipt)
            end do
          end do
        end if
      else
        !$omp parallel do private(ist)
        do ipt = 1, np
          !$omp simd
          do ist = 1, xx%nst_linear
            zz%dff_pack(ist, ipt) = xx%dff_pack(ist, ipt)*yy%dff_pack(ist, ipt)
          end do
        end do
      end if

    case (batch_device_packed)
      ! Device storage: pick the kernel from batch_mul.cu that matches the
      ! data type and conjugation flag. Each kernel receives the log2 of the
      ! leading pack dimension of every buffer.
      if (xx%type() == type_cmplx) then
        if (use_conjg) then
          kernel => kernel_zmul_conj
          call accel_kernel_start_call(kernel, 'batch_mul.cu', 'zmul_conj')
        else
          kernel => kernel_zmul
          call accel_kernel_start_call(kernel, 'batch_mul.cu', 'zmul')
        end if

        ! Complex kernels are given the sizes from pack_size.
        call accel_set_kernel_arg(kernel, 0, np)
        call accel_set_kernel_arg(kernel, 1, xx%ff_device)
        call accel_set_kernel_arg(kernel, 2, log2(int(xx%pack_size(1), int32)))
        call accel_set_kernel_arg(kernel, 3, yy%ff_device)
        call accel_set_kernel_arg(kernel, 4, log2(int(yy%pack_size(1), int32)))
        call accel_set_kernel_arg(kernel, 5, zz%ff_device)
        call accel_set_kernel_arg(kernel, 6, log2(int(zz%pack_size(1), int32)))

        call accel_grid_size_extend_dim(int(np, int64), xx%pack_size(1), grid_dims, block_dims, kernel)
      else
        kernel => kernel_dmul
        call accel_kernel_start_call(kernel, 'batch_mul.cu', 'dmul')

        ! The real kernel is given the sizes from pack_size_real.
        call accel_set_kernel_arg(kernel, 0, np)
        call accel_set_kernel_arg(kernel, 1, xx%ff_device)
        call accel_set_kernel_arg(kernel, 2, log2(int(xx%pack_size_real(1), int32)))
        call accel_set_kernel_arg(kernel, 3, yy%ff_device)
        call accel_set_kernel_arg(kernel, 4, log2(int(yy%pack_size_real(1), int32)))
        call accel_set_kernel_arg(kernel, 5, zz%ff_device)
        call accel_set_kernel_arg(kernel, 6, log2(int(zz%pack_size_real(1), int32)))

        call accel_grid_size_extend_dim(int(np, int64), xx%pack_size_real(1), grid_dims, block_dims, kernel)
      end if

      call accel_kernel_run(kernel, grid_dims, block_dims)
    end select

    pop_sub(batch_mul_cj)
  end subroutine batch_mul_cj


! -------------------------

  !> Add-with-map entry point for a map held in host memory.
  !!
  !! np     : number of map entries to use.
  !! map    : integer map on the host.
  !! xx, yy : input batches.
  !! zz     : output batch.
  !!
  !! When xx is device-packed, the first np entries of the map are uploaded
  !! to a temporary device buffer and the accel variant is called; the
  !! buffer is freed afterwards. Otherwise the real or complex host
  !! implementation is selected from the type of xx.
  subroutine batch_add_with_map_cpu(np, map, xx, yy, zz)
    integer,           intent(in)    :: np
    integer,           intent(in)    :: map(:)
    class(batch_t),    intent(in)    :: xx
    class(batch_t),    intent(in)    :: yy
    class(batch_t),    intent(inout) :: zz
    type(accel_mem_t) :: map_buffer

    push_sub(batch_add_with_map_cpu)

    if (xx%status() == batch_device_packed) then
      ! The map is always uploaded here; no cached device copy is consulted.
      call accel_create_buffer(map_buffer, accel_mem_read_only, type_integer, np)
      call accel_write_buffer(map_buffer, np, map)
      call batch_add_with_map_accel(np, map_buffer, xx, yy, zz)
      call accel_free_buffer(map_buffer)
    else if (xx%type() == type_float) then
      call dbatch_add_with_map(np, map, xx, yy, zz)
    else
      call zbatch_add_with_map(np, map, xx, yy, zz)
    end if

    pop_sub(batch_add_with_map_cpu)
  end subroutine batch_add_with_map_cpu


! -------------------------

  !> Add-with-map for a map that already lives in a device buffer; runs the
  !! add_with_map kernel from copy.cu.
  !!
  !! np     : number of map entries. Nothing is launched when np is not
  !!          positive, matching the behaviour of batch_copy_with_map_accel.
  !! map    : device buffer holding the map.
  !! xx, yy : input batches; their device buffers are passed to the kernel.
  !! zz     : output batch.
  !!
  !! The kernel receives the log2 of the leading real pack dimension of
  !! each batch. NOTE(review): no status check is done here, the batches
  !! are assumed to be device-packed; confirm against the callers.
  subroutine batch_add_with_map_accel(np, map, xx, yy, zz)
    integer,            intent(in)    :: np
    class(accel_mem_t), intent(in)    :: map
    class(batch_t),     intent(in)    :: xx
    class(batch_t),     intent(in)    :: yy
    class(batch_t),     intent(inout) :: zz

    type(accel_kernel_t), save :: kernel
    integer(int64), dimension(3) :: gsizes, bsizes

    push_sub(batch_add_with_map_accel)

    call accel_kernel_start_call(kernel, 'copy.cu', 'add_with_map')

    ! Execute only if the map has at least one element: an empty map means
    ! there is nothing to add, and it avoids a launch over zero points.
    if (np > 0) then
      call accel_set_kernel_arg(kernel, 0, np)
      call accel_set_kernel_arg(kernel, 1, map)
      call accel_set_kernel_arg(kernel, 2, xx%ff_device)
      call accel_set_kernel_arg(kernel, 3, log2(int(xx%pack_size_real(1), int32)))
      call accel_set_kernel_arg(kernel, 4, yy%ff_device)
      call accel_set_kernel_arg(kernel, 5, log2(int(yy%pack_size_real(1), int32)))
      call accel_set_kernel_arg(kernel, 6, zz%ff_device)
      call accel_set_kernel_arg(kernel, 7, log2(int(zz%pack_size_real(1), int32)))

      ! Compute the grid (extend to other dimensions if the problem is too big)
      call accel_grid_size_extend_dim(int(np, int64), xx%pack_size_real(1), gsizes, bsizes, kernel)

      call accel_kernel_run(kernel, gsizes, bsizes)
    end if

    pop_sub(batch_add_with_map_accel)
  end subroutine batch_add_with_map_accel


! -------------------------

  !> Copy-with-map entry point for a map held in host memory.
  !!
  !! np  : number of map entries to use.
  !! map : integer map on the host.
  !! xx  : source batch.
  !! yy  : destination batch.
  !!
  !! When xx is device-packed, the first np entries of the map are uploaded
  !! to a temporary device buffer and the accel variant is called; the
  !! buffer is freed afterwards. Otherwise the real or complex host
  !! implementation is selected from the type of xx.
  subroutine batch_copy_with_map_cpu(np, map, xx, yy)
    integer,           intent(in)    :: np
    integer,           intent(in)    :: map(:)
    class(batch_t),    intent(in)    :: xx
    class(batch_t),    intent(inout) :: yy
    type(accel_mem_t) :: map_buffer

    push_sub(batch_copy_with_map_cpu)

    if (xx%status() == batch_device_packed) then
      ! The map is always uploaded here; no cached device copy is consulted.
      call accel_create_buffer(map_buffer, accel_mem_read_only, type_integer, np)
      call accel_write_buffer(map_buffer, np, map)
      call batch_copy_with_map_accel(np, map_buffer, xx, yy)
      call accel_free_buffer(map_buffer)
    else if (xx%type() == type_float) then
      call dbatch_copy_with_map(np, map, xx, yy)
    else
      call zbatch_copy_with_map(np, map, xx, yy)
    end if

    pop_sub(batch_copy_with_map_cpu)
  end subroutine batch_copy_with_map_cpu


! -------------------------

  !> Copy-with-map for a map that already lives in a device buffer; runs
  !! the copy_with_map kernel from copy.cu.
  !!
  !! np  : number of map entries; the kernel is launched only when np is
  !!       at least one.
  !! map : device buffer holding the map.
  !! xx  : source batch; its device buffer is passed to the kernel.
  !! yy  : destination batch.
  !!
  !! The kernel receives the log2 of the leading real pack dimension of
  !! each batch.
  subroutine batch_copy_with_map_accel(np, map, xx, yy)
    integer,            intent(in)    :: np
    class(accel_mem_t), intent(in)    :: map
    class(batch_t),     intent(in)    :: xx
    class(batch_t),     intent(inout) :: yy

    type(accel_kernel_t), save :: kernel
    integer(int64), dimension(3) :: grid_dims, block_dims

    push_sub(batch_copy_with_map_accel)

    ! The kernel call is started unconditionally, also for an empty map.
    call accel_kernel_start_call(kernel, 'copy.cu', 'copy_with_map')

    if (np >= 1) then
      call accel_set_kernel_arg(kernel, 0, np)
      call accel_set_kernel_arg(kernel, 1, map)
      call accel_set_kernel_arg(kernel, 2, xx%ff_device)
      call accel_set_kernel_arg(kernel, 3, log2(int(xx%pack_size_real(1), int32)))
      call accel_set_kernel_arg(kernel, 4, yy%ff_device)
      call accel_set_kernel_arg(kernel, 5, log2(int(yy%pack_size_real(1), int32)))

      ! Launch geometry for np points; the helper spreads the points over
      ! further grid dimensions when the problem is too large for one.
      call accel_grid_size_extend_dim(int(np, int64), xx%pack_size_real(1), grid_dims, block_dims, kernel)

      call accel_kernel_run(kernel, grid_dims, block_dims)
    end if

    pop_sub(batch_copy_with_map_accel)
  end subroutine batch_copy_with_map_accel


  ! -------------------------

  !

  !> Split a complex batch into its real and imaginary parts.
  !!
  !! np : number of points to process on the host paths.
  !! xx : complex input batch.
  !! yy : real output batch receiving the real part of xx.
  !! zz : real output batch receiving the imaginary part of xx.
  !!
  !! The types are checked on entry, and all three batches must share the
  !! same storage status. A status other than the three handled below
  !! falls through the select without doing any work.
  subroutine batch_split_complex(np, xx, yy, zz)
    integer,           intent(in)    :: np
    class(batch_t),    intent(in)    :: xx
    class(batch_t),    intent(inout) :: yy
    class(batch_t),    intent(inout) :: zz

    integer :: ilin, ipt
    type(accel_kernel_t), save :: kernel
    integer(int64), dimension(3) :: grid_dims, block_dims

    push_sub(batch_split_complex)

    assert(xx%type() == type_cmplx)
    assert(yy%type() == type_float)
    assert(zz%type() == type_float)
    assert(xx%status() == yy%status())
    assert(xx%status() == zz%status())

    select case (xx%status())
    case (batch_device_packed)
      call accel_kernel_start_call(kernel, 'split.cu', 'split_complex')

      ! NOTE(review): the point count handed to the kernel is the full
      ! second pack dimension of xx, while the launch geometry below is
      ! derived from np. The host branches use np throughout; confirm
      ! that this difference is intended.
      call accel_set_kernel_arg(kernel, 0, int(xx%pack_size(2), int32))
      call accel_set_kernel_arg(kernel, 1, xx%ff_device)
      call accel_set_kernel_arg(kernel, 2, log2(int(xx%pack_size(1), int32)))
      call accel_set_kernel_arg(kernel, 3, yy%ff_device)
      call accel_set_kernel_arg(kernel, 4, log2(int(yy%pack_size(1), int32)))
      call accel_set_kernel_arg(kernel, 5, zz%ff_device)
      call accel_set_kernel_arg(kernel, 6, log2(int(zz%pack_size(1), int32)))

      ! Launch geometry for np points; the helper spreads the points over
      ! further grid dimensions when the problem is too large for one.
      call accel_grid_size_extend_dim(int(np, int64), xx%pack_size(1), grid_dims, block_dims, kernel)

      call accel_kernel_run(kernel, grid_dims, block_dims)

    case (batch_packed)
      ! Packed host storage: threads over points, inner loop over states.
      !$omp parallel do private(ilin) schedule(static)
      do ipt = 1, np
        do ilin = 1, xx%nst_linear
          yy%dff_pack(ilin, ipt) = real(xx%zff_pack(ilin, ipt), real64)
          zz%dff_pack(ilin, ipt) = aimag(xx%zff_pack(ilin, ipt))
        end do
      end do

    case (batch_not_packed)
      ! Unpacked host storage: one threaded loop over points per state.
      do ilin = 1, xx%nst_linear
        !$omp parallel do schedule(static)
        do ipt = 1, np
          yy%dff_linear(ipt, ilin) = real(xx%zff_linear(ipt, ilin), real64)
          zz%dff_linear(ipt, ilin) = aimag(xx%zff_linear(ipt, ilin))
        end do
      end do
    end select

    pop_sub(batch_split_complex)
  end subroutine batch_split_complex


! Instantiation of the shared implementation file batch_ops_inc.F90.
! The file is included once after real.F90 and once after complex.F90,
! with undef.F90 included before each of them.
! NOTE(review): presumably real.F90 and complex.F90 define the type macros
! that produce the d- and z-prefixed procedures named in the interfaces of
! this module, and undef.F90 clears them again; confirm in those files.
!
! First pass: generic version, with the specialization macro undefined.
#undef SPECIALIZED

#include "undef.F90"

#include "real.F90"

#include "batch_ops_inc.F90"


#include "undef.F90"

#include "complex.F90"

#include "batch_ops_inc.F90"


! The remaining passes repeat the real and complex includes with the
! specialization macro set to a fixed number of linear states.
! Specialized kernels for nst_linear=2

#define SPECIALIZED 2

#include "undef.F90"

#include "real.F90"

#include "batch_ops_inc.F90"


#include "undef.F90"

#include "complex.F90"

#include "batch_ops_inc.F90"

#undef SPECIALIZED


! Specialized kernels for nst_linear=3

#define SPECIALIZED 3

#include "undef.F90"

#include "real.F90"

#include "batch_ops_inc.F90"


#include "undef.F90"

#include "complex.F90"

#include "batch_ops_inc.F90"

#undef SPECIALIZED


! Specialized kernels for nst_linear=4

#define SPECIALIZED 4

#include "undef.F90"

#include "real.F90"

#include "batch_ops_inc.F90"


#include "undef.F90"

#include "complex.F90"

#include "batch_ops_inc.F90"

#undef SPECIALIZED


! Specialized kernels for nst_linear=6

#define SPECIALIZED 6

#include "undef.F90"

#include "real.F90"

#include "batch_ops_inc.F90"


#include "undef.F90"

#include "complex.F90"

#include "batch_ops_inc.F90"

#undef SPECIALIZED


! Specialized kernels for nst_linear=8

#define SPECIALIZED 8

#include "undef.F90"

#include "real.F90"

#include "batch_ops_inc.F90"


#include "undef.F90"

#include "complex.F90"

#include "batch_ops_inc.F90"

#undef SPECIALIZED


end module batch_ops_oct_m


!! Local Variables:

!! mode: f90

!! coding: utf-8

!! End:

accel_oct_m::accel_create_buffer
Definition: accel.F90:301

accel_oct_m::accel_kernel_run
Definition: accel.F90:305

accel_oct_m::accel_set_buffer_to_zero
Definition: accel.F90:309

accel_oct_m::accel_set_kernel_arg
Definition: accel.F90:355

accel_oct_m::accel_write_buffer
Definition: accel.F90:313

batch_ops_oct_m::batch_add_with_map
Definition: batch_ops.F90:194

batch_ops_oct_m::batch_axpby
batchified axpby: $y \leftarrow a\,x + b\,y$
Definition: batch_ops.F90:175

batch_ops_oct_m::batch_axpy
batchified version of the BLAS axpy routine: $y \leftarrow a\,x + y$
Definition: batch_ops.F90:159

batch_ops_oct_m::batch_copy_with_map
Definition: batch_ops.F90:199

batch_ops_oct_m::batch_get_points
Definition: batch_ops.F90:236

batch_ops_oct_m::batch_get_state
Definition: batch_ops.F90:227

batch_ops_oct_m::batch_mul_mf
batchified multiplication by mesh function with optional conjugation:
Definition: batch_ops.F90:254

batch_ops_oct_m::batch_pointwise_mul
Definition: batch_ops.F90:248

batch_ops_oct_m::batch_scal2v
batchified scale with optional conjugation:
Definition: batch_ops.F90:181

batch_ops_oct_m::batch_scal
scale a batch by a constant or vector
Definition: batch_ops.F90:167

batch_ops_oct_m::batch_set_points
Definition: batch_ops.F90:242

batch_ops_oct_m::batch_set_state
There are several ways how to call batch_set_state and batch_get_state:
Definition: batch_ops.F90:218

batch_ops_oct_m::batch_xpay
batchified version of xpay: $y \leftarrow x + a\,y$
Definition: batch_ops.F90:187

global_oct_m::optional_default
Definition: global.F90:299

log2
double log2(double __x) __attribute__((__nothrow__

accel_oct_m
Definition: accel.F90:120

accel_oct_m::accel_free_buffer
subroutine, public accel_free_buffer(this, async)
Definition: accel.F90:1006

accel_oct_m::accel_kernel_start_call
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1439

accel_oct_m::accel_mem_read_only
integer, parameter, public accel_mem_read_only
Definition: accel.F90:186

batch_oct_m
This module implements batches of mesh functions.
Definition: batch.F90:135

batch_oct_m::batch_not_packed
integer, parameter, public batch_not_packed
functions are stored in CPU memory, unpacked order
Definition: batch.F90:287

batch_oct_m::batch_device_packed
integer, parameter, public batch_device_packed
functions are stored in device memory in packed order
Definition: batch.F90:287

batch_oct_m::batch_packed
integer, parameter, public batch_packed
functions are stored in CPU memory, in transposed (packed) order
Definition: batch.F90:287

batch_ops_oct_m
This module implements common operations on batches of mesh functions.
Definition: batch_ops.F90:118

batch_ops_oct_m::zbatch_get_state3
subroutine zbatch_get_state3(this, ii, np, psi, async)
Definition: batch_ops.F90:4102

batch_ops_oct_m::dbatch_get_state3
subroutine dbatch_get_state3(this, ii, np, psi, async)
Definition: batch_ops.F90:2339

batch_ops_oct_m::batch_copy_with_map_accel
subroutine batch_copy_with_map_accel(np, map, xx, yy)
Definition: batch_ops.F90:700

batch_ops_oct_m::zbatch_get_state1
subroutine zbatch_get_state1(this, ist, np, psi, async)
Get a single state with np points from a batch.
Definition: batch_ops.F90:3950

batch_ops_oct_m::zbatch_copy_with_map_to_array
subroutine, public zbatch_copy_with_map_to_array(np, map, xx, array)
Transfer a batch from the mesh to an array on the submesh (defined by a map)
Definition: batch_ops.F90:4374

batch_ops_oct_m::dbatch_ax_function_py
subroutine, public dbatch_ax_function_py(np, aa, psi, yy)
This routine performs a set of axpy operations adding the same function psi to all functions of a bat...
Definition: batch_ops.F90:1203

batch_ops_oct_m::dbatch_set_state1
subroutine dbatch_set_state1(this, ist, np, psi)
Write a single state with np points into a batch at position ist.
Definition: batch_ops.F90:2049

batch_ops_oct_m::dbatch_axpy_const
subroutine dbatch_axpy_const(np, aa, xx, yy)
This routine applies a 'pair-wise' axpy operation to all functions of the batches xx and yy,...
Definition: batch_ops.F90:864

batch_ops_oct_m::zbatch_axpby
subroutine zbatch_axpby(np, aa, xx, bb, yy)
calculate yy(ist,:) = aa*xx(ist,:) + bb*yy(ist,:) for a batch
Definition: batch_ops.F90:3493

batch_ops_oct_m::zbatch_get_points
subroutine zbatch_get_points(this, sp, ep, psi)
copy a set of points into a mesh function
Definition: batch_ops.F90:4123

batch_ops_oct_m::dbatch_xpay_vec
subroutine dbatch_xpay_vec(np, xx, aa, yy, a_start, a_full)
calculate yy(ist,:) = xx(ist,:) + aa(ist)*yy(ist,:) for a batch
Definition: batch_ops.F90:1460

batch_ops_oct_m::dbatch_scal_vec
subroutine dbatch_scal_vec(np, aa, xx, a_start, a_full)
Definition: batch_ops.F90:2680

batch_ops_oct_m::dbatch_set_state2
subroutine dbatch_set_state2(this, index, np, psi)
Write a single state with np points into a batch at position defined by index.
Definition: batch_ops.F90:2162

batch_ops_oct_m::dbatch_copy_with_map
subroutine dbatch_copy_with_map(np, map, xx, yy)
Definition: batch_ops.F90:2567

batch_ops_oct_m::dbatch_copy_with_map_to_array
subroutine, public dbatch_copy_with_map_to_array(np, map, xx, array)
Transfer a batch from the mesh to an array on the submesh (defined by a map)
Definition: batch_ops.F90:2610

batch_ops_oct_m::dbatch_scal2v
subroutine dbatch_scal2v(np, aa, xx, yy, conjugate_xx)
calculate yy(ist,:) = aa*CJ(xx(ist,:)) for a batch
Definition: batch_ops.F90:1753

batch_ops_oct_m::dbatch_get_state2
subroutine dbatch_get_state2(this, index, np, psi, async)
Definition: batch_ops.F90:2321

batch_ops_oct_m::dbatch_axpy_vec
subroutine dbatch_axpy_vec(np, aa, xx, yy, a_start, a_full)
A simple switch between specialized kernels and generic kernels.
Definition: batch_ops.F90:2656

batch_ops_oct_m::dbatch_set_state3
subroutine dbatch_set_state3(this, ii, np, psi)
Write a set of states with np points into a batch.
Definition: batch_ops.F90:2180

batch_ops_oct_m::zbatch_axpy_const
subroutine zbatch_axpy_const(np, aa, xx, yy)
This routine applies a 'pair-wise' axpy operation to all functions of the batches xx and yy,...
Definition: batch_ops.F90:2782

batch_ops_oct_m::zbatch_scal_const
subroutine zbatch_scal_const(np, aa, xx)
scale all functions in a batch by constant aa
Definition: batch_ops.F90:3188

batch_ops_oct_m::zbatch_get_state2
subroutine zbatch_get_state2(this, index, np, psi, async)
Definition: batch_ops.F90:4084

batch_ops_oct_m::zbatch_xpay_const
subroutine zbatch_xpay_const(np, xx, aa, yy)
calculate yy(ist) = xx(ist) + aa*yy(ist) for a batch
Definition: batch_ops.F90:3462

batch_ops_oct_m::batch_split_complex
subroutine, public batch_split_complex(np, xx, yy, zz)
extract the real and imaginary parts of a complex batch
Definition: batch_ops.F90:736

batch_ops_oct_m::batch_add_with_map_accel
subroutine batch_add_with_map_accel(np, map, xx, yy, zz)
Definition: batch_ops.F90:641

batch_ops_oct_m::dbatch_set_points
subroutine dbatch_set_points(this, sp, ep, psi)
copy a set of points into a mesh function
Definition: batch_ops.F90:2440

batch_ops_oct_m::dbatch_get_state1
subroutine dbatch_get_state1(this, ist, np, psi, async)
Get a single state with np points from a batch.
Definition: batch_ops.F90:2200

batch_ops_oct_m::zbatch_copy_with_map
subroutine zbatch_copy_with_map(np, map, xx, yy)
Definition: batch_ops.F90:4331

batch_ops_oct_m::dbatch_mul_mf
subroutine dbatch_mul_mf(np, ff, xx, yy, conjugate_xx)
calculate yy(ist,:) = ff(:) * CJ(xx(ist,:)) for a batch
Definition: batch_ops.F90:1897

batch_ops_oct_m::batch_set_zero
subroutine, public batch_set_zero(this, np, async)
fill all mesh functions of the batch with zero
Definition: batch_ops.F90:265

batch_ops_oct_m::zbatch_ax_function_py
subroutine, public zbatch_ax_function_py(np, aa, psi, yy)
This routine performs a set of axpy operations adding the same function psi to all functions of a bat...
Definition: batch_ops.F90:3104

batch_ops_oct_m::zbatch_axpy_vec
subroutine zbatch_axpy_vec(np, aa, xx, yy, a_start, a_full)
A simple switch between specialized kernels and generic kernels.
Definition: batch_ops.F90:4420

batch_ops_oct_m::batch_set_points_accel
subroutine batch_set_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
GPU version of batch_set_points.
Definition: batch_ops.F90:408

batch_ops_oct_m::dbatch_xpay_const
subroutine dbatch_xpay_const(np, xx, aa, yy)
calculate yy(ist) = xx(ist) + aa*yy(ist) for a batch
Definition: batch_ops.F90:1603

batch_ops_oct_m::dbatch_axpby
subroutine dbatch_axpby(np, aa, xx, bb, yy)
calculate yy(ist,:) = aa*xx(ist,:) + bb*yy(ist,:) for a batch
Definition: batch_ops.F90:1634

batch_ops_oct_m::dbatch_get_points
subroutine dbatch_get_points(this, sp, ep, psi)
copy a set of points into a mesh function
Definition: batch_ops.F90:2360

batch_ops_oct_m::zbatch_scal2v
subroutine zbatch_scal2v(np, aa, xx, yy, conjugate_xx)
calculate yy(ist,:) = aa*CJ(xx(ist,:)) for a batch
Definition: batch_ops.F90:3580

batch_ops_oct_m::batch_points_block_size
integer pure function, public batch_points_block_size()
determine the device block size
Definition: batch_ops.F90:477

batch_ops_oct_m::dbatch_add_with_map
subroutine dbatch_add_with_map(np, map, xx, yy, zz)
Definition: batch_ops.F90:2525

batch_ops_oct_m::zbatch_mul_mf
subroutine zbatch_mul_mf(np, ff, xx, yy, conjugate_xx)
calculate yy(ist,:) = ff(:) * CJ(xx(ist,:)) for a batch
Definition: batch_ops.F90:3693

batch_ops_oct_m::zbatch_xpay_vec
subroutine zbatch_xpay_vec(np, xx, aa, yy, a_start, a_full)
calculate yy(ist,:) = xx(ist,:) + aa(ist)*yy(ist,:) for a batch
Definition: batch_ops.F90:3344

batch_ops_oct_m::batch_mul_cj
subroutine batch_mul_cj(np, xx, yy, zz, conjugate_yy)
Point-wise multiply two batches with optional conjugation on yy: zz_i = xx_i * CJ(yy_i)
Definition: batch_ops.F90:486

batch_ops_oct_m::zbatch_set_state1
subroutine zbatch_set_state1(this, ist, np, psi)
Write a single state with np points into a batch at position ist.
Definition: batch_ops.F90:3812

batch_ops_oct_m::batch_add_with_map_cpu
subroutine batch_add_with_map_cpu(np, map, xx, yy, zz)
Definition: batch_ops.F90:613

batch_ops_oct_m::zbatch_scal_vec
subroutine zbatch_scal_vec(np, aa, xx, a_start, a_full)
Definition: batch_ops.F90:4444

batch_ops_oct_m::zbatch_set_state2
subroutine zbatch_set_state2(this, index, np, psi)
Write a single state with np points into a batch at position defined by index.
Definition: batch_ops.F90:3912

batch_ops_oct_m::zbatch_axpy_function
subroutine, public zbatch_axpy_function(np, aa, xx, psi, nst)
This routine performs a set of axpy operations for each function x of a batch (xx),...
Definition: batch_ops.F90:3000

batch_ops_oct_m::zbatch_set_points
subroutine zbatch_set_points(this, sp, ep, psi)
copy a set of points into a mesh function
Definition: batch_ops.F90:4215

batch_ops_oct_m::zbatch_add_with_map
subroutine zbatch_add_with_map(np, map, xx, yy, zz)
Definition: batch_ops.F90:4289

batch_ops_oct_m::dbatch_axpy_function
subroutine, public dbatch_axpy_function(np, aa, xx, psi, nst)
This routine performs a set of axpy operations for each function x of a batch (xx),...
Definition: batch_ops.F90:1099

batch_ops_oct_m::batch_get_points_accel
subroutine batch_get_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
GPU version of batch_get_points.
Definition: batch_ops.F90:341

batch_ops_oct_m::zbatch_set_state3
subroutine zbatch_set_state3(this, ii, np, psi)
Write a set of states with np points into a batch.
Definition: batch_ops.F90:3930

batch_ops_oct_m::batch_copy_with_map_cpu
subroutine batch_copy_with_map_cpu(np, map, xx, yy)
Definition: batch_ops.F90:673

batch_ops_oct_m::dbatch_scal_const
subroutine dbatch_scal_const(np, aa, xx)
scale all functions in a batch by constant aa
Definition: batch_ops.F90:1287

blas_oct_m
This module contains interfaces for BLAS routines. You should not use these routines directly....
Definition: blas.F90:120

debug_oct_m
Definition: debug.F90:116

global_oct_m
Definition: global.F90:116

global_oct_m::m_zero
real(real64), parameter, public m_zero
Definition: global.F90:200

global_oct_m::not_in_openmp
logical pure function, public not_in_openmp()
Definition: global.F90:566

global_oct_m::m_z0
complex(real64), parameter, public m_z0
Definition: global.F90:210

lalg_basic_oct_m
Definition: lalg_basic.F90:116

math_oct_m
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117

messages_oct_m
Definition: messages.F90:117

messages_oct_m::messages_not_implemented
subroutine, public messages_not_implemented(feature, namespace)
Definition: messages.F90:1068

messages_oct_m::message
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162

messages_oct_m::messages_fatal
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410

profiling_oct_m
Definition: profiling.F90:118

profiling_oct_m::profiling_out
subroutine, public profiling_out(label)
Increment out counter and sum up difference between entry and exit time.
Definition: profiling.F90:631

profiling_oct_m::profiling_in
subroutine, public profiling_in(label, exclude)
Increment in counter and save entry time.
Definition: profiling.F90:554

types_oct_m
Definition: types.F90:116

types_oct_m::types_get_size
integer pure function, public types_get_size(this)
Definition: types.F90:154

types_oct_m::type_integer
type(type_t), parameter, public type_integer
Definition: types.F90:137

types_oct_m::type_float
type(type_t), parameter, public type_float
Definition: types.F90:135

accel_oct_m::accel_kernel_t
Definition: accel.F90:238

accel_oct_m::accel_mem_t
Definition: accel.F90:229

batch_oct_m::batch_t
Class defining batches of mesh functions.
Definition: batch.F90:162

true
int true(void)
Definition: symmetries_finite.c:3150