170 class(batch_t),
intent(inout) :: this
171 integer,
optional,
intent(in) :: np
173 logical,
optional,
intent(in) :: async
175 integer :: ist_linear, ist, ip, np_
183 select case (this%status())
186 assert(np_ <= int(this%pack_size(2), int32))
191 assert(np_ <= int(this%pack_size(2), int32))
196 do ist = 1, int(this%pack_size(1), int32)
197 this%dff_pack(ist, ip) =
m_zero
204 do ist = 1, int(this%pack_size(1), int32)
205 this%zff_pack(ist, ip) =
m_z0
213 assert(np_ <= ubound(this%dff_linear, dim=1))
214 do ist_linear = 1, this%nst_linear
217 this%dff_linear(ip, ist_linear) =
m_zero
222 assert(np_ <= ubound(this%zff_linear, dim=1))
223 do ist_linear = 1, this%nst_linear
226 this%zff_linear(ip, ist_linear) =
m_z0
232 message(1) =
"batch_set_zero: unknown batch status."
246 class(batch_t),
intent(in) :: this
247 integer,
intent(in) :: sp
248 integer,
intent(in) :: ep
249 type(accel_mem_t),
intent(inout) :: psi
250 integer,
intent(in) :: ldpsi1
251 integer,
intent(in) :: ldpsi2
253 integer :: tsize, ii, it
254 type(accel_kernel_t),
save :: kernel
255 integer,
allocatable :: linear_to_ist(:), linear_to_idim(:)
256 type(accel_mem_t) :: buff_linear_to_ist, buff_linear_to_idim
261 select case (this%status())
268 safe_allocate(linear_to_ist(1:this%nst_linear*tsize))
269 safe_allocate(linear_to_idim(1:this%nst_linear*tsize))
270 do ii = 1, this%nst_linear
272 linear_to_ist(tsize*(ii-1)+it) = tsize*(this%linear_to_ist(ii) - 1) + it - 1
273 linear_to_idim(tsize*(ii-1)+it) = this%linear_to_idim(ii) - 1
295 call accel_kernel_run(kernel, (/1_int64, int(ep - sp + 1, int64)/), (/this%pack_size_real(1), 1_int64/))
299 safe_deallocate_a(linear_to_ist)
300 safe_deallocate_a(linear_to_idim)
313 class(
batch_t),
intent(inout) :: this
314 integer,
intent(in) :: sp
315 integer,
intent(in) :: ep
317 integer,
intent(in) :: ldpsi1
318 integer,
intent(in) :: ldpsi2
320 integer :: tsize, ii, it
322 integer,
allocatable :: linear_to_ist(:), linear_to_idim(:)
323 type(
accel_mem_t) :: buff_linear_to_ist, buff_linear_to_idim
328 select case (this%status())
335 safe_allocate(linear_to_ist(1:this%nst_linear*tsize))
336 safe_allocate(linear_to_idim(1:this%nst_linear*tsize))
337 do ii = 1, this%nst_linear
339 linear_to_ist(tsize*(ii-1)+it) = tsize*(this%linear_to_ist(ii) - 1) + it - 1
340 linear_to_idim(tsize*(ii-1)+it) = this%linear_to_idim(ii) - 1
362 call accel_kernel_run(kernel, (/1_int64, int(ep - sp + 1, int64)/), (/this%pack_size_real(1), 1_int64/))
366 safe_deallocate_a(linear_to_ist)
367 safe_deallocate_a(linear_to_idim)
381 integer pure function batch_points_block_size() result(block_size)
391 integer,
intent(in) :: np
392 class(batch_t),
intent(in) :: xx
393 class(batch_t),
intent(in) :: yy
394 class(batch_t),
intent(inout) :: zz
395 logical,
optional,
intent(in) :: conjugate_yy
399 integer(int64),
dimension(3) :: gsizes, bsizes
400 type(accel_kernel_t),
save :: kernel
404 call xx%check_compatibility_with(yy)
405 call xx%check_compatibility_with(zz)
407 conj_yy = optional_default(conjugate_yy, .
true.)
409 select case (xx%status())
410 case (batch_not_packed)
411 if (xx%type() == type_cmplx)
then
414 do ii = 1, xx%nst_linear
417 zz%zff_linear(ip, ii) = xx%zff_linear(ip, ii)*conjg(yy%zff_linear(ip, ii))
424 do ii = 1, xx%nst_linear
427 zz%zff_linear(ip, ii) = xx%zff_linear(ip, ii)*yy%zff_linear(ip, ii)
435 do ii = 1, xx%nst_linear
438 zz%dff_linear(ip, ii) = xx%dff_linear(ip, ii)*yy%dff_linear(ip, ii)
446 if (xx%type() == type_cmplx)
then
451 do ii = 1, xx%nst_linear
452 zz%zff_pack(ii, ip) = xx%zff_pack(ii, ip)*conjg(yy%zff_pack(ii, ip))
459 do ii = 1, xx%nst_linear
460 zz%zff_pack(ii, ip) = xx%zff_pack(ii, ip)*yy%zff_pack(ii, ip)
468 do ii = 1, xx%nst_linear
469 zz%dff_pack(ii, ip) = xx%dff_pack(ii, ip)*yy%dff_pack(ii, ip)
474 case (batch_device_packed)
475 if (xx%type() == type_cmplx)
then
477 call accel_kernel_start_call(kernel,
'batch_mul.cu',
'zmul_conj')
479 call accel_kernel_start_call(kernel,
'batch_mul.cu',
'zmul')
482 call accel_set_kernel_arg(kernel, 0, np)
483 call accel_set_kernel_arg(kernel, 1, xx%ff_device)
484 call accel_set_kernel_arg(kernel, 2,
log2(int(xx%pack_size(1), int32)))
485 call accel_set_kernel_arg(kernel, 3, yy%ff_device)
486 call accel_set_kernel_arg(kernel, 4,
log2(int(yy%pack_size(1), int32)))
487 call accel_set_kernel_arg(kernel, 5, zz%ff_device)
488 call accel_set_kernel_arg(kernel, 6,
log2(int(zz%pack_size(1), int32)))
490 call accel_grid_size_extend_dim(int(np, int64), xx%pack_size(1), gsizes, bsizes, kernel)
492 call accel_kernel_start_call(kernel,
'batch_mul.cu',
'dmul')
494 call accel_set_kernel_arg(kernel, 0, np)
495 call accel_set_kernel_arg(kernel, 1, xx%ff_device)
496 call accel_set_kernel_arg(kernel, 2,
log2(int(xx%pack_size_real(1), int32)))
497 call accel_set_kernel_arg(kernel, 3, yy%ff_device)
498 call accel_set_kernel_arg(kernel, 4,
log2(int(yy%pack_size_real(1), int32)))
499 call accel_set_kernel_arg(kernel, 5, zz%ff_device)
500 call accel_set_kernel_arg(kernel, 6,
log2(int(zz%pack_size_real(1), int32)))
502 call accel_grid_size_extend_dim(int(np, int64), xx%pack_size_real(1), gsizes, bsizes, kernel)
505 call accel_kernel_run(kernel, gsizes, bsizes)
513 integer,
intent(in) :: np
514 integer,
intent(in) :: map(:)
515 class(batch_t),
intent(in) :: xx
516 class(batch_t),
intent(in) :: yy
517 class(batch_t),
intent(inout) :: zz
518 type(accel_mem_t) :: buff_map
522 if (xx%status() /= batch_device_packed)
then
523 if (xx%type() == type_float)
then
530 call accel_create_buffer(buff_map, accel_mem_read_only, type_integer, np)
531 call accel_write_buffer(buff_map, np, map)
533 call accel_free_buffer(buff_map)
541 integer,
intent(in) :: np
542 class(accel_mem_t),
intent(in) :: map
543 class(batch_t),
intent(in) :: xx
544 class(batch_t),
intent(in) :: yy
545 class(batch_t),
intent(inout) :: zz
547 type(accel_kernel_t),
save :: kernel
548 integer(int64),
dimension(3) :: gsizes, bsizes
552 call accel_kernel_start_call(kernel,
'copy.cu',
'add_with_map')
554 call accel_set_kernel_arg(kernel, 0, np)
555 call accel_set_kernel_arg(kernel, 1, map)
556 call accel_set_kernel_arg(kernel, 2, xx%ff_device)
557 call accel_set_kernel_arg(kernel, 3,
log2(int(xx%pack_size_real(1), int32)))
558 call accel_set_kernel_arg(kernel, 4, yy%ff_device)
559 call accel_set_kernel_arg(kernel, 5,
log2(int(yy%pack_size_real(1), int32)))
560 call accel_set_kernel_arg(kernel, 6, zz%ff_device)
561 call accel_set_kernel_arg(kernel, 7,
log2(int(zz%pack_size_real(1), int32)))
564 call accel_grid_size_extend_dim(int(np, int64), xx%pack_size_real(1), gsizes, bsizes, kernel)
566 call accel_kernel_run(kernel, gsizes, bsizes)
573 integer,
intent(in) :: np
574 integer,
intent(in) :: map(:)
575 class(batch_t),
intent(in) :: xx
576 class(batch_t),
intent(inout) :: yy
577 type(accel_mem_t) :: buff_map
581 if (xx%status() /= batch_device_packed)
then
582 if (xx%type() == type_float)
then
589 call accel_create_buffer(buff_map, accel_mem_read_only, type_integer, np)
590 call accel_write_buffer(buff_map, np, map)
592 call accel_free_buffer(buff_map)
600 integer,
intent(in) :: np
601 class(accel_mem_t),
intent(in) :: map
602 class(batch_t),
intent(in) :: xx
603 class(batch_t),
intent(inout) :: yy
605 type(accel_kernel_t),
save :: kernel
606 integer(int64),
dimension(3) :: gsizes, bsizes
610 call accel_kernel_start_call(kernel,
'copy.cu',
'copy_with_map')
614 call accel_set_kernel_arg(kernel, 0, np)
615 call accel_set_kernel_arg(kernel, 1, map)
616 call accel_set_kernel_arg(kernel, 2, xx%ff_device)
617 call accel_set_kernel_arg(kernel, 3,
log2(int(xx%pack_size_real(1), int32)))
618 call accel_set_kernel_arg(kernel, 4, yy%ff_device)
619 call accel_set_kernel_arg(kernel, 5,
log2(int(yy%pack_size_real(1), int32)))
622 call accel_grid_size_extend_dim(int(np, int64), xx%pack_size_real(1), gsizes, bsizes, kernel)
624 call accel_kernel_run(kernel, gsizes, bsizes)
636 integer,
intent(in) :: np
637 class(batch_t),
intent(in) :: xx
638 class(batch_t),
intent(inout) :: yy
639 class(batch_t),
intent(inout) :: zz
641 integer :: ist_linear, ip
642 type(accel_kernel_t),
save :: kernel
643 integer(int64),
dimension(3) :: gsizes, bsizes
647 assert(xx%type() == type_cmplx)
648 assert(yy%type() == type_float)
649 assert(zz%type() == type_float)
650 assert(xx%status() == yy%status())
651 assert(xx%status() == zz%status())
653 select case (xx%status())
654 case (batch_not_packed)
655 do ist_linear = 1, xx%nst_linear
658 yy%dff_linear(ip, ist_linear) = real(xx%zff_linear(ip, ist_linear), real64)
659 zz%dff_linear(ip, ist_linear) = aimag(xx%zff_linear(ip, ist_linear))
665 do ist_linear = 1, xx%nst_linear
666 yy%dff_pack(ist_linear, ip) = real(xx%zff_pack(ist_linear, ip), real64)
667 zz%dff_pack(ist_linear, ip) = aimag(xx%zff_pack(ist_linear, ip))
670 case (batch_device_packed)
671 call accel_kernel_start_call(kernel,
'split.cu',
'split_complex')
673 call accel_set_kernel_arg(kernel, 0, int(xx%pack_size(2), int32))
674 call accel_set_kernel_arg(kernel, 1, xx%ff_device)
675 call accel_set_kernel_arg(kernel, 2,
log2(int(xx%pack_size(1), int32)))
676 call accel_set_kernel_arg(kernel, 3, yy%ff_device)
677 call accel_set_kernel_arg(kernel, 4,
log2(int(yy%pack_size(1), int32)))
678 call accel_set_kernel_arg(kernel, 5, zz%ff_device)
679 call accel_set_kernel_arg(kernel, 6,
log2(int(zz%pack_size(1), int32)))
682 call accel_grid_size_extend_dim(int(np, int64), xx%pack_size(1), gsizes, bsizes, kernel)
684 call accel_kernel_run(kernel, gsizes, bsizes)
691#include "batch_ops_inc.F90"
694#include "complex.F90"
695#include "batch_ops_inc.F90"
batchified version of the BLAS axpy routine:
batchified multiplication by mesh function with optional conjugation:
batchified scale with optional conjugation:
scale a batch by a constant or vector
There are several ways to call batch_set_state and batch_get_state:
double log2(double __x) __attribute__((__nothrow__
subroutine, public accel_free_buffer(this, async)
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
integer, parameter, public accel_mem_read_only
This module implements batches of mesh functions.
integer, parameter, public batch_not_packed
functions are stored in CPU memory, unpacked order
integer, parameter, public batch_device_packed
functions are stored in device memory in packed order
integer, parameter, public batch_packed
functions are stored in CPU memory, in transposed (packed) order
This module implements common operations on batches of mesh functions.
subroutine zbatch_get_state3(this, ii, np, psi, async)
subroutine dbatch_get_state3(this, ii, np, psi, async)
subroutine batch_copy_with_map_accel(np, map, xx, yy)
subroutine zbatch_get_state1(this, ist, np, psi, async)
Get a single state with np points from a batch.
subroutine, public zbatch_copy_with_map_to_array(np, map, xx, array)
Transfer a batch from the mesh to an array on the submesh (defined by a map)
subroutine, public dbatch_ax_function_py(np, aa, psi, yy)
This routine performs a set of axpy operations adding the same function psi to all functions of a bat...
subroutine dbatch_set_state1(this, ist, np, psi)
Write a single state with np points into a batch at position ist.
subroutine dbatch_axpy_const(np, aa, xx, yy)
This routine applies a 'pair-wise' axpy operation to all functions of the batches xx and yy,...
subroutine zbatch_axpby(np, aa, xx, bb, yy)
calculate yy(ist,:) = aa*xx(ist,:) + bb*yy(ist,:) for a batch
subroutine zbatch_get_points(this, sp, ep, psi)
copy a set of points into a mesh function
subroutine dbatch_xpay_vec(np, xx, aa, yy, a_start, a_full)
calculate yy(ist,:) = xx(ist,:) + aa(ist)*yy(ist,:) for a batch
subroutine dbatch_scal_vec(np, aa, xx, a_start, a_full)
scale all functions in a batch by state dependent constant
subroutine dbatch_set_state2(this, index, np, psi)
Write a single state with np points into a batch at position defined by index.
subroutine dbatch_copy_with_map(np, map, xx, yy)
subroutine, public dbatch_copy_with_map_to_array(np, map, xx, array)
Transfer a batch from the mesh to an array on the submesh (defined by a map)
subroutine dbatch_scal2v(np, aa, xx, yy, conjugate_xx)
calculate yy(ist,:) = aa*CJ(xx(ist,:)) for a batch
subroutine dbatch_get_state2(this, index, np, psi, async)
subroutine dbatch_axpy_vec(np, aa, xx, yy, a_start, a_full)
This routine applies a 'pair-wise' axpy operation to all functions of the batches xx and yy,...
subroutine dbatch_set_state3(this, ii, np, psi)
Write a set of state with np points into a batch.
subroutine zbatch_axpy_const(np, aa, xx, yy)
This routine applies a 'pair-wise' axpy operation to all functions of the batches xx and yy,...
subroutine zbatch_scal_const(np, aa, xx)
scale all functions in a batch by constant aa
subroutine zbatch_get_state2(this, index, np, psi, async)
subroutine zbatch_xpay_const(np, xx, aa, yy)
calculate yy(ist) = xx(ist) + aa*yy(ist) for a batch
subroutine, public batch_split_complex(np, xx, yy, zz)
extract the real and imaginary parts of a complex batch
subroutine batch_add_with_map_accel(np, map, xx, yy, zz)
subroutine dbatch_set_points(this, sp, ep, psi)
copy a set of points into a mesh function
subroutine dbatch_get_state1(this, ist, np, psi, async)
Get a single state with np points from a batch.
subroutine zbatch_copy_with_map(np, map, xx, yy)
subroutine dbatch_mul_mf(np, ff, xx, yy, conjugate_xx)
calculate yy(ist,:) = ff(:) * CJ(xx(ist,:)) for a batch
subroutine, public batch_set_zero(this, np, async)
fill all mesh functions of the batch with zero
subroutine, public zbatch_ax_function_py(np, aa, psi, yy)
This routine performs a set of axpy operations adding the same function psi to all functions of a bat...
subroutine zbatch_axpy_vec(np, aa, xx, yy, a_start, a_full)
This routine applies a 'pair-wise' axpy operation to all functions of the batches xx and yy,...
subroutine batch_set_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
GPU version of batch_set_points.
subroutine dbatch_xpay_const(np, xx, aa, yy)
calculate yy(ist) = xx(ist) + aa*yy(ist) for a batch
subroutine dbatch_axpby(np, aa, xx, bb, yy)
calculate yy(ist,:) = aa*xx(ist,:) + bb*yy(ist,:) for a batch
subroutine dbatch_get_points(this, sp, ep, psi)
copy a set of points into a mesh function
subroutine zbatch_scal2v(np, aa, xx, yy, conjugate_xx)
calculate yy(ist,:) = aa*CJ(xx(ist,:)) for a batch
integer pure function, public batch_points_block_size()
determine the device block size
subroutine dbatch_add_with_map(np, map, xx, yy, zz)
subroutine zbatch_mul_mf(np, ff, xx, yy, conjugate_xx)
calculate yy(ist,:) = ff(:) * CJ(xx(ist,:)) for a batch
subroutine zbatch_xpay_vec(np, xx, aa, yy, a_start, a_full)
calculate yy(ist,:) = xx(ist,:) + aa(ist)*yy(ist,:) for a batch
subroutine batch_mul_cj(np, xx, yy, zz, conjugate_yy)
Point-wise multiply two batches with optional conjugation on yy: zz_i = xx_i * CJ(yy_i)
subroutine zbatch_set_state1(this, ist, np, psi)
Write a single state with np points into a batch at position ist.
subroutine batch_add_with_map_cpu(np, map, xx, yy, zz)
subroutine zbatch_scal_vec(np, aa, xx, a_start, a_full)
scale all functions in a batch by state dependent constant
subroutine zbatch_set_state2(this, index, np, psi)
Write a single state with np points into a batch at position defined by index.
subroutine, public zbatch_axpy_function(np, aa, xx, psi, nst)
This routine performs a set of axpy operations for each function x of a batch (xx),...
subroutine zbatch_set_points(this, sp, ep, psi)
copy a set of points into a mesh function
subroutine zbatch_add_with_map(np, map, xx, yy, zz)
subroutine, public dbatch_axpy_function(np, aa, xx, psi, nst)
This routine performs a set of axpy operations for each function x of a batch (xx),...
subroutine batch_get_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
GPU version of batch_get_points.
subroutine zbatch_set_state3(this, ii, np, psi)
Write a set of state with np points into a batch.
subroutine batch_copy_with_map_cpu(np, map, xx, yy)
subroutine dbatch_scal_const(np, aa, xx)
scale all functions in a batch by constant aa
This module contains interfaces for BLAS routines. You should not use these routines directly....
real(real64), parameter, public m_zero
logical pure function, public not_in_openmp()
complex(real64), parameter, public m_z0
This module is intended to contain "only mathematical" functions and procedures.
subroutine, public messages_not_implemented(feature, namespace)
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
subroutine, public profiling_out(label)
Increment out counter and sum up difference between entry and exit time.
subroutine, public profiling_in(label, exclude)
Increment in counter and save entry time.
type(type_t), public type_float
type(type_t), public type_integer
integer pure function, public types_get_size(this)
Class defining batches of mesh functions.