30 use iso_c_binding,
only: c_size_t
31 use,
intrinsic :: iso_fortran_env
89 integer,
public,
parameter :: &
90 ACCEL_MEM_READ_ONLY = 0, &
97 type(c_ptr) :: cuda_context
105#if defined(HAVE_CUDA)
106 type(c_ptr) :: cuda_device
114 type(accel_context_t) :: context
115 type(accel_device_t) :: device
116 type(c_ptr) :: cublas_handle
117 type(c_ptr) :: cuda_stream
118 type(c_ptr) :: module_map
119 integer :: max_block_size
120 integer(int64) :: shared_memory_size
121 integer(int64) :: global_memory_size
123 logical :: allow_CPU_only
126 integer(int64) :: initialize_buffers
127 character(len=32) :: debug_flag
128 integer(int64) :: max_block_dim(3)
129 integer(int64) :: max_grid_dim(3)
135 integer(c_size_t) :: size = 0
138 logical :: allocated = .false.
144 type(c_ptr) :: cuda_kernel
145 type(c_ptr) :: cuda_module
146 type(c_ptr) :: arguments
148 logical :: initialized = .false.
149 type(accel_kernel_t),
pointer :: next
151 character(len=128) :: kernel_name
154 type(accel_t),
public :: accel
157 type(accel_mem_t),
public,
save :: zM_0_buffer, zM_1_buffer
158 type(accel_mem_t),
public,
save :: dM_0_buffer, dM_1_buffer
161 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi
162 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_complex
163 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_spinors
164 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_spinors_complex
165 type(accel_kernel_t),
public,
target,
save :: kernel_daxpy
166 type(accel_kernel_t),
public,
target,
save :: kernel_zaxpy
167 type(accel_kernel_t),
public,
target,
save :: kernel_copy
168 type(accel_kernel_t),
public,
target,
save :: kernel_copy_complex_to_real
169 type(accel_kernel_t),
public,
target,
save :: kernel_copy_real_to_complex
170 type(accel_kernel_t),
public,
target,
save :: dpack
171 type(accel_kernel_t),
public,
target,
save :: zpack
172 type(accel_kernel_t),
public,
target,
save :: dunpack
173 type(accel_kernel_t),
public,
target,
save :: zunpack
174 type(accel_kernel_t),
public,
target,
save :: kernel_ghost_reorder
175 type(accel_kernel_t),
public,
target,
save :: kernel_density_real
176 type(accel_kernel_t),
public,
target,
save :: kernel_density_complex
177 type(accel_kernel_t),
public,
target,
save :: kernel_density_spinors
178 type(accel_kernel_t),
public,
target,
save :: kernel_phase
179 type(accel_kernel_t),
public,
target,
save :: kernel_phase_spiral
180 type(accel_kernel_t),
public,
target,
save :: dkernel_dot_matrix
181 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix
182 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix_spinors
183 type(accel_kernel_t),
public,
target,
save :: dkernel_batch_axpy
189 type(accel_kernel_t),
public,
target,
save :: dzmul
190 type(accel_kernel_t),
public,
target,
save :: zzmul
298 integer :: buffer_alloc_count
299 integer(int64) :: allocated_mem
300 type(accel_kernel_t),
pointer :: head
301 type(alloc_cache_t) :: memcache
307 enabled = accel%enabled
317 allow = accel%allow_CPU_only
326 type(mpi_grp_t),
intent(inout) :: base_grp
327 type(namespace_t),
intent(in) :: namespace
329 logical :: disable, default, run_benchmark
334 character(len=256) :: sys_name
340 buffer_alloc_count = 0
358 accel%enabled = .not. disable
361 if (accel%enabled)
then
362 message(1) =
'Octopus was compiled without Cuda support.'
390 if (idevice < 0)
then
398 if (idevice<0) idevice = 0
399 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
400 idevice, base_grp%rank)
403 write(
message(1),
'(A,I5,A,I5,2A)')
"Rank ", base_grp%rank,
" uses device number ", idevice, &
404 " on ", trim(sys_name)
408 call cublas_init(accel%cublas_handle, accel%cuda_stream)
419 accel%max_block_dim(1) = int(dim, int64)
421 accel%max_block_dim(2) = int(dim, int64)
423 accel%max_block_dim(3) = int(dim, int64)
425 accel%max_grid_dim(1) = int(dim, int64)
427 accel%max_grid_dim(2) = int(dim, int64)
429 accel%max_grid_dim(3) = int(dim, int64)
435 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
441 accel%debug_flag =
"-g"
442#elif defined(HAVE_CUDA)
443 accel%debug_flag =
"-lineinfo"
453 flags =
' -DRTYPE_DOUBLE')
455 flags =
'-DRTYPE_COMPLEX')
457 flags =
'-DRTYPE_DOUBLE')
459 flags =
'-DRTYPE_COMPLEX')
509 call parse_variable(namespace,
'AccelBenchmark', .false., run_benchmark)
513 if (run_benchmark)
then
534 call parse_variable(namespace,
'GPUAwareMPI', default, accel%cuda_mpi)
535 if (accel%cuda_mpi)
then
537 call messages_write(
"Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
553#if defined (HAVE_ACCEL)
558 call parse_variable(namespace,
'AllowCPUonly', default, accel%allow_CPU_only)
574 call parse_variable(namespace,
'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
589 character(kind=c_char) :: cval_str(257)
591 integer :: major, minor
592 character(len=256) :: val_str
601#ifdef __HIP_PLATFORM_AMD__
611#ifdef __HIP_PLATFORM_AMD__
619 cval_str = c_null_char
667 integer(int64) :: hits, misses
668 real(real64) :: volume_hits, volume_misses
684 if (.not. found)
exit
691 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
702 if (hits + misses > 0)
then
703 call messages_write(hits/real(hits + misses, real64)*100, fmt=
'(f6.1)', align_left = .
true.)
709 if (volume_hits + volume_misses > 0)
then
710 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt=
'(f6.1)', align_left = .
true.)
725 call cublas_end(accel%cublas_handle)
726 if (.not. accel%cuda_mpi)
then
727 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
731 if (buffer_alloc_count /= 0)
then
749 integer(int64),
intent(in) :: n(:)
750 integer(int64),
intent(in) :: blocksizes(:)
751 integer(int64),
intent(out) :: gridsizes(:)
755 dim = ubound(n, dim=1)
756 assert(dim == ubound(blocksizes, dim=1))
757 assert(dim == ubound(gridsizes, dim=1))
760 gridsizes(i) = (n(i) + blocksizes(i) - 1_int64) / blocksizes(i)
761 gridsizes(i) = min(gridsizes(i), accel%max_grid_dim(i))
769 integer,
intent(in) :: n(:)
770 integer,
intent(in) :: blocksizes(:)
771 integer,
intent(out) :: gridsizes(:)
773 integer(int64) :: gridsizes64(size(gridsizes))
777 gridsizes = int(gridsizes64, int32)
784 integer(int64),
intent(in) :: n
785 integer(int64),
intent(in) :: blocksizes
786 integer(int64),
intent(out) :: gridsizes
788 integer(int64) :: temp(1)
799 integer,
intent(in) :: n
800 integer,
intent(in) :: blocksizes
801 integer,
intent(out) :: gridsizes
803 integer(int64) :: temp(1)
807 gridsizes = int(temp(1), int32)
816 integer(int64),
intent(in) :: n
817 integer(int64),
intent(in) :: pack_size
818 integer(int64),
dimension(3),
intent(out) :: gridsizes
819 integer(int64),
dimension(3),
intent(out) :: blocksizes
822 integer(int64) :: bsize, dim2, dim3
823 integer(int64),
dimension(3) :: nn
825 if(
present(kernel))
then
834 nn = (/pack_size, dim2, dim3/)
835 blocksizes = (/pack_size, bsize, 1_int64/)
846 integer,
intent(in) :: n
847 integer,
intent(in) :: pack_size
848 integer,
dimension(3),
intent(out) :: gridsizes
849 integer,
dimension(3),
intent(out) :: blocksizes
852 integer(int64) :: gridsizes64(3), blocksizes64(3)
855 gridsizes64, blocksizes64, kernel=kernel)
857 gridsizes = int(gridsizes64, int32)
858 blocksizes = int(blocksizes64, int32)
864 integer(int64),
intent(in) :: nn
866 integer(int64) :: modnn, bsize
875 modnn = mod(nn, bsize)
876 if (modnn /= 0) psize = psize + bsize - modnn
885 integer(int32),
intent(in) :: nn
895 integer,
intent(in) :: flags
896 type(
type_t),
intent(in) :: type
897 integer,
intent(in) :: size
898 logical,
optional,
intent(in) :: set_zero
899 logical,
optional,
intent(in) :: async
908 integer,
intent(in) :: flags
909 type(
type_t),
intent(in) :: type
910 integer(int64),
intent(in) :: size
911 logical,
optional,
intent(in) :: set_zero
912 logical,
optional,
intent(in) :: async
914 integer(int64) :: fsize
916 integer(int64) :: initialize_buffers
924 this%allocated = .
true.
930 if (.not. found)
then
933 call cuda_mem_alloc_async(this%mem, fsize)
940 buffer_alloc_count = buffer_alloc_count + 1
941 allocated_mem = allocated_mem + fsize
945 if (
present(set_zero))
then
946 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
948 initialize_buffers = accel%initialize_buffers
950 select case (initialize_buffers)
951 case (option__initializegpubuffers__yes)
953 case (option__initializegpubuffers__nan)
964 logical,
optional,
intent(in) :: async
967 integer(int64) :: fsize
971 if (this%size > 0)
then
980 call cuda_mem_free_async(this%mem)
987 buffer_alloc_count = buffer_alloc_count - 1
988 allocated_mem = allocated_mem + fsize
995 this%allocated = .false.
1005 integer,
intent(in) :: flags
1006 type(
type_t),
intent(in) :: type
1007 integer,
intent(in) :: required_size
1008 logical,
intent(in) :: set_zero
1009 logical,
optional,
intent(in) :: async
1030 allocated = this%allocated
1049 integer,
intent(in) :: narg
1071 integer(int64),
intent(in) :: gridsizes(:)
1072 integer(int64),
intent(in) :: blocksizes(:)
1073 integer(int64),
optional,
intent(in) :: shared_memory_size
1076 integer(int64) :: gsizes(1:3)
1077 integer(int64) :: bsizes(1:3)
1085 dim = ubound(gridsizes, dim=1)
1087 assert(dim == ubound(blocksizes, dim=1))
1090 if (any(gridsizes == 0))
return
1092 assert(all(blocksizes > 0))
1094 gsizes(1:dim) = gridsizes(1:dim)
1095 bsizes(1:dim) = blocksizes(1:dim)
1099 if (any(bsizes(1:3) > accel%max_block_dim(1:3)))
then
1100 message(1) =
"Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1101 message(2) =
"The following conditions should be fulfilled:"
1102 write(
message(3),
"(A, I8, A, I8)")
"Dim 1: ", bsizes(1),
" <= ", accel%max_block_dim(1)
1103 write(
message(4),
"(A, I8, A, I8)")
"Dim 2: ", bsizes(2),
" <= ", accel%max_block_dim(2)
1104 write(
message(5),
"(A, I8, A, I8)")
"Dim 3: ", bsizes(3),
" <= ", accel%max_block_dim(3)
1105 message(6) =
"This is an internal error, please contact the developers."
1112 message(1) =
"Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1113 message(2) =
"The following condition should be fulfilled:"
1115 message(4) =
"This is an internal error, please contact the developers."
1120 if (any(gsizes(1:3) > accel%max_grid_dim(1:3)))
then
1121 message(1) =
"Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1122 message(2) =
"The following conditions should be fulfilled:"
1123 write(
message(3),
"(A, I8, A, I10)")
"Dim 1: ", gsizes(1),
" <= ", accel%max_grid_dim(1)
1124 write(
message(4),
"(A, I8, A, I10)")
"Dim 2: ", gsizes(2),
" <= ", accel%max_grid_dim(2)
1125 write(
message(5),
"(A, I8, A, I10)")
"Dim 3: ", gsizes(3),
" <= ", accel%max_grid_dim(3)
1126 message(6) =
"This is an internal error, please contact the developers."
1130 if(
present(shared_memory_size))
then
1132 if (shared_memory_size > accel%shared_memory_size)
then
1133 message(1) =
"Shared memory too large in kernel "//trim(kernel%kernel_name)
1134 message(2) =
"The following condition should be fulfilled:"
1135 message(3) =
"Requested shared memory <= Available shared memory"
1136 write(
message(4),
'(a,f12.6,a)')
"Requested shared memory: ", real(shared_memory_size, real64) /1024.0,
" Kb"
1137 write(
message(5),
'(a,f12.6,a)')
"Available shared memory: ", real(accel%shared_memory_size, real64) /1024.0,
" Kb"
1138 message(6) =
"This is an internal error, please contact the developers."
1140 else if (shared_memory_size <= 0)
then
1141 message(1) =
"Invalid shared memory size in kernel "//trim(kernel%kernel_name)
1142 write(
message(2),
'(a,f12.6,a)')
"Shared memory size requested: ", real(shared_memory_size, real64) /1024.0,
" Kb"
1143 message(3) =
"This is an internal error, please contact the developers."
1164 integer,
intent(in) :: gridsizes(:)
1165 integer,
intent(in) :: blocksizes(:)
1166 integer(int64),
optional,
intent(in) :: shared_memory_size
1168 call accel_kernel_run_8(kernel, int(gridsizes, int64), int(blocksizes, int64), shared_memory_size)
1175 max_block_size = accel%max_block_size
1184 integer :: max_block_size
1190 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_block_size)
1191 if (debug%info .and. max_block_size /=
accel%max_block_size)
then
1192 write(message(1),
"(A, I5, A)")
"A kernel can use only less threads per block (", max_block_size,
")", &
1193 "than available on the device (",
accel%max_block_size,
")"
1194 call messages_info(1)
1202 block_size = min(block_size, max_block_size)
1211 type(type_t),
intent(in) :: type
1212 integer(int8),
intent(in) :: val
1213 integer(int64),
intent(in) :: nval
1214 integer(int64),
optional,
intent(in) :: offset
1215 logical,
optional,
intent(in) :: async
1217 integer(int64) :: nval_, offset_, type_size
1227 if (
present(offset))
then
1229 if(offset > buffer%size)
then
1235 type_size = types_get_size(type)
1237 nval_ = nval*type_size
1240 if (
present(offset)) offset_ = offset*type_size
1242 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1243 if(.not. optional_default(async, .false.))
call accel_finish()
1252 type(type_t),
intent(in) :: type
1253 integer(int64),
intent(in) :: nval
1254 integer(int64),
optional,
intent(in) :: offset
1255 logical,
optional,
intent(in) :: async
1268 type(type_t),
intent(in) :: type
1269 integer(int32),
intent(in) :: nval
1270 integer(int32),
optional,
intent(in) :: offset
1271 logical,
optional,
intent(in) :: async
1275 if (
present(offset))
then
1288 integer,
parameter :: times = 10
1290 real(real64) :: time, stime
1291 real(real64) :: read_bw, write_bw
1293 real(real64),
allocatable :: data(:)
1295 call messages_new_line()
1296 call messages_write(
'Info: Benchmarking the bandwidth between main memory and device memory')
1297 call messages_new_line()
1298 call messages_info()
1300 call messages_write(
' Buffer size Read bw Write bw')
1301 call messages_new_line()
1302 call messages_write(
' [MiB] [MiB/s] [MiB/s]')
1303 call messages_info()
1307 safe_allocate(
data(1:size))
1310 stime = loct_clock()
1315 time = (loct_clock() - stime)/real(times, real64)
1317 write_bw = real(
size, real64) *8.0_real64/time
1319 stime = loct_clock()
1325 time = (loct_clock() - stime)/real(times, real64)
1326 read_bw = real(
size, real64) *8.0_real64/time
1328 call messages_write(size*8.0_real64/1024.0_real64**2)
1329 call messages_write(write_bw/1024.0_real64**2, fmt =
'(f10.1)')
1330 call messages_write(read_bw/1024.0_real64**2, fmt =
'(f10.1)')
1331 call messages_info()
1335 safe_deallocate_a(data)
1337 size = int(size*2.0)
1339 if (
size > 50000000)
exit
1351 call cuda_module_map_init(
accel%module_map)
1364 if (.not.
associated(
head))
exit
1365 next_head =>
head%next
1371 call cuda_module_map_end(
accel%module_map)
1381 character(len=*),
intent(in) :: file_name
1382 character(len=*),
intent(in) :: kernel_name
1383 character(len=*),
optional,
intent(in) :: flags
1386 character(len=1000) :: all_flags
1391 call profiling_in(
"ACCEL_COMPILE", exclude = .
true.)
1394 all_flags =
'-I'//trim(conf%share)//
'/kernels/'//
" "//trim(
accel%debug_flag)
1396 if (
present(flags))
then
1397 all_flags = trim(all_flags)//
' '//trim(flags)
1400 call cuda_build_program(
accel%module_map, this%cuda_module,
accel%device%cuda_device, &
1401 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1403 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1404 call cuda_alloc_arg_array(this%arguments)
1407 this%initialized = .
true.
1408 this%kernel_name = trim(kernel_name)
1410 call profiling_out(
"ACCEL_COMPILE")
1423 call cuda_free_arg_array(this%arguments)
1424 call cuda_release_kernel(this%cuda_kernel)
1428 this%initialized = .false.
1437 character(len=*),
intent(in) :: file_name
1438 character(len=*),
intent(in) :: kernel_name
1439 character(len=*),
optional,
intent(in) :: flags
1443 if (.not. this%initialized)
then
1456 size =
accel%global_memory_size
1464 size =
accel%shared_memory_size
1470 integer,
intent(in) :: dim
1475 if (dim == 1)
size = 2**30
1482 integer,
intent(in) :: stream_number
1488 call cuda_set_stream(
accel%cuda_stream, stream_number)
1489 call cublas_set_stream(
accel%cublas_handle,
accel%cuda_stream)
1499 integer,
intent(inout) :: stream_number
1505 call cuda_get_stream(stream_number)
1519 call cuda_synchronize_all_streams()
1527 type(c_ptr),
intent(in) :: buffer
1528 integer(int64),
intent(in) :: offset
1529 type(c_ptr) :: buffer_offset
1533 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1536 buffer_offset = buffer
1542 type(c_ptr),
intent(in) :: buffer
1543 integer(int64),
intent(in) :: offset
1544 type(c_ptr) :: buffer_offset
1548 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1551 buffer_offset = buffer
1557 type(c_ptr),
intent(in) :: buffer
1561 call cuda_clean_pointer(buffer)
1570 integer(int64),
intent(in) :: size
1571 integer(int64),
intent(out) :: grid_size
1572 integer(int64),
intent(out) :: thread_block_size
1575#ifdef __HIP_PLATFORM_AMD__
1578 thread_block_size =
size
1581 thread_block_size =
accel%warp_size
1588#include "accel_inc.F90"
1591#include "complex.F90"
1592#include "accel_inc.F90"
1595#include "integer.F90"
1596#include "accel_inc.F90"
1599#include "integer8.F90"
1600#include "accel_inc.F90"
subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine, public accel_clean_pointer(buffer)
subroutine accel_kernel_global_end()
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization.
subroutine laccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
pure logical function, public accel_allow_cpu_only()
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
type(accel_kernel_t), target, save, public dkernel_batch_dotp
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine daccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
integer function, public accel_kernel_block_size(kernel)
subroutine zaccel_write_buffer_single(this, data, async)
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_axpy
subroutine zaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
subroutine laccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine laccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine, public accel_free_buffer(this, async)
subroutine daccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine, public accel_get_stream(stream_number)
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int64) pure function, public accel_global_memory_size()
subroutine daccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_ax_function_py
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
integer(int64) function accel_padded_size_i8(nn)
subroutine accel_check_bandwidth()
subroutine iaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine, public accel_finish()
subroutine laccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine accel_kernel_global_init()
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 4-byte integer sizes.
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for kernels that rely on the batch size (pack_size) and the problem size (n).
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
subroutine iaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_single(this, data, async)
subroutine laccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 8-byte integer sizes.
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
logical pure function, public accel_buffer_is_allocated(this)
integer, parameter, public accel_mem_read_write
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine accel_kernel_end(this)
type(accel_kernel_t), target, save, public dkernel_ax_function_py
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_write_buffer_single(this, data, async)
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
integer pure function, public accel_max_size_per_dim(dim)
subroutine zaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
subroutine iaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine zaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for kernels that rely on the batch size (pack_size) and the problem size (n).
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
subroutine, public accel_init(base_grp, namespace)
subroutine, public accel_end(namespace)
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine, public accel_synchronize_all_streams()
subroutine, public accel_set_stream(stream_number)
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
subroutine iaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int32) function accel_padded_size_i4(nn)
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
subroutine daccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_dotp
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
pure logical function, public accel_is_enabled()
subroutine zaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
integer, parameter, public accel_mem_write_only
subroutine zaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine laccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_single(this, data, async)
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
type(accel_t), public accel
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int64) pure function, public accel_shared_memory_size()
integer pure function, public accel_max_block_size()
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), pointer head
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
complex(real64), parameter, public m_z0
complex(real64), parameter, public m_z1
real(real64), parameter, public m_one
System information (time, memory, sysname)
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
subroutine, public loct_sysname(name)
This module is intended to contain "only mathematical" functions and procedures.
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
character(len=512), private msg
subroutine, public messages_warning(no_lines, all_nodes, namespace)
subroutine, public messages_obsolete_variable(namespace, name, rep)
subroutine, public messages_new_line()
character(len=256), dimension(max_lines), public message
Message lines to be output by messages_fatal and messages_warning.
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
subroutine, public messages_input_error(namespace, var, details, row, column)
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
type(type_t), public type_float
type(type_t), public type_cmplx
integer pure function, public types_get_size(this)
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)