30 use iso_c_binding,
only: c_null_ptr, c_size_t
31 use,
intrinsic :: iso_fortran_env
90 integer,
public,
parameter :: &
91 ACCEL_MEM_READ_ONLY = 0, &
98 type(c_ptr) :: cuda_context
106#if defined(HAVE_CUDA)
107 type(c_ptr) :: cuda_device
115 type(accel_context_t) :: context
116 type(accel_device_t) :: device
117 type(c_ptr) :: cublas_handle
118 type(c_ptr) :: cuda_stream
119 type(c_ptr) :: module_map
120 integer :: max_block_size
121 integer(int64) :: shared_memory_size
122 integer(int64) :: global_memory_size
124 logical :: allow_CPU_only
127 integer(int64) :: initialize_buffers
128 character(len=32) :: debug_flag
129 integer(int64) :: max_block_dim(3)
130 integer(int64) :: max_grid_dim(3)
136 integer(c_size_t) :: size = 0
139 logical :: allocated = .false.
145 type(c_ptr) :: cuda_kernel
146 type(c_ptr) :: cuda_module
147 type(c_ptr) :: arguments
149 logical :: initialized = .false.
150 type(accel_kernel_t),
pointer :: next
152 character(len=128) :: kernel_name
155 type(accel_t),
public :: accel
158 type(accel_mem_t),
public,
save :: zM_0_buffer, zM_1_buffer
159 type(accel_mem_t),
public,
save :: dM_0_buffer, dM_1_buffer
162 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi
163 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_complex
164 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_spinors
165 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_spinors_complex
166 type(accel_kernel_t),
public,
target,
save :: kernel_daxpy
167 type(accel_kernel_t),
public,
target,
save :: kernel_zaxpy
168 type(accel_kernel_t),
public,
target,
save :: kernel_copy
169 type(accel_kernel_t),
public,
target,
save :: kernel_copy_complex_to_real
170 type(accel_kernel_t),
public,
target,
save :: kernel_copy_real_to_complex
171 type(accel_kernel_t),
public,
target,
save :: dpack
172 type(accel_kernel_t),
public,
target,
save :: zpack
173 type(accel_kernel_t),
public,
target,
save :: dunpack
174 type(accel_kernel_t),
public,
target,
save :: zunpack
175 type(accel_kernel_t),
public,
target,
save :: kernel_ghost_reorder
176 type(accel_kernel_t),
public,
target,
save :: kernel_density_real
177 type(accel_kernel_t),
public,
target,
save :: kernel_density_complex
178 type(accel_kernel_t),
public,
target,
save :: kernel_density_spinors
179 type(accel_kernel_t),
public,
target,
save :: kernel_phase
180 type(accel_kernel_t),
public,
target,
save :: kernel_phase_spiral
181 type(accel_kernel_t),
public,
target,
save :: dkernel_dot_matrix
182 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix
183 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix_spinors
184 type(accel_kernel_t),
public,
target,
save :: dkernel_batch_axpy
190 type(accel_kernel_t),
public,
target,
save :: dzmul
191 type(accel_kernel_t),
public,
target,
save :: zzmul
299 integer :: buffer_alloc_count
300 integer(int64) :: allocated_mem
301 type(accel_kernel_t),
pointer :: head
302 type(alloc_cache_t) :: memcache
308 enabled = accel%enabled
318 allow = accel%allow_CPU_only
327 type(mpi_grp_t),
intent(inout) :: base_grp
328 type(namespace_t),
intent(in) :: namespace
330 logical :: disable, default, run_benchmark
335 character(len=256) :: sys_name
341 buffer_alloc_count = 0
359 accel%enabled = .not. disable
362 if (accel%enabled)
then
363 message(1) =
'Octopus was compiled without Cuda support.'
391 if (idevice < 0)
then
399 if (idevice<0) idevice = 0
400 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
401 idevice, base_grp%rank)
404 write(
message(1),
'(A,I5,A,I5,2A)')
"Rank ", base_grp%rank,
" uses device number ", idevice, &
405 " on ", trim(sys_name)
409 call cublas_init(accel%cublas_handle, accel%cuda_stream)
420 accel%max_block_dim(1) = int(dim, int64)
422 accel%max_block_dim(2) = int(dim, int64)
424 accel%max_block_dim(3) = int(dim, int64)
426 accel%max_grid_dim(1) = int(dim, int64)
428 accel%max_grid_dim(2) = int(dim, int64)
430 accel%max_grid_dim(3) = int(dim, int64)
436 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
442 accel%debug_flag =
"-g"
443#elif defined(HAVE_CUDA)
444 accel%debug_flag =
"-lineinfo"
454 flags =
' -DRTYPE_DOUBLE')
456 flags =
'-DRTYPE_COMPLEX')
458 flags =
'-DRTYPE_DOUBLE')
460 flags =
'-DRTYPE_COMPLEX')
510 call parse_variable(namespace,
'AccelBenchmark', .false., run_benchmark)
514 if (run_benchmark)
then
535 call parse_variable(namespace,
'GPUAwareMPI', default, accel%cuda_mpi)
536 if (accel%cuda_mpi)
then
538 call messages_write(
"Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
554#if defined (HAVE_ACCEL)
559 call parse_variable(namespace,
'AllowCPUonly', default, accel%allow_CPU_only)
575 call parse_variable(namespace,
'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
590 character(kind=c_char) :: cval_str(257)
592 integer :: major, minor
593 character(len=256) :: val_str
602#ifdef __HIP_PLATFORM_AMD__
612#ifdef __HIP_PLATFORM_AMD__
620 cval_str = c_null_char
668 integer(int64) :: hits, misses
669 real(real64) :: volume_hits, volume_misses
685 if (.not. found)
exit
692 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
703 if (hits + misses > 0)
then
704 call messages_write(hits/real(hits + misses, real64)*100, fmt=
'(f6.1)', align_left = .
true.)
710 if (volume_hits + volume_misses > 0)
then
711 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt=
'(f6.1)', align_left = .
true.)
726 call cublas_end(accel%cublas_handle)
727 if (.not. accel%cuda_mpi)
then
728 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
732 if (buffer_alloc_count /= 0)
then
750 integer(int64),
intent(in) :: n(:)
751 integer(int64),
intent(in) :: blocksizes(:)
752 integer(int64),
intent(out) :: gridsizes(:)
756 dim = ubound(n, dim=1)
757 assert(dim == ubound(blocksizes, dim=1))
758 assert(dim == ubound(gridsizes, dim=1))
761 gridsizes(i) = (n(i) + blocksizes(i) - 1_int64) / blocksizes(i)
762 gridsizes(i) = min(gridsizes(i), accel%max_grid_dim(i))
770 integer,
intent(in) :: n(:)
771 integer,
intent(in) :: blocksizes(:)
772 integer,
intent(out) :: gridsizes(:)
774 integer(int64) :: gridsizes64(size(gridsizes))
778 gridsizes = int(gridsizes64, int32)
785 integer(int64),
intent(in) :: n
786 integer(int64),
intent(in) :: blocksizes
787 integer(int64),
intent(out) :: gridsizes
789 integer(int64) :: temp(1)
800 integer,
intent(in) :: n
801 integer,
intent(in) :: blocksizes
802 integer,
intent(out) :: gridsizes
804 integer(int64) :: temp(1)
808 gridsizes = int(temp(1), int32)
817 integer(int64),
intent(in) :: n
818 integer(int64),
intent(in) :: pack_size
819 integer(int64),
dimension(3),
intent(out) :: gridsizes
820 integer(int64),
dimension(3),
intent(out) :: blocksizes
823 integer(int64) :: bsize, dim2, dim3
824 integer(int64),
dimension(3) :: nn
826 if(
present(kernel))
then
835 nn = (/pack_size, dim2, dim3/)
836 blocksizes = (/pack_size, bsize, 1_int64/)
847 integer,
intent(in) :: n
848 integer,
intent(in) :: pack_size
849 integer,
dimension(3),
intent(out) :: gridsizes
850 integer,
dimension(3),
intent(out) :: blocksizes
853 integer(int64) :: gridsizes64(3), blocksizes64(3)
856 gridsizes64, blocksizes64, kernel=kernel)
858 gridsizes = int(gridsizes64, int32)
859 blocksizes = int(blocksizes64, int32)
865 integer(int64),
intent(in) :: nn
867 integer(int64) :: modnn, bsize
876 modnn = mod(nn, bsize)
877 if (modnn /= 0) psize = psize + bsize - modnn
886 integer(int32),
intent(in) :: nn
896 integer,
intent(in) :: flags
897 type(
type_t),
intent(in) :: type
898 integer,
intent(in) :: size
899 logical,
optional,
intent(in) :: set_zero
900 logical,
optional,
intent(in) :: async
909 integer,
intent(in) :: flags
910 type(
type_t),
intent(in) :: type
911 integer(int64),
intent(in) :: size
912 logical,
optional,
intent(in) :: set_zero
913 logical,
optional,
intent(in) :: async
915 integer(int64) :: fsize
917 integer(int64) :: initialize_buffers
925 this%allocated = .
true.
931 if (.not. found)
then
934 call cuda_mem_alloc_async(this%mem, fsize)
941 buffer_alloc_count = buffer_alloc_count + 1
942 allocated_mem = allocated_mem + fsize
946 if (
present(set_zero))
then
947 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
949 initialize_buffers = accel%initialize_buffers
951 select case (initialize_buffers)
952 case (option__initializegpubuffers__yes)
954 case (option__initializegpubuffers__nan)
965 logical,
optional,
intent(in) :: async
968 integer(int64) :: fsize
972 if (this%size > 0)
then
981 call cuda_mem_free_async(this%mem)
988 buffer_alloc_count = buffer_alloc_count - 1
989 allocated_mem = allocated_mem + fsize
996 this%allocated = .false.
1012 this%mem = c_null_ptr
1016 this%allocated = .false.
1026 integer,
intent(in) :: flags
1027 type(
type_t),
intent(in) :: type
1028 integer,
intent(in) :: required_size
1029 logical,
intent(in) :: set_zero
1030 logical,
optional,
intent(in) :: async
1051 allocated = this%allocated
1070 integer,
intent(in) :: narg
1092 integer(int64),
intent(in) :: gridsizes(:)
1093 integer(int64),
intent(in) :: blocksizes(:)
1094 integer(int64),
optional,
intent(in) :: shared_memory_size
1097 integer(int64) :: gsizes(1:3)
1098 integer(int64) :: bsizes(1:3)
1106 dim = ubound(gridsizes, dim=1)
1108 assert(dim == ubound(blocksizes, dim=1))
1111 if (any(gridsizes == 0))
return
1113 assert(all(blocksizes > 0))
1115 gsizes(1:dim) = gridsizes(1:dim)
1116 bsizes(1:dim) = blocksizes(1:dim)
1120 if (any(bsizes(1:3) > accel%max_block_dim(1:3)))
then
1121 message(1) =
"Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1122 message(2) =
"The following conditions should be fulfilled:"
1123 write(
message(3),
"(A, I8, A, I8)")
"Dim 1: ", bsizes(1),
" <= ", accel%max_block_dim(1)
1124 write(
message(4),
"(A, I8, A, I8)")
"Dim 2: ", bsizes(2),
" <= ", accel%max_block_dim(2)
1125 write(
message(5),
"(A, I8, A, I8)")
"Dim 3: ", bsizes(3),
" <= ", accel%max_block_dim(3)
1126 message(6) =
"This is an internal error, please contact the developers."
1133 message(1) =
"Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1134 message(2) =
"The following condition should be fulfilled:"
1136 message(4) =
"This is an internal error, please contact the developers."
1141 if (any(gsizes(1:3) > accel%max_grid_dim(1:3)))
then
1142 message(1) =
"Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1143 message(2) =
"The following conditions should be fulfilled:"
1144 write(
message(3),
"(A, I8, A, I10)")
"Dim 1: ", gsizes(1),
" <= ", accel%max_grid_dim(1)
1145 write(
message(4),
"(A, I8, A, I10)")
"Dim 2: ", gsizes(2),
" <= ", accel%max_grid_dim(2)
1146 write(
message(5),
"(A, I8, A, I10)")
"Dim 3: ", gsizes(3),
" <= ", accel%max_grid_dim(3)
1147 message(6) =
"This is an internal error, please contact the developers."
1151 if(
present(shared_memory_size))
then
1153 if (shared_memory_size > accel%shared_memory_size)
then
1154 message(1) =
"Shared memory too large in kernel "//trim(kernel%kernel_name)
1155 message(2) =
"The following condition should be fulfilled:"
1156 message(3) =
"Requested shared memory <= Available shared memory"
1157 write(
message(4),
'(a,f12.6,a)')
"Requested shared memory: ", real(shared_memory_size, real64) /1024.0,
" Kb"
1158 write(
message(5),
'(a,f12.6,a)')
"Available shared memory: ", real(accel%shared_memory_size, real64) /1024.0,
" Kb"
1159 message(6) =
"This is an internal error, please contact the developers."
1161 else if (shared_memory_size <= 0)
then
1162 message(1) =
"Invalid shared memory size in kernel "//trim(kernel%kernel_name)
1163 write(
message(2),
'(a,f12.6,a)')
"Shared memory size requested: ", real(shared_memory_size, real64) /1024.0,
" Kb"
1164 message(3) =
"This is an internal error, please contact the developers."
1185 integer,
intent(in) :: gridsizes(:)
1186 integer,
intent(in) :: blocksizes(:)
1187 integer(int64),
optional,
intent(in) :: shared_memory_size
1189 call accel_kernel_run_8(kernel, int(gridsizes, int64), int(blocksizes, int64), shared_memory_size)
1196 max_block_size = accel%max_block_size
1205 integer :: max_block_size
1211 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_block_size)
1212 if (debug%info .and. max_block_size /=
accel%max_block_size)
then
1213 write(message(1),
"(A, I5, A)")
"A kernel can use only less threads per block (", max_block_size,
")", &
1214 "than available on the device (",
accel%max_block_size,
")"
1215 call messages_info(1)
1223 block_size = min(block_size, max_block_size)
1232 type(type_t),
intent(in) :: type
1233 integer(int8),
intent(in) :: val
1234 integer(int64),
intent(in) :: nval
1235 integer(int64),
optional,
intent(in) :: offset
1236 logical,
optional,
intent(in) :: async
1238 integer(int64) :: nval_, offset_, type_size
1248 if (
present(offset))
then
1250 if(offset > buffer%size)
then
1256 type_size = types_get_size(type)
1258 nval_ = nval*type_size
1261 if (
present(offset)) offset_ = offset*type_size
1263 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1264 if(.not. optional_default(async, .false.))
call accel_finish()
1273 type(type_t),
intent(in) :: type
1274 integer(int64),
intent(in) :: nval
1275 integer(int64),
optional,
intent(in) :: offset
1276 logical,
optional,
intent(in) :: async
1289 type(type_t),
intent(in) :: type
1290 integer(int32),
intent(in) :: nval
1291 integer(int32),
optional,
intent(in) :: offset
1292 logical,
optional,
intent(in) :: async
1296 if (
present(offset))
then
1309 integer,
parameter :: times = 10
1311 real(real64) :: time, stime
1312 real(real64) :: read_bw, write_bw
1314 real(real64),
allocatable :: data(:)
1316 call messages_new_line()
1317 call messages_write(
'Info: Benchmarking the bandwidth between main memory and device memory')
1318 call messages_new_line()
1319 call messages_info()
1321 call messages_write(
' Buffer size Read bw Write bw')
1322 call messages_new_line()
1323 call messages_write(
' [MiB] [MiB/s] [MiB/s]')
1324 call messages_info()
1328 safe_allocate(
data(1:size))
1331 stime = loct_clock()
1336 time = (loct_clock() - stime)/real(times, real64)
1338 write_bw = real(
size, real64) *8.0_real64/time
1340 stime = loct_clock()
1346 time = (loct_clock() - stime)/real(times, real64)
1347 read_bw = real(
size, real64) *8.0_real64/time
1349 call messages_write(size*8.0_real64/1024.0_real64**2)
1350 call messages_write(write_bw/1024.0_real64**2, fmt =
'(f10.1)')
1351 call messages_write(read_bw/1024.0_real64**2, fmt =
'(f10.1)')
1352 call messages_info()
1356 safe_deallocate_a(data)
1358 size = int(size*2.0)
1360 if (
size > 50000000)
exit
1372 call cuda_module_map_init(
accel%module_map)
1385 if (.not.
associated(
head))
exit
1386 next_head =>
head%next
1392 call cuda_module_map_end(
accel%module_map)
1402 character(len=*),
intent(in) :: file_name
1403 character(len=*),
intent(in) :: kernel_name
1404 character(len=*),
optional,
intent(in) :: flags
1407 character(len=1000) :: all_flags
1412 call profiling_in(
"ACCEL_COMPILE", exclude = .
true.)
1415 all_flags =
'-I'//trim(conf%share)//
'/kernels/'//
" "//trim(
accel%debug_flag)
1417 if (
present(flags))
then
1418 all_flags = trim(all_flags)//
' '//trim(flags)
1421 call cuda_build_program(
accel%module_map, this%cuda_module,
accel%device%cuda_device, &
1422 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1424 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1425 call cuda_alloc_arg_array(this%arguments)
1428 this%initialized = .
true.
1429 this%kernel_name = trim(kernel_name)
1431 call profiling_out(
"ACCEL_COMPILE")
1444 call cuda_free_arg_array(this%arguments)
1445 call cuda_release_kernel(this%cuda_kernel)
1449 this%initialized = .false.
1458 character(len=*),
intent(in) :: file_name
1459 character(len=*),
intent(in) :: kernel_name
1460 character(len=*),
optional,
intent(in) :: flags
1464 if (.not. this%initialized)
then
1477 size =
accel%global_memory_size
1485 size =
accel%shared_memory_size
1491 integer,
intent(in) :: dim
1496 if (dim == 1)
size = 2**30
1503 integer,
intent(in) :: stream_number
1509 call cuda_set_stream(
accel%cuda_stream, stream_number)
1510 call cublas_set_stream(
accel%cublas_handle,
accel%cuda_stream)
1520 integer,
intent(inout) :: stream_number
1526 call cuda_get_stream(stream_number)
1540 call cuda_synchronize_all_streams()
1548 type(c_ptr),
intent(in) :: buffer
1549 integer(int64),
intent(in) :: offset
1550 type(c_ptr) :: buffer_offset
1554 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1557 buffer_offset = buffer
1563 type(c_ptr),
intent(in) :: buffer
1564 integer(int64),
intent(in) :: offset
1565 type(c_ptr) :: buffer_offset
1569 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1572 buffer_offset = buffer
1578 type(c_ptr),
intent(in) :: buffer
1582 call cuda_clean_pointer(buffer)
1591 integer(int64),
intent(in) :: size
1592 integer(int64),
intent(out) :: grid_size
1593 integer(int64),
intent(out) :: thread_block_size
1596#ifdef __HIP_PLATFORM_AMD__
1599 thread_block_size =
size
1602 thread_block_size =
accel%warp_size
1609#include "accel_inc.F90"
1612#include "complex.F90"
1613#include "accel_inc.F90"
1616#include "integer.F90"
1617#include "accel_inc.F90"
1620#include "integer8.F90"
1621#include "accel_inc.F90"
subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine, public accel_clean_pointer(buffer)
subroutine accel_kernel_global_end()
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
subroutine laccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
pure logical function, public accel_allow_cpu_only()
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
type(accel_kernel_t), target, save, public dkernel_batch_dotp
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine daccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
integer function, public accel_kernel_block_size(kernel)
subroutine zaccel_write_buffer_single(this, data, async)
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_axpy
subroutine zaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
subroutine laccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine laccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine, public accel_free_buffer(this, async)
subroutine daccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine, public accel_get_stream(stream_number)
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int64) pure function, public accel_global_memory_size()
subroutine daccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_ax_function_py
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
integer(int64) function accel_padded_size_i8(nn)
subroutine accel_check_bandwidth()
subroutine iaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine, public accel_finish()
subroutine laccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine accel_kernel_global_init()
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 4-byte integer sizes.
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
Helper subroutine to compute the grid for kernels that rely on the batch size (pack_size) and the...
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
subroutine, public accel_detach_buffer(this)
Clear a buffer handle without freeing device memory.
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
subroutine iaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_single(this, data, async)
subroutine laccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 8-byte integer sizes.
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
logical pure function, public accel_buffer_is_allocated(this)
integer, parameter, public accel_mem_read_write
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine accel_kernel_end(this)
type(accel_kernel_t), target, save, public dkernel_ax_function_py
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_write_buffer_single(this, data, async)
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
integer pure function, public accel_max_size_per_dim(dim)
subroutine zaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
subroutine iaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine zaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
Helper subroutine to compute the grid for kernels that rely on the batch size (pack_size) and the...
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
subroutine, public accel_init(base_grp, namespace)
subroutine, public accel_end(namespace)
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine, public accel_synchronize_all_streams()
subroutine, public accel_set_stream(stream_number)
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
subroutine iaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int32) function accel_padded_size_i4(nn)
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
subroutine daccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_dotp
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
pure logical function, public accel_is_enabled()
subroutine zaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
integer, parameter, public accel_mem_write_only
subroutine zaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine laccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_single(this, data, async)
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
type(accel_t), public accel
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int64) pure function, public accel_shared_memory_size()
integer pure function, public accel_max_block_size()
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), pointer head
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
complex(real64), parameter, public m_z0
complex(real64), parameter, public m_z1
real(real64), parameter, public m_one
System information (time, memory, sysname)
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
subroutine, public loct_sysname(name)
This module is intended to contain "only mathematical" functions and procedures.
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
character(len=512), private msg
subroutine, public messages_warning(no_lines, all_nodes, namespace)
subroutine, public messages_obsolete_variable(namespace, name, rep)
subroutine, public messages_new_line()
character(len=256), dimension(max_lines), public message
Lines to be output by messages_fatal or messages_warning.
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
subroutine, public messages_input_error(namespace, var, details, row, column)
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
type(type_t), public type_float
type(type_t), public type_cmplx
type(type_t), public type_none
integer pure function, public types_get_size(this)
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)