30 use iso_c_binding,
only: c_null_ptr, c_size_t
31 use,
intrinsic :: iso_fortran_env
91 integer,
public,
parameter :: &
92 ACCEL_MEM_READ_ONLY = 0, &
99 type(c_ptr) :: cuda_context
107#if defined(HAVE_CUDA)
108 type(c_ptr) :: cuda_device
116 type(accel_context_t) :: context
117 type(accel_device_t) :: device
118 type(c_ptr) :: cublas_handle
119 type(c_ptr) :: cuda_stream
120 type(c_ptr) :: module_map
121 integer :: max_block_size
122 integer(int64) :: shared_memory_size
123 integer(int64) :: global_memory_size
125 logical :: allow_CPU_only
128 integer(int64) :: initialize_buffers
129 character(len=32) :: debug_flag
130 integer(int64) :: max_block_dim(3)
131 integer(int64) :: max_grid_dim(3)
137 integer(c_size_t) :: size = 0
140 logical :: allocated = .false.
146 type(c_ptr) :: cuda_kernel
147 type(c_ptr) :: cuda_module
148 type(c_ptr) :: arguments
150 logical :: initialized = .false.
151 type(accel_kernel_t),
pointer :: next
153 character(len=128) :: kernel_name
156 type(accel_t),
public :: accel
159 type(accel_mem_t),
public,
save :: zM_0_buffer, zM_1_buffer
160 type(accel_mem_t),
public,
save :: dM_0_buffer, dM_1_buffer
163 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi
164 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_complex
165 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_spinors
166 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_spinors_complex
167 type(accel_kernel_t),
public,
target,
save :: kernel_daxpy
168 type(accel_kernel_t),
public,
target,
save :: kernel_zaxpy
169 type(accel_kernel_t),
public,
target,
save :: kernel_copy
170 type(accel_kernel_t),
public,
target,
save :: kernel_copy_complex_to_real
171 type(accel_kernel_t),
public,
target,
save :: kernel_copy_real_to_complex
172 type(accel_kernel_t),
public,
target,
save :: dpack
173 type(accel_kernel_t),
public,
target,
save :: zpack
174 type(accel_kernel_t),
public,
target,
save :: dunpack
175 type(accel_kernel_t),
public,
target,
save :: zunpack
176 type(accel_kernel_t),
public,
target,
save :: kernel_ghost_reorder
177 type(accel_kernel_t),
public,
target,
save :: kernel_density_real
178 type(accel_kernel_t),
public,
target,
save :: kernel_density_complex
179 type(accel_kernel_t),
public,
target,
save :: kernel_density_spinors
180 type(accel_kernel_t),
public,
target,
save :: kernel_phase
181 type(accel_kernel_t),
public,
target,
save :: kernel_phase_spiral
182 type(accel_kernel_t),
public,
target,
save :: dkernel_dot_matrix
183 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix
184 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix_spinors
185 type(accel_kernel_t),
public,
target,
save :: dkernel_batch_axpy
191 type(accel_kernel_t),
public,
target,
save :: dzmul
192 type(accel_kernel_t),
public,
target,
save :: zzmul
300 integer :: buffer_alloc_count
301 integer(int64) :: allocated_mem
302 type(accel_kernel_t),
pointer :: head
303 type(alloc_cache_t) :: memcache
309 enabled = accel%enabled
319 allow = accel%allow_CPU_only
328 type(mpi_grp_t),
intent(inout) :: base_grp
329 type(namespace_t),
intent(in) :: namespace
331 logical :: disable, default, run_benchmark
336 character(len=256) :: sys_name
342 buffer_alloc_count = 0
360 accel%enabled = .not. disable
363 if (accel%enabled)
then
364 message(1) =
'Octopus was compiled without Cuda support.'
392 if (idevice < 0)
then
400 if (idevice<0) idevice = 0
401 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
402 idevice, base_grp%rank)
405 write(
message(1),
'(A,I5,A,I5,2A)')
"Rank ", base_grp%rank,
" uses device number ", idevice, &
406 " on ", trim(sys_name)
410 call cublas_init(accel%cublas_handle, accel%cuda_stream)
421 accel%max_block_dim(1) = int(dim, int64)
423 accel%max_block_dim(2) = int(dim, int64)
425 accel%max_block_dim(3) = int(dim, int64)
427 accel%max_grid_dim(1) = int(dim, int64)
429 accel%max_grid_dim(2) = int(dim, int64)
431 accel%max_grid_dim(3) = int(dim, int64)
437 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
443 accel%debug_flag =
"-g"
444#elif defined(HAVE_CUDA)
445 accel%debug_flag =
"-lineinfo"
455 flags =
' -DRTYPE_DOUBLE')
457 flags =
'-DRTYPE_COMPLEX')
459 flags =
'-DRTYPE_DOUBLE')
461 flags =
'-DRTYPE_COMPLEX')
511 call parse_variable(namespace,
'AccelBenchmark', .false., run_benchmark)
515 if (run_benchmark)
then
536 call parse_variable(namespace,
'GPUAwareMPI', default, accel%cuda_mpi)
537 if (accel%cuda_mpi)
then
539 call messages_write(
"Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
555#if defined (HAVE_ACCEL)
560 call parse_variable(namespace,
'AllowCPUonly', default, accel%allow_CPU_only)
576 call parse_variable(namespace,
'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
591 character(kind=c_char) :: cval_str(257)
593 integer :: major, minor
594 character(len=256) :: val_str
603#ifdef __HIP_PLATFORM_AMD__
613#ifdef __HIP_PLATFORM_AMD__
621 cval_str = c_null_char
669 integer(int64) :: hits, misses
670 real(real64) :: volume_hits, volume_misses
686 if (.not. found)
exit
693 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
704 if (hits + misses > 0)
then
705 call messages_write(hits/real(hits + misses, real64)*100, fmt=
'(f6.1)', align_left = .
true.)
711 if (volume_hits + volume_misses > 0)
then
712 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt=
'(f6.1)', align_left = .
true.)
727 call cublas_end(accel%cublas_handle)
728 if (.not. accel%cuda_mpi)
then
729 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
733 if (buffer_alloc_count /= 0)
then
751 integer(int64),
intent(in) :: n(:)
752 integer(int64),
intent(in) :: blocksizes(:)
753 integer(int64),
intent(out) :: gridsizes(:)
757 dim = ubound(n, dim=1)
758 assert(dim == ubound(blocksizes, dim=1))
759 assert(dim == ubound(gridsizes, dim=1))
762 gridsizes(i) = (n(i) + blocksizes(i) - 1_int64) / blocksizes(i)
763 gridsizes(i) = min(gridsizes(i), accel%max_grid_dim(i))
771 integer,
intent(in) :: n(:)
772 integer,
intent(in) :: blocksizes(:)
773 integer,
intent(out) :: gridsizes(:)
775 integer(int64) :: gridsizes64(size(gridsizes))
779 gridsizes = int(gridsizes64, int32)
786 integer(int64),
intent(in) :: n
787 integer(int64),
intent(in) :: blocksizes
788 integer(int64),
intent(out) :: gridsizes
790 integer(int64) :: temp(1)
801 integer,
intent(in) :: n
802 integer,
intent(in) :: blocksizes
803 integer,
intent(out) :: gridsizes
805 integer(int64) :: temp(1)
809 gridsizes = int(temp(1), int32)
818 integer(int64),
intent(in) :: n
819 integer(int64),
intent(in) :: pack_size
820 integer(int64),
dimension(3),
intent(out) :: gridsizes
821 integer(int64),
dimension(3),
intent(out) :: blocksizes
824 integer(int64) :: bsize, dim2, dim3
825 integer(int64),
dimension(3) :: nn
827 if(
present(kernel))
then
836 nn = (/pack_size, dim2, dim3/)
837 blocksizes = (/pack_size, bsize, 1_int64/)
848 integer,
intent(in) :: n
849 integer,
intent(in) :: pack_size
850 integer,
dimension(3),
intent(out) :: gridsizes
851 integer,
dimension(3),
intent(out) :: blocksizes
854 integer(int64) :: gridsizes64(3), blocksizes64(3)
857 gridsizes64, blocksizes64, kernel=kernel)
859 gridsizes = int(gridsizes64, int32)
860 blocksizes = int(blocksizes64, int32)
866 integer(int64),
intent(in) :: nn
868 integer(int64) :: modnn, bsize
877 modnn = mod(nn, bsize)
878 if (modnn /= 0) psize = psize + bsize - modnn
887 integer(int32),
intent(in) :: nn
897 integer,
intent(in) :: flags
898 type(
type_t),
intent(in) :: type
899 integer,
intent(in) :: size
900 logical,
optional,
intent(in) :: set_zero
901 logical,
optional,
intent(in) :: async
910 integer,
intent(in) :: flags
911 type(
type_t),
intent(in) :: type
912 integer(int64),
intent(in) :: size
913 logical,
optional,
intent(in) :: set_zero
914 logical,
optional,
intent(in) :: async
916 integer(int64) :: fsize
918 integer(int64) :: initialize_buffers
926 this%allocated = .
true.
932 if (.not. found)
then
935 call cuda_mem_alloc_async(this%mem, fsize)
942 buffer_alloc_count = buffer_alloc_count + 1
943 allocated_mem = allocated_mem + fsize
947 if (
present(set_zero))
then
948 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
950 initialize_buffers = accel%initialize_buffers
952 select case (initialize_buffers)
953 case (option__initializegpubuffers__yes)
955 case (option__initializegpubuffers__nan)
966 logical,
optional,
intent(in) :: async
969 integer(int64) :: fsize
973 if (this%size > 0)
then
982 call cuda_mem_free_async(this%mem)
989 buffer_alloc_count = buffer_alloc_count - 1
990 allocated_mem = allocated_mem + fsize
997 this%allocated = .false.
1016 buffer_to%mem = buffer_from%mem
1017 buffer_to%size = buffer_from%size
1018 buffer_to%type = buffer_from%type
1019 buffer_to%flags = buffer_from%flags
1020 buffer_to%allocated = buffer_from%allocated
1038 this%mem = c_null_ptr
1042 this%allocated = .false.
1052 integer,
intent(in) :: flags
1053 type(
type_t),
intent(in) :: type
1054 integer,
intent(in) :: required_size
1055 logical,
intent(in) :: set_zero
1056 logical,
optional,
intent(in) :: async
1077 allocated = this%allocated
1096 integer,
intent(in) :: narg
1118 integer(int64),
intent(in) :: gridsizes(:)
1119 integer(int64),
intent(in) :: blocksizes(:)
1120 integer(int64),
optional,
intent(in) :: shared_memory_size
1123 integer(int64) :: gsizes(1:3)
1124 integer(int64) :: bsizes(1:3)
1132 dim = ubound(gridsizes, dim=1)
1134 assert(dim == ubound(blocksizes, dim=1))
1137 if (any(gridsizes == 0))
return
1139 assert(all(blocksizes > 0))
1141 gsizes(1:dim) = gridsizes(1:dim)
1142 bsizes(1:dim) = blocksizes(1:dim)
1146 if (any(bsizes(1:3) > accel%max_block_dim(1:3)))
then
1147 message(1) =
"Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1148 message(2) =
"The following conditions should be fulfilled:"
1149 write(
message(3),
"(A, I8, A, I8)")
"Dim 1: ", bsizes(1),
" <= ", accel%max_block_dim(1)
1150 write(
message(4),
"(A, I8, A, I8)")
"Dim 2: ", bsizes(2),
" <= ", accel%max_block_dim(2)
1151 write(
message(5),
"(A, I8, A, I8)")
"Dim 3: ", bsizes(3),
" <= ", accel%max_block_dim(3)
1152 message(6) =
"This is an internal error, please contact the developers."
1159 message(1) =
"Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1160 message(2) =
"The following condition should be fulfilled:"
1162 message(4) =
"This is an internal error, please contact the developers."
1167 if (any(gsizes(1:3) > accel%max_grid_dim(1:3)))
then
1168 message(1) =
"Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1169 message(2) =
"The following conditions should be fulfilled:"
1170 write(
message(3),
"(A, I8, A, I10)")
"Dim 1: ", gsizes(1),
" <= ", accel%max_grid_dim(1)
1171 write(
message(4),
"(A, I8, A, I10)")
"Dim 2: ", gsizes(2),
" <= ", accel%max_grid_dim(2)
1172 write(
message(5),
"(A, I8, A, I10)")
"Dim 3: ", gsizes(3),
" <= ", accel%max_grid_dim(3)
1173 message(6) =
"This is an internal error, please contact the developers."
1177 if(
present(shared_memory_size))
then
1179 if (shared_memory_size > accel%shared_memory_size)
then
1180 message(1) =
"Shared memory too large in kernel "//trim(kernel%kernel_name)
1181 message(2) =
"The following condition should be fulfilled:"
1182 message(3) =
"Requested shared memory <= Available shared memory"
1183 write(
message(4),
'(a,f12.6,a)')
"Requested shared memory: ", real(shared_memory_size, real64) /1024.0,
" Kb"
1184 write(
message(5),
'(a,f12.6,a)')
"Available shared memory: ", real(accel%shared_memory_size, real64) /1024.0,
" Kb"
1185 message(6) =
"This is an internal error, please contact the developers."
1187 else if (shared_memory_size <= 0)
then
1188 message(1) =
"Invalid shared memory size in kernel "//trim(kernel%kernel_name)
1189 write(
message(2),
'(a,f12.6,a)')
"Shared memory size requested: ", real(shared_memory_size, real64) /1024.0,
" Kb"
1190 message(3) =
"This is an internal error, please contact the developers."
1211 integer,
intent(in) :: gridsizes(:)
1212 integer,
intent(in) :: blocksizes(:)
1213 integer(int64),
optional,
intent(in) :: shared_memory_size
1215 call accel_kernel_run_8(kernel, int(gridsizes, int64), int(blocksizes, int64), shared_memory_size)
1222 max_block_size = accel%max_block_size
1231 integer :: max_block_size
1237 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_block_size)
1238 if (debug%info .and. max_block_size /=
accel%max_block_size)
then
1239 write(message(1),
"(A, I5, A)")
"A kernel can use only less threads per block (", max_block_size,
")", &
1240 "than available on the device (",
accel%max_block_size,
")"
1241 call messages_info(1)
1249 block_size = min(block_size, max_block_size)
1258 type(type_t),
intent(in) :: type
1259 integer(int8),
intent(in) :: val
1260 integer(int64),
intent(in) :: nval
1261 integer(int64),
optional,
intent(in) :: offset
1262 logical,
optional,
intent(in) :: async
1264 integer(int64) :: nval_, offset_, type_size
1274 if (
present(offset))
then
1276 if(offset > buffer%size)
then
1282 type_size = types_get_size(type)
1284 nval_ = nval*type_size
1287 if (
present(offset)) offset_ = offset*type_size
1289 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1290 if(.not. optional_default(async, .false.))
call accel_finish()
1299 type(type_t),
intent(in) :: type
1300 integer(int64),
intent(in) :: nval
1301 integer(int64),
optional,
intent(in) :: offset
1302 logical,
optional,
intent(in) :: async
1315 type(type_t),
intent(in) :: type
1316 integer(int32),
intent(in) :: nval
1317 integer(int32),
optional,
intent(in) :: offset
1318 logical,
optional,
intent(in) :: async
1322 if (
present(offset))
then
1335 integer,
parameter :: times = 10
1337 real(real64) :: time, stime
1338 real(real64) :: read_bw, write_bw
1340 real(real64),
allocatable :: data(:)
1342 call messages_new_line()
1343 call messages_write(
'Info: Benchmarking the bandwidth between main memory and device memory')
1344 call messages_new_line()
1345 call messages_info()
1347 call messages_write(
' Buffer size Read bw Write bw')
1348 call messages_new_line()
1349 call messages_write(
' [MiB] [MiB/s] [MiB/s]')
1350 call messages_info()
1354 safe_allocate(
data(1:size))
1357 stime = loct_clock()
1362 time = (loct_clock() - stime)/real(times, real64)
1364 write_bw = real(
size, real64) *8.0_real64/time
1366 stime = loct_clock()
1372 time = (loct_clock() - stime)/real(times, real64)
1373 read_bw = real(
size, real64) *8.0_real64/time
1375 call messages_write(size*8.0_real64/1024.0_real64**2)
1376 call messages_write(write_bw/1024.0_real64**2, fmt =
'(f10.1)')
1377 call messages_write(read_bw/1024.0_real64**2, fmt =
'(f10.1)')
1378 call messages_info()
1382 safe_deallocate_a(data)
1384 size = int(size*2.0)
1386 if (
size > 50000000)
exit
1398 call cuda_module_map_init(
accel%module_map)
1411 if (.not.
associated(
head))
exit
1412 next_head =>
head%next
1418 call cuda_module_map_end(
accel%module_map)
1428 character(len=*),
intent(in) :: file_name
1429 character(len=*),
intent(in) :: kernel_name
1430 character(len=*),
optional,
intent(in) :: flags
1433 character(len=1000) :: all_flags
1438 call profiling_in(
"ACCEL_COMPILE", exclude = .
true.)
1441 all_flags =
'-I'//trim(conf%share)//
'/kernels/'//
" "//trim(
accel%debug_flag)
1443 if (
present(flags))
then
1444 all_flags = trim(all_flags)//
' '//trim(flags)
1447 call cuda_build_program(
accel%module_map, this%cuda_module,
accel%device%cuda_device, &
1448 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1450 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1451 call cuda_alloc_arg_array(this%arguments)
1454 this%initialized = .
true.
1455 this%kernel_name = trim(kernel_name)
1457 call profiling_out(
"ACCEL_COMPILE")
1470 call cuda_free_arg_array(this%arguments)
1471 call cuda_release_kernel(this%cuda_kernel)
1475 this%initialized = .false.
1484 character(len=*),
intent(in) :: file_name
1485 character(len=*),
intent(in) :: kernel_name
1486 character(len=*),
optional,
intent(in) :: flags
1490 if (.not. this%initialized)
then
1503 size =
accel%global_memory_size
1511 size =
accel%shared_memory_size
1517 integer,
intent(in) :: dim
1522 if (dim == 1)
size = 2**30
1529 integer,
intent(in) :: stream_number
1535 call cuda_set_stream(
accel%cuda_stream, stream_number)
1536 call cublas_set_stream(
accel%cublas_handle,
accel%cuda_stream)
1546 integer,
intent(inout) :: stream_number
1552 call cuda_get_stream(stream_number)
1566 call cuda_synchronize_all_streams()
1574 type(c_ptr),
intent(in) :: buffer
1575 integer(int64),
intent(in) :: offset
1576 type(c_ptr) :: buffer_offset
1580 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1583 buffer_offset = buffer
1589 type(c_ptr),
intent(in) :: buffer
1590 integer(int64),
intent(in) :: offset
1591 type(c_ptr) :: buffer_offset
1595 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1598 buffer_offset = buffer
1604 type(c_ptr),
intent(in) :: buffer
1608 call cuda_clean_pointer(buffer)
1617 integer(int64),
intent(in) :: size
1618 integer(int64),
intent(out) :: grid_size
1619 integer(int64),
intent(out) :: thread_block_size
1622#ifdef __HIP_PLATFORM_AMD__
1625 thread_block_size =
size
1628 thread_block_size =
accel%warp_size
1635#include "accel_inc.F90"
1638#include "complex.F90"
1639#include "accel_inc.F90"
1642#include "integer.F90"
1643#include "accel_inc.F90"
1646#include "integer8.F90"
1647#include "accel_inc.F90"
subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine, public accel_clean_pointer(buffer)
subroutine accel_kernel_global_end()
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
subroutine laccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
pure logical function, public accel_allow_cpu_only()
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
type(accel_kernel_t), target, save, public dkernel_batch_dotp
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine daccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
integer function, public accel_kernel_block_size(kernel)
subroutine zaccel_write_buffer_single(this, data, async)
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_axpy
subroutine zaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
subroutine laccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine laccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine, public accel_free_buffer(this, async)
subroutine daccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine, public accel_get_stream(stream_number)
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int64) pure function, public accel_global_memory_size()
subroutine daccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_ax_function_py
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
integer(int64) function accel_padded_size_i8(nn)
subroutine accel_check_bandwidth()
subroutine iaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine, public accel_finish()
subroutine laccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine accel_kernel_global_init()
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 4-byte integer sizes.
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for the kernels that relies on the batch size (pack_size) and the...
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
subroutine, public accel_move_buffer(buffer_from, buffer_to)
Move the buffer memory from the first buffer to the second.
subroutine, public accel_detach_buffer(this)
Clear a buffer handle without freeing device memory.
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
subroutine iaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_single(this, data, async)
subroutine laccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 8-byte integer sizes.
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
logical pure function, public accel_buffer_is_allocated(this)
integer, parameter, public accel_mem_read_write
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine accel_kernel_end(this)
type(accel_kernel_t), target, save, public dkernel_ax_function_py
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_write_buffer_single(this, data, async)
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
integer pure function, public accel_max_size_per_dim(dim)
subroutine zaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
subroutine iaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine zaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for the kernels that relies on the batch size (pack_size) and the...
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
subroutine, public accel_init(base_grp, namespace)
subroutine, public accel_end(namespace)
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine, public accel_synchronize_all_streams()
subroutine, public accel_set_stream(stream_number)
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
subroutine iaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int32) function accel_padded_size_i4(nn)
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
subroutine daccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_dotp
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
pure logical function, public accel_is_enabled()
subroutine zaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
integer, parameter, public accel_mem_write_only
subroutine zaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine laccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_single(this, data, async)
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
type(accel_t), public accel
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int64) pure function, public accel_shared_memory_size()
integer pure function, public accel_max_block_size()
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), pointer head
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
complex(real64), parameter, public m_z0
complex(real64), parameter, public m_z1
real(real64), parameter, public m_one
System information (time, memory, sysname)
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
subroutine, public loct_sysname(name)
This module is intended to contain "only mathematical" functions and procedures.
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
character(len=512), private msg
subroutine, public messages_warning(no_lines, all_nodes, namespace)
subroutine, public messages_obsolete_variable(namespace, name, rep)
subroutine, public messages_new_line()
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
subroutine, public messages_input_error(namespace, var, details, row, column)
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
type(type_t), parameter, public type_cmplx
integer pure function, public types_get_size(this)
type(type_t), parameter, public type_float
type(type_t), parameter, public type_none
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)