#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
#error "Cannot compile with OpenCL and Cuda support at the same time"

#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)

#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)

  use iso_c_binding, only: c_size_t
  use, intrinsic :: iso_fortran_env

  integer, public, parameter :: &
    ACCEL_MEM_READ_ONLY = cl_mem_read_only, &

  integer, public, parameter :: &
    ACCEL_MEM_READ_ONLY = 0, &
    type(cl_context) :: cl_context
#elif defined(HAVE_CUDA)
    type(c_ptr) :: cuda_context

    type(cl_device_id) :: cl_device
#elif defined(HAVE_CUDA)
    type(c_ptr) :: cuda_device

    type(accel_context_t) :: context
    type(accel_device_t) :: device
    type(cl_command_queue) :: command_queue
    type(c_ptr) :: cublas_handle
    type(c_ptr) :: cuda_stream
    type(c_ptr) :: module_map
    integer :: max_workgroup_size
    integer(int64) :: local_memory_size
    integer(int64) :: global_memory_size
    logical :: allow_CPU_only
    logical :: shared_mem
    integer(int64) :: initialize_buffers
    character(len=32) :: debug_flag
    integer(int64) :: max_block_dim(3)
    integer(int64) :: max_grid_dim(3)

    integer(c_size_t) :: size = 0
    logical :: allocated = .false.

    type(cl_kernel) :: kernel
    type(c_ptr) :: cuda_kernel
    type(c_ptr) :: cuda_module
    type(c_ptr) :: arguments
    integer(int64) :: cuda_shared_mem
    logical :: initialized = .false.
    type(accel_kernel_t), pointer :: next
    character(len=128) :: kernel_name

  type(accel_t), public :: accel

  type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
  type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
  type(accel_kernel_t), public, target, save :: kernel_daxpy
  type(accel_kernel_t), public, target, save :: kernel_zaxpy
  type(accel_kernel_t), public, target, save :: kernel_copy
  type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
  type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
  type(accel_kernel_t), public, target, save :: dpack
  type(accel_kernel_t), public, target, save :: zpack

  type(accel_kernel_t), public, target, save :: kernel_density_complex
  type(accel_kernel_t), public, target, save :: kernel_density_spinors
  type(accel_kernel_t), public, target, save :: kernel_phase
  type(accel_kernel_t), public, target, save :: kernel_phase_spiral
  type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors

  type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
  type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
  integer, parameter :: &

  integer, parameter :: &
    CL_PLAT_INVALID = -1, &

  integer, public :: cl_status
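
  ! Module-level bookkeeping: the number of live device buffers, the total device memory
  ! allocated through this module, the head of the linked list of compiled kernels, and the
  ! allocation cache used to recycle device buffers.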
  integer :: buffer_alloc_count
  integer(int64) :: allocated_mem
  type(accel_kernel_t), pointer :: head
  type(alloc_cache_t) :: memcache
    enabled = accel%enabled

    allow = accel%allow_CPU_only

    type(mpi_grp_t), intent(inout) :: base_grp
    type(namespace_t), intent(in) :: namespace

    logical :: disable, default, run_benchmark
    integer :: idevice, iplatform
    integer :: device_type
    integer :: cl_status, idev
    integer :: ndevices, ret_devices, nplatforms, iplat
    character(len=256) :: device_name
    type(cl_platform_id) :: platform_id
    type(cl_program) :: prog
    type(cl_platform_id), allocatable :: allplatforms(:)
    type(cl_device_id), allocatable :: alldevices(:)
    integer :: max_work_item_dimensions
    integer(int64), allocatable :: max_work_item_sizes(:)
    character(len=256) :: sys_name
    buffer_alloc_count = 0

    accel%enabled = .not. disable

    if (accel%enabled) then
      message(1) = 'Octopus was compiled without OpenCL or Cuda support.'

    call parse_variable(namespace, 'AccelDevice', opencl_gpu, idevice)

    if (idevice < 0) idevice = 0
    call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
      idevice, base_grp%rank)

    write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
      " on ", trim(sys_name)

    accel%shared_mem = .true.

    call cublas_init(accel%cublas_handle, accel%cuda_stream)

    call clgetplatformids(nplatforms, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")

    safe_allocate(allplatforms(1:nplatforms))

    call clgetplatformids(allplatforms, iplat, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
    do iplat = 1, nplatforms
      call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)

      if (iplatform < 0) then
        if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1

      if (iplatform == iplat - 1) then
        call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)

    if (iplatform >= nplatforms .or. iplatform < 0) then
      if (iplatform > 0) then

    platform_id = allplatforms(iplatform + 1)

    safe_deallocate_a(allplatforms)

    call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)

    safe_allocate(alldevices(1:ndevices))

    call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)

    do idev = 1, ndevices
      call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)

    select case (idevice)
      device_type = cl_device_type_gpu
      device_type = cl_device_type_cpu
      device_type = cl_device_type_accelerator
      device_type = cl_device_type_default
      device_type = cl_device_type_all

    call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)

    if (ret_devices < 1) then
      call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)

      if (ret_devices < 1) then
        call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)

        if (ret_devices < 1) then

    ndevices = ret_devices

    if (idevice < 0) then
      if (base_grp%size > 1) then
        call select_device(idevice)

    if (idevice >= ndevices) then
      call messages_write('Requested CL device does not exist (device = ')

    accel%device%cl_device = alldevices(idevice + 1)

    accel%context%cl_context = clcreatecontext(platform_id, accel%device%cl_device, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateContext")

    safe_deallocate_a(alldevices)
    accel%command_queue = clcreatecommandqueue(accel%context%cl_context, accel%device%cl_device, &
      cl_queue_profiling_enable, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateCommandQueue")

    call clgetdeviceinfo(accel%device%cl_device, cl_device_type, device_type, cl_status)

    select case (device_type)
    case (cl_device_type_gpu)
      accel%shared_mem = .true.
    case (cl_device_type_cpu, cl_device_type_accelerator)
      accel%shared_mem = .false.

      accel%shared_mem = .false.

    call clblassetup(cl_status)

    call clfftsetup(cl_status)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_size, accel%global_memory_size, cl_status)
    call clgetdeviceinfo(accel%device%cl_device, cl_device_local_mem_size, accel%local_memory_size, cl_status)
    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_group_size, accel%max_workgroup_size, cl_status)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_item_dimensions, max_work_item_dimensions, cl_status)
    if (max_work_item_dimensions < 3) then
      message(1) = "Octopus requires a device where CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS is at least 3."

    safe_allocate(max_work_item_sizes(1:max_work_item_dimensions))
    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_item_sizes, max_work_item_sizes(1), cl_status)
    accel%max_block_dim(:) = max_work_item_sizes(1:3)
    safe_deallocate_a(max_work_item_sizes)

    accel%max_grid_dim(1) = (2_int64)**31 - 1_int64
    accel%max_grid_dim(2) = 65536_int64
    accel%max_grid_dim(3) = 65536_int64

    accel%max_block_dim(1) = int(dim, int64)
    accel%max_block_dim(2) = int(dim, int64)
    accel%max_block_dim(3) = int(dim, int64)
    accel%max_grid_dim(1) = int(dim, int64)
    accel%max_grid_dim(2) = int(dim, int64)
    accel%max_grid_dim(3) = int(dim, int64)
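
    ! The allocation cache may hold up to 25% of the device global memory.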
    call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
    accel%debug_flag = "-g"
#elif defined(HAVE_CUDA)
    accel%debug_flag = "-lineinfo"
#elif defined(HAVE_OPENCL)
    accel%debug_flag = "-g"

    flags = ' -DRTYPE_DOUBLE')
    flags = '-DRTYPE_COMPLEX')
    flags = '-DRTYPE_DOUBLE')
    flags = '-DRTYPE_COMPLEX')

    call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)

    if (run_benchmark) then

    call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
    if (accel%cuda_mpi) then
      call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")

#if defined (HAVE_ACCEL)
    call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)

    call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
#if defined(HAVE_OPENCL)
  subroutine select_device(idevice)
    integer, intent(inout) :: idevice

    character(len=256) :: device_name

    idevice = mod(base_grp%rank, ndevices)

    call base_grp%barrier()
    do irank = 0, base_grp%size - 1
      if (irank == base_grp%rank) then
        call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)

    call base_grp%barrier()

  end subroutine select_device

    integer(int64) :: val
    character(kind=c_char) :: cval_str(257)
    integer :: major, minor
    character(len=256) :: val_str

#ifdef __HIP_PLATFORM_AMD__
#ifdef __HIP_PLATFORM_AMD__

    call clgetdeviceinfo(accel%device%cl_device, cl_device_type, val, cl_status)

    select case (int(val, int32))
    case (cl_device_type_gpu)
    case (cl_device_type_cpu)
    case (cl_device_type_accelerator)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_vendor, val_str, cl_status)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_name, val_str, cl_status)
    cval_str = c_null_char

    call clgetdeviceinfo(accel%device%cl_device, cl_driver_version, val_str, cl_status)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_compute_units, val, cl_status)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)

    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)

    call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))

    call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))

    call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_int64_base_atomics"))
  integer function get_platform_id(platform_name) result(platform_id)
    character(len=*), intent(in) :: platform_name

    platform_id = cl_plat_invalid
    if (index(platform_name, 'AMD') > 0) platform_id = cl_plat_amd
    if (index(platform_name, 'ATI') > 0) platform_id = cl_plat_ati
    if (index(platform_name, 'NVIDIA') > 0) platform_id = cl_plat_nvidia
    if (index(platform_name, 'Intel') > 0) platform_id = cl_plat_intel
  end function get_platform_id
    integer(int64) :: hits, misses
    real(real64) :: volume_hits, volume_misses

      if (.not. found) exit

      call clreleasememobject(tmp%mem, ierr)
      if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")

    call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)

    if (hits + misses > 0) then

    if (volume_hits + volume_misses > 0) then
      call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt = '(f6.1)', align_left = .true.)

    call clblasteardown()

    call clfftteardown()

    call cublas_end(accel%cublas_handle)
    if (.not. accel%cuda_mpi) then
      call cuda_end(accel%context%cuda_context, accel%device%cuda_device)

    call clreleasecommandqueue(accel%command_queue, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseCommandQueue")
    call clreleasecontext(accel%context%cl_context, cl_status)

    if (buffer_alloc_count /= 0) then

    integer(int64), intent(in) :: nn

    integer(int64) :: modnn, bsize
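
    ! Round psize up to the next multiple of the block size bsize; e.g. assuming psize starts
    ! at nn, bsize = 32 and nn = 100 give a padded size of 128.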
    modnn = mod(nn, bsize)
    if (modnn /= 0) psize = psize + bsize - modnn
    integer(int32), intent(in) :: nn

    integer, intent(in) :: flags
    type(type_t), intent(in) :: type
    integer, intent(in) :: size
    logical, optional, intent(in) :: set_zero
    logical, optional, intent(in) :: async

    integer, intent(in) :: flags
    integer(int64), intent(in) :: size
    logical, optional, intent(in) :: set_zero
    logical, optional, intent(in) :: async

    integer(int64) :: fsize
    integer(int64) :: initialize_buffers

    this%allocated = .true.

    if (.not. found) then
      this%mem = clcreatebuffer(accel%context%cl_context, flags, fsize, ierr)
      if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateBuffer")

      call cuda_mem_alloc_async(this%mem, fsize)

    buffer_alloc_count = buffer_alloc_count + 1
    allocated_mem = allocated_mem + fsize
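
    ! An explicit set_zero argument overrides the global InitializeGPUBuffers setting.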
    if (present(set_zero)) then
      initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)

      initialize_buffers = accel%initialize_buffers

    select case (initialize_buffers)
    case (option__initializegpubuffers__yes)
    case (option__initializegpubuffers__nan)

    logical, optional, intent(in) :: async

    integer(int64) :: fsize

    if (this%size > 0) then

      call clreleasememobject(this%mem, ierr)
      if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")

      call cuda_mem_free_async(this%mem)

    buffer_alloc_count = buffer_alloc_count - 1
    allocated_mem = allocated_mem - fsize
    this%allocated = .false.

    integer, intent(in) :: flags
    type(type_t), intent(in) :: type
    integer, intent(in) :: required_size
    logical, intent(in) :: set_zero
    logical, optional, intent(in) :: async

    if (buffer%size < required_size) then

    allocated = this%allocated

    call clfinish(accel%command_queue, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, 'clFinish')

    integer, intent(in) :: narg

    call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clSetKernelArg_buf")

    integer, intent(in) :: narg
    type(type_t), intent(in) :: type
    integer, intent(in) :: size

    integer(int64) :: size_in_bytes

    if (size_in_bytes > accel%local_memory_size) then
      write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64)/1024.0, " Kb"
      write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64)/1024.0, " Kb"
    else if (size_in_bytes <= 0) then
      write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes

    kernel%cuda_shared_mem = size_in_bytes

    call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "set_kernel_arg_local")

    integer(int64), intent(in) :: globalsizes(:)
    integer(int64), intent(in) :: localsizes(:)

    integer(int64) :: gsizes(1:3)
    integer(int64) :: lsizes(1:3)

    dim = ubound(globalsizes, dim=1)

    assert(dim == ubound(localsizes, dim=1))

    if (any(globalsizes == 0)) return

    assert(all(localsizes > 0))

    assert(all(mod(globalsizes, localsizes) == 0))

    gsizes(1:dim) = globalsizes(1:dim)
    lsizes(1:dim) = localsizes(1:dim)

    call clenqueuendrangekernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
    if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
      message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
      message(2) = "The following conditions should be fulfilled:"
      write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
      write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
      write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
      message(6) = "This is an internal error, please contact the developers."

      message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
      message(2) = "The following condition should be fulfilled:"
      message(4) = "This is an internal error, please contact the developers."
    gsizes(1:3) = gsizes(1:3)/lsizes(1:3)

    if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
      message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
      message(2) = "The following conditions should be fulfilled:"
      write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
      write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
      write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
      message(6) = "This is an internal error, please contact the developers."

    call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)

    kernel%cuda_shared_mem = 0

    integer, intent(in) :: globalsizes(:)
    integer, intent(in) :: localsizes(:)

    max_workgroup_size = accel%max_workgroup_size

    integer(int64) :: workgroup_size8
    integer :: max_workgroup_size
    call clgetkernelworkgroupinfo(kernel%kernel, accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "GetKernelWorkGroupInfo")
    workgroup_size = workgroup_size8

    call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
    if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
      write(message(1), "(A, I5, A)") "A kernel can only use fewer threads per block (", workgroup_size, ") ", &
        "than available on the device (", accel%max_workgroup_size, ")"
      call messages_info(1)

    workgroup_size = 256

    workgroup_size = min(workgroup_size, max_workgroup_size)

  subroutine opencl_build_program(prog, filename, flags)
    type(cl_program), intent(inout) :: prog
    character(len=*), intent(in) :: filename
    character(len=*), optional, intent(in) :: flags

    character(len = 1000) :: string
    character(len = 256) :: share_string
    integer :: ierr, ierrlog, iunit, irec, newlen

    push_sub(opencl_build_program)

    string = '#include "'//trim(filename)//'"'

    call messages_write("Building CL program '"//trim(filename)//"'.")
    call messages_info(debug_only=.true.)

    prog = clcreateprogramwithsource(accel%context%cl_context, trim(string), ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateProgramWithSource")

    string = trim(string)//' -cl-denorms-are-zero'
    string = trim(string)//' -cl-mad-enable'
    string = trim(string)//' -cl-unsafe-math-optimizations'
    string = trim(string)//' -cl-finite-math-only'
    string = trim(string)//' -cl-fast-relaxed-math'

    share_string = '-I'//trim(conf%share)//'/opencl/'

    if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
      string = trim(string)//' -DEXT_KHR_FP64'
    else if (f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
      string = trim(string)//' -DEXT_AMD_FP64'

      call messages_write('Octopus requires an OpenCL device with double-precision support.')
      call messages_fatal()

    string = trim(string)//' -DSHARED_MEM'

    if (present(flags)) then
      string = trim(string)//' '//trim(flags)

    call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
    call messages_write(' '//trim(share_string)//"'.")
    call messages_info(debug_only=.true.)

    string = trim(string)//' '//trim(share_string)

    call clbuildprogram(prog, trim(string), ierr)

    if (ierr /= cl_success) then
      call clgetprogrambuildinfo(prog, accel%device%cl_device, cl_program_build_log, string, ierrlog)
      if (ierrlog /= cl_success) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
      newlen = scan(string, achar(010), back = .true.) - 1
      if (newlen >= 0) string = string(1:newlen)

      if (len(trim(string)) > 0) write(stderr, '(a)') trim(string)

      call opencl_print_error(ierr, "clBuildProgram")

    pop_sub(opencl_build_program)
  end subroutine opencl_build_program

  subroutine opencl_release_program(prog)
    type(cl_program), intent(inout) :: prog

    push_sub(opencl_release_program)

    call clreleaseprogram(prog, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseProgram")

    pop_sub(opencl_release_program)
  end subroutine opencl_release_program

  subroutine opencl_release_kernel(prog)
    type(cl_kernel), intent(inout) :: prog

    push_sub(opencl_release_kernel)

    call clreleasekernel(prog, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseKernel")

    pop_sub(opencl_release_kernel)
  end subroutine opencl_release_kernel

  subroutine opencl_create_kernel(kernel, prog, name)
    type(cl_kernel), intent(inout) :: kernel
    type(cl_program), intent(inout) :: prog
    character(len=*), intent(in) :: name

    push_sub(opencl_create_kernel)
    call profiling_in("CL_BUILD_KERNEL", exclude = .true.)

    kernel = clcreatekernel(prog, name, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateKernel")

    call profiling_out("CL_BUILD_KERNEL")
    pop_sub(opencl_create_kernel)
  end subroutine opencl_create_kernel

  subroutine opencl_print_error(ierr, name)
    integer, intent(in) :: ierr
    character(len=*), intent(in) :: name

    character(len=40) :: errcode

    push_sub(opencl_print_error)
    case (cl_success); errcode = 'CL_SUCCESS'
    case (cl_device_not_found); errcode = 'CL_DEVICE_NOT_FOUND'
    case (cl_device_not_available); errcode = 'CL_DEVICE_NOT_AVAILABLE'
    case (cl_compiler_not_available); errcode = 'CL_COMPILER_NOT_AVAILABLE'
    case (cl_mem_object_allocation_failure); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE'
    case (cl_out_of_resources); errcode = 'CL_OUT_OF_RESOURCES'
    case (cl_out_of_host_memory); errcode = 'CL_OUT_OF_HOST_MEMORY'
    case (cl_profiling_info_not_available); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE'
    case (cl_mem_copy_overlap); errcode = 'CL_MEM_COPY_OVERLAP'
    case (cl_image_format_mismatch); errcode = 'CL_IMAGE_FORMAT_MISMATCH'
    case (cl_image_format_not_supported); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED'
    case (cl_build_program_failure); errcode = 'CL_BUILD_PROGRAM_FAILURE'
    case (cl_map_failure); errcode = 'CL_MAP_FAILURE'
    case (cl_invalid_value); errcode = 'CL_INVALID_VALUE'
    case (cl_invalid_device_type); errcode = 'CL_INVALID_DEVICE_TYPE'
    case (cl_invalid_platform); errcode = 'CL_INVALID_PLATFORM'
    case (cl_invalid_device); errcode = 'CL_INVALID_DEVICE'
    case (cl_invalid_context); errcode = 'CL_INVALID_CONTEXT'
    case (cl_invalid_queue_properties); errcode = 'CL_INVALID_QUEUE_PROPERTIES'
    case (cl_invalid_command_queue); errcode = 'CL_INVALID_COMMAND_QUEUE'
    case (cl_invalid_host_ptr); errcode = 'CL_INVALID_HOST_PTR'
    case (cl_invalid_mem_object); errcode = 'CL_INVALID_MEM_OBJECT'
    case (cl_invalid_image_format_descriptor); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR'
    case (cl_invalid_image_size); errcode = 'CL_INVALID_IMAGE_SIZE'
    case (cl_invalid_sampler); errcode = 'CL_INVALID_SAMPLER'
    case (cl_invalid_binary); errcode = 'CL_INVALID_BINARY'
    case (cl_invalid_build_options); errcode = 'CL_INVALID_BUILD_OPTIONS'
    case (cl_invalid_program); errcode = 'CL_INVALID_PROGRAM'
    case (cl_invalid_program_executable); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE'
    case (cl_invalid_kernel_name); errcode = 'CL_INVALID_KERNEL_NAME'
    case (cl_invalid_kernel_definition); errcode = 'CL_INVALID_KERNEL_DEFINITION'
    case (cl_invalid_kernel); errcode = 'CL_INVALID_KERNEL'
    case (cl_invalid_arg_index); errcode = 'CL_INVALID_ARG_INDEX'
    case (cl_invalid_arg_value); errcode = 'CL_INVALID_ARG_VALUE'
    case (cl_invalid_arg_size); errcode = 'CL_INVALID_ARG_SIZE'
    case (cl_invalid_kernel_args); errcode = 'CL_INVALID_KERNEL_ARGS'
    case (cl_invalid_work_dimension); errcode = 'CL_INVALID_WORK_DIMENSION'
    case (cl_invalid_work_group_size); errcode = 'CL_INVALID_WORK_GROUP_SIZE'
    case (cl_invalid_work_item_size); errcode = 'CL_INVALID_WORK_ITEM_SIZE'
    case (cl_invalid_global_offset); errcode = 'CL_INVALID_GLOBAL_OFFSET'
    case (cl_invalid_event_wait_list); errcode = 'CL_INVALID_EVENT_WAIT_LIST'
    case (cl_invalid_event); errcode = 'CL_INVALID_EVENT'
    case (cl_invalid_operation); errcode = 'CL_INVALID_OPERATION'
    case (cl_invalid_gl_object); errcode = 'CL_INVALID_GL_OBJECT'
    case (cl_invalid_buffer_size); errcode = 'CL_INVALID_BUFFER_SIZE'
    case (cl_invalid_mip_level); errcode = 'CL_INVALID_MIP_LEVEL'
    case (cl_invalid_global_work_size); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE'
    case (cl_platform_not_found_khr); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'

      write(errcode, '(i10)') ierr
      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'

    message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
    call messages_fatal(1)

    pop_sub(opencl_print_error)
  end subroutine opencl_print_error
    integer, intent(in) :: ierr
    character(len=*), intent(in) :: name

    character(len=40) :: errcode

#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
    case (clblassuccess); errcode = 'clblasSuccess'
    case (clblasinvalidvalue); errcode = 'clblasInvalidValue'
    case (clblasinvalidcommandqueue); errcode = 'clblasInvalidCommandQueue'
    case (clblasinvalidcontext); errcode = 'clblasInvalidContext'
    case (clblasinvalidmemobject); errcode = 'clblasInvalidMemObject'
    case (clblasinvaliddevice); errcode = 'clblasInvalidDevice'
    case (clblasinvalideventwaitlist); errcode = 'clblasInvalidEventWaitList'
    case (clblasoutofresources); errcode = 'clblasOutOfResources'
    case (clblasoutofhostmemory); errcode = 'clblasOutOfHostMemory'
    case (clblasinvalidoperation); errcode = 'clblasInvalidOperation'
    case (clblascompilernotavailable); errcode = 'clblasCompilerNotAvailable'
    case (clblasbuildprogramfailure); errcode = 'clblasBuildProgramFailure'
    case (clblasnotimplemented); errcode = 'clblasNotImplemented'
    case (clblasnotinitialized); errcode = 'clblasNotInitialized'
    case (clblasinvalidmata); errcode = 'clblasInvalidMatA'
    case (clblasinvalidmatb); errcode = 'clblasInvalidMatB'
    case (clblasinvalidmatc); errcode = 'clblasInvalidMatC'
    case (clblasinvalidvecx); errcode = 'clblasInvalidVecX'
    case (clblasinvalidvecy); errcode = 'clblasInvalidVecY'
    case (clblasinvaliddim); errcode = 'clblasInvalidDim'
    case (clblasinvalidleaddima); errcode = 'clblasInvalidLeadDimA'
    case (clblasinvalidleaddimb); errcode = 'clblasInvalidLeadDimB'
    case (clblasinvalidleaddimc); errcode = 'clblasInvalidLeadDimC'
    case (clblasinvalidincx); errcode = 'clblasInvalidIncX'
    case (clblasinvalidincy); errcode = 'clblasInvalidIncY'
    case (clblasinsufficientmemmata); errcode = 'clblasInsufficientMemMatA'
    case (clblasinsufficientmemmatb); errcode = 'clblasInsufficientMemMatB'
    case (clblasinsufficientmemmatc); errcode = 'clblasInsufficientMemMatC'
    case (clblasinsufficientmemvecx); errcode = 'clblasInsufficientMemVecX'
    case (clblasinsufficientmemvecy); errcode = 'clblasInsufficientMemVecY'

    case (clblastinsufficientmemorytemp); errcode = 'clblastInsufficientMemoryTemp'
    case (clblastinvalidbatchcount); errcode = 'clblastInvalidBatchCount'
    case (clblastinvalidoverridekernel); errcode = 'clblastInvalidOverrideKernel'
    case (clblastmissingoverrideparameter); errcode = 'clblastMissingOverrideParameter'
    case (clblastinvalidlocalmemusage); errcode = 'clblastInvalidLocalMemUsage'
    case (clblastnohalfprecision); errcode = 'clblastNoHalfPrecision'
    case (clblastnodoubleprecision); errcode = 'clblastNoDoublePrecision'
    case (clblastinvalidvectorscalar); errcode = 'clblastInvalidVectorScalar'
    case (clblastinsufficientmemoryscalar); errcode = 'clblastInsufficientMemoryScalar'
    case (clblastdatabaseerror); errcode = 'clblastDatabaseError'
    case (clblastunknownerror); errcode = 'clblastUnknownError'
    case (clblastunexpectederror); errcode = 'clblastUnexpectedError'

      write(errcode, '(i10)') ierr
      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'

    message(1) = 'Error in calling clblas routine '//trim(name)//' : '//trim(errcode)
    call messages_fatal(1)
    integer, intent(in) :: ierr
    character(len=*), intent(in) :: name

    character(len=40) :: errcode

    case (clfft_invalid_global_work_size); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
    case (clfft_invalid_mip_level); errcode = 'CLFFT_INVALID_MIP_LEVEL'
    case (clfft_invalid_buffer_size); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
    case (clfft_invalid_gl_object); errcode = 'CLFFT_INVALID_GL_OBJECT'
    case (clfft_invalid_operation); errcode = 'CLFFT_INVALID_OPERATION'
    case (clfft_invalid_event); errcode = 'CLFFT_INVALID_EVENT'
    case (clfft_invalid_event_wait_list); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
    case (clfft_invalid_global_offset); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
    case (clfft_invalid_work_item_size); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
    case (clfft_invalid_work_group_size); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
    case (clfft_invalid_work_dimension); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
    case (clfft_invalid_kernel_args); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
    case (clfft_invalid_arg_size); errcode = 'CLFFT_INVALID_ARG_SIZE'
    case (clfft_invalid_arg_value); errcode = 'CLFFT_INVALID_ARG_VALUE'
    case (clfft_invalid_arg_index); errcode = 'CLFFT_INVALID_ARG_INDEX'
    case (clfft_invalid_kernel); errcode = 'CLFFT_INVALID_KERNEL'
    case (clfft_invalid_kernel_definition); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
    case (clfft_invalid_kernel_name); errcode = 'CLFFT_INVALID_KERNEL_NAME'
    case (clfft_invalid_program_executable); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
    case (clfft_invalid_program); errcode = 'CLFFT_INVALID_PROGRAM'
    case (clfft_invalid_build_options); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
    case (clfft_invalid_binary); errcode = 'CLFFT_INVALID_BINARY'
    case (clfft_invalid_sampler); errcode = 'CLFFT_INVALID_SAMPLER'
    case (clfft_invalid_image_size); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
    case (clfft_invalid_image_format_descriptor); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
    case (clfft_invalid_mem_object); errcode = 'CLFFT_INVALID_MEM_OBJECT'
    case (clfft_invalid_host_ptr); errcode = 'CLFFT_INVALID_HOST_PTR'
    case (clfft_invalid_command_queue); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
    case (clfft_invalid_queue_properties); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
    case (clfft_invalid_context); errcode = 'CLFFT_INVALID_CONTEXT'
    case (clfft_invalid_device); errcode = 'CLFFT_INVALID_DEVICE'
    case (clfft_invalid_platform); errcode = 'CLFFT_INVALID_PLATFORM'
    case (clfft_invalid_device_type); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
    case (clfft_invalid_value); errcode = 'CLFFT_INVALID_VALUE'
    case (clfft_map_failure); errcode = 'CLFFT_MAP_FAILURE'
    case (clfft_build_program_failure); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
    case (clfft_image_format_not_supported); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
    case (clfft_image_format_mismatch); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
    case (clfft_mem_copy_overlap); errcode = 'CLFFT_MEM_COPY_OVERLAP'
    case (clfft_profiling_info_not_available); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
    case (clfft_out_of_host_memory); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
    case (clfft_out_of_resources); errcode = 'CLFFT_OUT_OF_RESOURCES'
    case (clfft_mem_object_allocation_failure); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
    case (clfft_compiler_not_available); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
    case (clfft_device_not_available); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
    case (clfft_device_not_found); errcode = 'CLFFT_DEVICE_NOT_FOUND'
    case (clfft_success); errcode = 'CLFFT_SUCCESS'
    case (clfft_bugcheck); errcode = 'CLFFT_BUGCHECK'
    case (clfft_notimplemented); errcode = 'CLFFT_NOTIMPLEMENTED'
    case (clfft_file_not_found); errcode = 'CLFFT_FILE_NOT_FOUND'
    case (clfft_file_create_failure); errcode = 'CLFFT_FILE_CREATE_FAILURE'
    case (clfft_version_mismatch); errcode = 'CLFFT_VERSION_MISMATCH'
    case (clfft_invalid_plan); errcode = 'CLFFT_INVALID_PLAN'
    case (clfft_device_no_double); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
    case (clfft_endstatus); errcode = 'CLFFT_ENDSTATUS'

      write(errcode, '(i10)') ierr
      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'

    message(1) = 'clfft '//trim(name)//' '//trim(errcode)
    call messages_fatal(1)
  logical function f90_cl_device_has_extension(device, extension) result(has)
    type(cl_device_id), intent(inout) :: device
    character(len=*), intent(in) :: extension

    integer :: cl_status
    character(len=2048) :: all_extensions

    call clgetdeviceinfo(device, cl_device_extensions, all_extensions, cl_status)

    has = index(all_extensions, extension) /= 0

  end function f90_cl_device_has_extension

    type(type_t), intent(in) :: type
    integer(int8), intent(in) :: val
    integer(int64), intent(in) :: nval
    integer(int64), optional, intent(in) :: offset
    logical, optional, intent(in) :: async

    integer(int64) :: nval_, offset_, type_size

    if (present(offset)) then
      if (offset > buffer%size) then
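
    ! Convert the element count and the optional offset from elements to bytes.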
    type_size = types_get_size(type)

    nval_ = nval*type_size

    if (present(offset)) offset_ = offset*type_size

    call clenqueuefillbuffer(accel%command_queue, buffer%mem, val, offset_, nval_, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clEnqueueFillBuffer")

    call cuda_mem_set_async(buffer%mem, val, nval_, offset_)

    if (.not. optional_default(async, .false.)) call accel_finish()

    type(type_t), intent(in) :: type
    integer(int64), intent(in) :: nval
    integer(int64), optional, intent(in) :: offset
    logical, optional, intent(in) :: async

    type(type_t), intent(in) :: type
    integer(int32), intent(in) :: nval
    integer(int32), optional, intent(in) :: offset
    logical, optional, intent(in) :: async

    if (present(offset)) then

    integer, parameter :: times = 10

    real(real64) :: time, stime
    real(real64) :: read_bw, write_bw
    real(real64), allocatable :: data(:)

    call messages_new_line()
    call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
    call messages_new_line()
    call messages_info()

    call messages_write(' Buffer size Read bw Write bw')
    call messages_new_line()
    call messages_write(' [MiB] [MiB/s] [MiB/s]')
    call messages_info()

    safe_allocate(data(1:size))

    stime = loct_clock()

    time = (loct_clock() - stime)/real(times, real64)

    write_bw = real(size, real64)*8.0_real64/time

    stime = loct_clock()

    time = (loct_clock() - stime)/real(times, real64)
    read_bw = real(size, real64)*8.0_real64/time

    call messages_write(size*8.0_real64/1024.0_real64**2)
    call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
    call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
    call messages_info()

    safe_deallocate_a(data)

    size = int(size*2.0)

    if (size > 50000000) exit
  logical pure function accel_use_shared_mem() result(use_shared_mem)

    use_shared_mem = accel%shared_mem

    call cuda_module_map_init(accel%module_map)

      if (.not. associated(head)) exit
      next_head => head%next

    call cuda_module_map_end(accel%module_map)

    character(len=*), intent(in) :: file_name
    character(len=*), intent(in) :: kernel_name
    character(len=*), optional, intent(in) :: flags

    type(cl_program) :: prog
    character(len=1000) :: all_flags

    call profiling_in("ACCEL_COMPILE", exclude = .true.)

    all_flags = '-I'//trim(conf%share)//'/opencl/'//" "//trim(accel%debug_flag)

    all_flags = trim(all_flags)//' -DSHARED_MEM'

    if (present(flags)) then
      all_flags = trim(all_flags)//' '//trim(flags)

    call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
      string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))

    call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
    call cuda_alloc_arg_array(this%arguments)

    this%cuda_shared_mem = 0

    call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
    call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
    call opencl_release_program(prog)

    this%initialized = .true.
    this%kernel_name = trim(kernel_name)

    call profiling_out("ACCEL_COMPILE")

    call cuda_free_arg_array(this%arguments)
    call cuda_release_kernel(this%cuda_kernel)

    call clreleasekernel(this%kernel, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "release_kernel")

    this%initialized = .false.

    character(len=*), intent(in) :: file_name
    character(len=*), intent(in) :: kernel_name
    character(len=*), optional, intent(in) :: flags

    if (.not. this%initialized) then

  integer(int64) pure function accel_global_memory_size() result(size)

    size = accel%global_memory_size

  integer(int64) pure function accel_local_memory_size() result(size)

    size = accel%local_memory_size

  integer pure function accel_max_size_per_dim(dim) result(size)
    integer, intent(in) :: dim

    if (dim == 1) size = 2**30

    if (dim == 1) size = 2**30

    integer, intent(in) :: stream_number

    call cuda_set_stream(accel%cuda_stream, stream_number)
    call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)

    integer, intent(inout) :: stream_number

    call cuda_get_stream(stream_number)

    call cuda_synchronize_all_streams()

    type(c_ptr), intent(in) :: buffer
    integer(int64), intent(in) :: offset
    type(c_ptr) :: buffer_offset

    call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)

    buffer_offset = buffer

    type(c_ptr), intent(in) :: buffer
    integer(int64), intent(in) :: offset
    type(c_ptr) :: buffer_offset
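
    ! Complex buffers: the offset is scaled by two relative to the real(real64) variant above.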
    call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)

    buffer_offset = buffer

    type(c_ptr), intent(in) :: buffer

    call cuda_clean_pointer(buffer)
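
  !> Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an
  !> optimization; the routine below returns the grid and thread-block sizes to use in that case.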
    integer(int64), intent(in) :: size
    integer(int64), intent(out) :: grid_size
    integer(int64), intent(out) :: thread_block_size

#ifdef __HIP_PLATFORM_AMD__
    thread_block_size = size

    grid_size = size * accel%warp_size
    thread_block_size = accel%warp_size
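
! accel_inc.F90 is included once per data type below; the preceding complex.F90, integer.F90,
! and integer8.F90 includes presumably define the type-dependent macros used by that template.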
#include "accel_inc.F90"

#include "complex.F90"
#include "accel_inc.F90"

#include "integer.F90"
#include "accel_inc.F90"

#include "integer8.F90"
#include "accel_inc.F90"