!! Octopus — accel.F90: accelerator (OpenCL/CUDA) support module.
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and Cuda support at the same time"
23#endif
24
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
26#define HAVE_ACCEL 1
27#endif
28
29module accel_oct_m
31#ifdef HAVE_OPENCL
32 use cl
33#endif
34#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
35 use clblas_oct_m
36#endif
37 use cuda_oct_m
38#ifdef HAVE_CLFFT
39 use clfft
40#endif
41 use debug_oct_m
42 use global_oct_m
43 use iso_c_binding, only: c_size_t
44 use, intrinsic :: iso_fortran_env
45 use loct_oct_m
46 use math_oct_m
48 use mpi_oct_m
50 use types_oct_m
51 use parser_oct_m
55 use string_oct_m
56
57 implicit none
58
59 private
60
61 public :: &
66 accel_t, &
69 accel_init, &
70 accel_end, &
102
103#ifdef HAVE_OPENCL
104 integer, public, parameter :: &
105 ACCEL_MEM_READ_ONLY = cl_mem_read_only, &
106 accel_mem_read_write = cl_mem_read_write, &
107 accel_mem_write_only = cl_mem_write_only
108#else
109 integer, public, parameter :: &
110 ACCEL_MEM_READ_ONLY = 0, &
113#endif
114
  !> Opaque wrapper around the device context: an OpenCL context, a CUDA
  !! context pointer, or a dummy integer when compiled without acceleration.
  type accel_context_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_context) :: cl_context    !< OpenCL context handle
#elif defined(HAVE_CUDA)
    type(c_ptr) :: cuda_context       !< CUDA context (C pointer held by the cuda interface)
#else
    integer :: dummy                  !< placeholder so the type is never empty
#endif
  end type accel_context_t
125
  !> Opaque wrapper around the selected accelerator device handle
  !! (OpenCL device id, CUDA device pointer, or a dummy placeholder).
  type accel_device_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_device_id) :: cl_device   !< OpenCL device id
#elif defined(HAVE_CUDA)
    type(c_ptr) :: cuda_device        !< CUDA device handle (C pointer)
#else
    integer :: dummy                  !< placeholder so the type is never empty
#endif
  end type accel_device_t
136
  !> Global accelerator state: context/device handles, hardware limits
  !! queried during accel_init, and run-time options parsed from the input.
  type accel_t
    ! Components are public by default
    type(accel_context_t) :: context         !< device context wrapper
    type(accel_device_t) :: device           !< selected device wrapper
#ifdef HAVE_OPENCL
    type(cl_command_queue) :: command_queue  !< queue used for all OpenCL operations
#endif
    type(c_ptr) :: cublas_handle             !< cuBLAS handle (CUDA builds)
    type(c_ptr) :: cuda_stream               !< CUDA stream used for all device work
    type(c_ptr) :: module_map
    integer :: max_workgroup_size            !< max threads per block / work-group size
    integer(int64) :: local_memory_size      !< local/shared memory available per block
    integer(int64) :: global_memory_size     !< total device memory
    logical :: enabled                       !< true when an accelerator is actually in use
    logical :: allow_CPU_only                !< input option AllowCPUonly
    logical :: shared_mem                    !< whether shared-memory kernels may be used
    logical :: cuda_mpi                      !< input option GPUAwareMPI
    integer :: warp_size                     !< device warp size (1 for OpenCL, see accel_init)
    integer(int64) :: initialize_buffers     !< input option InitializeGPUBuffers (no/yes/nan)
    character(len=32) :: debug_flag          !< "-g"/"-lineinfo"; presumably passed when building kernels - confirm
    integer(int64) :: max_block_dim(3)       !< per-dimension block/work-group size limits
    integer(int64) :: max_grid_dim(3)        !< per-dimension grid size limits
  end type accel_t
160
  !> A buffer residing in accelerator device memory.
  type accel_mem_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_mem) :: mem               !< OpenCL memory object
#else
    type(c_ptr) :: mem                !< device pointer (CUDA or dummy builds)
#endif
    integer(c_size_t) :: size = 0     !< buffer size (units not visible here - bytes or elements? confirm)
    type(type_t) :: type              !< element type stored in the buffer
    integer :: flags = 0              !< access flags (ACCEL_MEM_READ_ONLY etc.)
    logical :: allocated = .false.    !< true once the device allocation exists
  end type accel_mem_t
173
  !> A compiled device kernel; instances are chained through `next`
  !! (see the module variable `head`) so they can be tracked globally.
  type accel_kernel_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_kernel) :: kernel         !< OpenCL kernel object
#endif
#ifdef HAVE_CUDA
    type(c_ptr) :: cuda_kernel        !< CUDA kernel/function handle
    type(c_ptr) :: cuda_module        !< CUDA module the kernel was loaded from
    type(c_ptr) :: arguments          !< argument buffer used at launch
#endif
    integer(int64) :: cuda_shared_mem         !< dynamic shared memory requested at launch
    logical :: initialized = .false.          !< true once the kernel has been built
    type(accel_kernel_t), pointer :: next     !< next kernel in the global list
    integer :: arg_count                      !< number of arguments set so far
    character(len=128) :: kernel_name         !< name used to look up the kernel in its source
  end type accel_kernel_t
190
191 type(accel_t), public :: accel
192
193 ! Global variables defined on device
194 type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
195 type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
197 ! the kernels
198 type(accel_kernel_t), public, target, save :: kernel_vpsi
199 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
200 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
201 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
202 type(accel_kernel_t), public, target, save :: kernel_daxpy
203 type(accel_kernel_t), public, target, save :: kernel_zaxpy
204 type(accel_kernel_t), public, target, save :: kernel_copy
205 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
206 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
207 type(accel_kernel_t), public, target, save :: dpack
208 type(accel_kernel_t), public, target, save :: zpack
209 type(accel_kernel_t), public, target, save :: dunpack
210 type(accel_kernel_t), public, target, save :: zunpack
211 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
212 type(accel_kernel_t), public, target, save :: kernel_density_real
213 type(accel_kernel_t), public, target, save :: kernel_density_complex
214 type(accel_kernel_t), public, target, save :: kernel_density_spinors
215 type(accel_kernel_t), public, target, save :: kernel_phase
216 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
217 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
218 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
219 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
220 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
221 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
222 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
223 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
224 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
225 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
226 type(accel_kernel_t), public, target, save :: dzmul
227 type(accel_kernel_t), public, target, save :: zzmul
228
229 interface accel_padded_size
231 end interface accel_padded_size
239 end interface accel_kernel_run
259 end interface accel_write_buffer
260
270 end interface accel_read_buffer
272 interface accel_set_kernel_arg
273 module procedure &
298 module procedure &
306 module procedure &
312
314 integer, parameter :: &
315 OPENCL_GPU = -1, &
316 opencl_cpu = -2, &
318 opencl_default = -4
319
320
321 integer, parameter :: &
322 CL_PLAT_INVALID = -1, &
323 cl_plat_amd = -2, &
324 cl_plat_nvidia = -3, &
326 cl_plat_intel = -5
327
328 ! a "convenience" public variable
329 integer, public :: cl_status
330
331 integer :: buffer_alloc_count
332 integer(int64) :: allocated_mem
333 type(accel_kernel_t), pointer :: head
334 type(alloc_cache_t) :: memcache
335
336contains
337
  !> Returns .true. when an accelerator device is enabled for this run.
  !! Compiled without OpenCL/CUDA support this is a compile-time .false.,
  !! so accelerator code paths can be optimized away.
  pure logical function accel_is_enabled() result(enabled)
#ifdef HAVE_ACCEL
    enabled = accel%enabled
#else
    enabled = .false.
#endif
  end function accel_is_enabled
345
346 ! ------------------------------------------
347
  !> Returns whether execution may proceed on CPU only (input option
  !! AllowCPUonly). Without compiled-in accelerator support this is
  !! trivially .true.
  pure logical function accel_allow_cpu_only() result(allow)
#ifdef HAVE_ACCEL
    allow = accel%allow_CPU_only
#else
    allow = .true.
#endif
  end function accel_allow_cpu_only
355
356 ! ------------------------------------------
357
  !> Initialize the global accelerator state (module variable `accel`):
  !! parse the relevant input variables, select and set up the OpenCL or
  !! CUDA device, create the command queue/stream and BLAS handle, query
  !! device limits, build the commonly used kernels, and create a few
  !! global constant buffers on the device.
  subroutine accel_init(base_grp, namespace)
    type(mpi_grp_t), intent(inout) :: base_grp   !< MPI group used to distribute devices over ranks
    type(namespace_t), intent(in) :: namespace   !< namespace for input-variable parsing and messages

    logical :: disable, default, run_benchmark
    integer :: idevice, iplatform
#ifdef HAVE_OPENCL
    integer :: device_type
    integer :: cl_status, idev
    integer :: ndevices, ret_devices, nplatforms, iplat
    character(len=256) :: device_name
    type(cl_platform_id) :: platform_id
    type(cl_program) :: prog
    type(cl_platform_id), allocatable :: allplatforms(:)
    type(cl_device_id), allocatable :: alldevices(:)
    integer :: max_work_item_dimensions
    integer(int64), allocatable :: max_work_item_sizes(:)
#endif
#ifdef HAVE_CUDA
    integer :: dim
#ifdef HAVE_MPI
    character(len=256) :: sys_name
#endif
#endif

    push_sub(accel_init)

    buffer_alloc_count = 0

    !%Variable DisableAccel
    !%Type logical
    !%Default yes
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with OpenCL or CUDA support, it will
    !% try to initialize and use an accelerator device. By setting this
    !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
    !%End
    call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
    ! acceleration is enabled by default only when compiled with GPU support
#ifdef HAVE_ACCEL
    default = .false.
#else
    default = .true.
#endif
    call parse_variable(namespace, 'DisableAccel', default, disable)
    accel%enabled = .not. disable

#ifndef HAVE_ACCEL
    ! requesting acceleration without compiled-in support is a fatal error
    if (accel%enabled) then
      message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
      call messages_fatal(1)
    end if
#endif
    if (.not. accel_is_enabled()) then
      pop_sub(accel_init)
      return
    end if

    !%Variable AccelPlatform
    !%Type integer
    !%Default 0
    !%Section Execution::Accel
    !%Description
    !% This variable selects the OpenCL platform that Octopus will
    !% use. You can give an explicit platform number or use one of
    !% the options that select a particular vendor
    !% implementation. Platform 0 is used by default.
    !%
    !% This variable has no effect for CUDA.
    !%Option amd -2
    !% Use the AMD OpenCL platform.
    !%Option nvidia -3
    !% Use the Nvidia OpenCL platform.
    !%Option ati -4
    !% Use the ATI (old AMD) OpenCL platform.
    !%Option intel -5
    !% Use the Intel OpenCL platform.
    !%End
    call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
    call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')

    !%Variable AccelDevice
    !%Type integer
    !%Default gpu
    !%Section Execution::Accel
    !%Description
    !% This variable selects the OpenCL or CUDA accelerator device
    !% that Octopus will use. You can specify one of the options below
    !% or a numerical id to select a specific device.
    !%
    !% Values >= 0 select the device to be used. In case of MPI enabled runs
    !% devices are distributed in a round robin fashion, starting at this value.
    !%Option gpu -1
    !% If available, Octopus will use a GPU.
    !%Option cpu -2
    !% If available, Octopus will use a CPU (only for OpenCL).
    !%Option accelerator -3
    !% If available, Octopus will use an accelerator (only for OpenCL).
    !%Option accel_default -4
    !% Octopus will use the default device specified by the implementation.
    !% implementation.
    !%End
    call parse_variable(namespace, 'AccelDevice', opencl_gpu, idevice)

    call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')

    ! opencl_default (-4) is the most negative valid symbolic choice
    if (idevice < opencl_default) then
      call messages_write('Invalid AccelDevice')
      call messages_fatal()
    end if

    call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)

#ifdef HAVE_CUDA
    ! symbolic device choices all map to device 0 for CUDA; the actual
    ! round-robin distribution over ranks happens inside cuda_init
    if (idevice<0) idevice = 0
    call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
      idevice, base_grp%rank)
#ifdef HAVE_MPI
    ! report which rank got which device on which host
    call loct_sysname(sys_name)
    write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
      " on ", trim(sys_name)
    call messages_info(1, all_nodes = .true.)
#endif

    ! no shared mem support in our cuda interface (for the moment)
    ! NOTE(review): the assignment below enables shared_mem, contradicting
    ! the note above - confirm which statement is current.
    accel%shared_mem = .true.

    call cublas_init(accel%cublas_handle, accel%cuda_stream)
#endif

#ifdef HAVE_OPENCL
    call profiling_in('CL_INIT')

    ! enumerate the available OpenCL platforms
    call clgetplatformids(nplatforms, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")

    safe_allocate(allplatforms(1:nplatforms))

    call clgetplatformids(allplatforms, iplat, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")

    call messages_write('Info: Available CL platforms: ')
    call messages_write(nplatforms)
    call messages_info()

    ! list the platforms, resolving a vendor name (negative iplatform)
    ! to a concrete platform index on the way; the selected one is
    ! marked with '*' in the output
    do iplat = 1, nplatforms

      call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)

      if (iplatform < 0) then
        if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
      end if

      if (iplatform == iplat - 1) then
        call messages_write(' * Platform ')
      else
        call messages_write('   Platform ')
      end if

      call messages_write(iplat - 1)
      call messages_write(' : '//device_name)
      call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)
      call messages_write(' ('//trim(device_name)//')')
      call messages_info()
    end do

    call messages_info()

    if (iplatform >= nplatforms .or. iplatform < 0) then
      call messages_write('Requested CL platform does not exist')
      if (iplatform > 0) then
        call messages_write('(platform = ')
        call messages_write(iplatform)
        call messages_write(').')
      end if
      call messages_fatal()
    end if

    platform_id = allplatforms(iplatform + 1)

    safe_deallocate_a(allplatforms)

    call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)

    call messages_write('Info: Available CL devices: ')
    call messages_write(ndevices)
    call messages_info()

    safe_allocate(alldevices(1:ndevices))

    ! list all devices

    call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)

    do idev = 1, ndevices
      call messages_write('      Device ')
      call messages_write(idev - 1)
      call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)
      call messages_write(' : '//device_name)
      call messages_info()
    end do

    ! map the symbolic AccelDevice choice to an OpenCL device type
    select case (idevice)
    case (opencl_gpu)
      device_type = cl_device_type_gpu
    case (opencl_cpu)
      device_type = cl_device_type_cpu
    case (opencl_accelerator)
      device_type = cl_device_type_accelerator
    case (opencl_default)
      device_type = cl_device_type_default
    case default
      device_type = cl_device_type_all
    end select

    ! now get a list of the selected type
    call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)

    if (ret_devices < 1) then
      ! we didnt find a device of the selected type, we ask for the default device
      call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)

      if (ret_devices < 1) then
        ! if this does not work, we ask for all devices
        call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
      end if

      if (ret_devices < 1) then
        call messages_write('Cannot find an OpenCL device')
        call messages_fatal()
      end if
    end if

    ! the number of devices can be smaller
    ndevices = ret_devices

    if (idevice < 0) then
      if (base_grp%size > 1) then
        ! with MPI we have to select the device so multiple GPUs in one
        ! node are correctly distributed
        call select_device(idevice)
      else
        idevice = 0
      end if
    end if

    if (idevice >= ndevices) then
      call messages_write('Requested CL device does not exist (device = ')
      call messages_write(idevice)
      call messages_write(', platform = ')
      call messages_write(iplatform)
      call messages_write(').')
      call messages_fatal()
    end if

    accel%device%cl_device = alldevices(idevice + 1)

    ! create the context
    accel%context%cl_context = clcreatecontext(platform_id, accel%device%cl_device, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateContext")

    safe_deallocate_a(alldevices)

    accel%command_queue = clcreatecommandqueue(accel%context%cl_context, accel%device%cl_device, &
      cl_queue_profiling_enable, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateCommandQueue")

    call clgetdeviceinfo(accel%device%cl_device, cl_device_type, device_type, cl_status)

    ! only GPUs are assumed to benefit from shared-memory kernels
    select case (device_type)
    case (cl_device_type_gpu)
      accel%shared_mem = .true.
    case (cl_device_type_cpu, cl_device_type_accelerator)
      accel%shared_mem = .false.
    case default
      accel%shared_mem = .false.
    end select

#ifdef HAVE_CLBLAS
    call clblassetup(cl_status)
    if (cl_status /= clblassuccess) call clblas_print_error(cl_status, 'clblasSetup')
#endif

#ifdef HAVE_CLFFT
    call clfftsetup(cl_status)
    if (cl_status /= clfft_success) call clfft_print_error(cl_status, 'clfftSetup')
#endif

    call profiling_out('CL_INIT')
#endif

    ! Get some device information that we will need later

    ! total memory
#ifdef HAVE_OPENCL
    call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_size, accel%global_memory_size, cl_status)
    call clgetdeviceinfo(accel%device%cl_device, cl_device_local_mem_size, accel%local_memory_size, cl_status)
    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_group_size, accel%max_workgroup_size, cl_status)
    ! OpenCL exposes no warp-size query here, so use 1 as a safe value
    accel%warp_size = 1
    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_item_dimensions, max_work_item_dimensions, cl_status)
    if (max_work_item_dimensions < 3) then
      message(1) = "Octopus requires a device where CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS is at least 3."
      call messages_fatal(1, only_root_writes = .true., namespace=namespace)
    end if
    safe_allocate(max_work_item_sizes(1:max_work_item_dimensions))
    call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_item_sizes, max_work_item_sizes(1), cl_status)
    accel%max_block_dim(:) = max_work_item_sizes(1:3)
    safe_deallocate_a(max_work_item_sizes)
    ! In principle OpenCL does not set any limits on the global_work_size. It is
    ! only limited by the available resources. Therefore we use the default
    ! values for NVIDIA GPUs starting at CC 5.0. No idea whether these will work
    ! generically.
    ! https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features
    ! -and-technical-specifications-technical-specifications-per-compute-capability
    accel%max_grid_dim(1) = (2_int64)**31 - 1_int64
    accel%max_grid_dim(2) = 65536_int64
    accel%max_grid_dim(3) = 65536_int64
#endif
#ifdef HAVE_CUDA
    ! query memory sizes and launch-dimension limits from the driver
    call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
    call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
    call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
    call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
    call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
    accel%max_block_dim(1) = int(dim, int64)
    call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
    accel%max_block_dim(2) = int(dim, int64)
    call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
    accel%max_block_dim(3) = int(dim, int64)
    call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
    accel%max_grid_dim(1) = int(dim, int64)
    call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
    accel%max_grid_dim(2) = int(dim, int64)
    call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
    accel%max_grid_dim(3) = int(dim, int64)
#endif

    if (base_grp%is_root()) call device_info()

    ! initialize the cache used to speed up allocations
    ! (capped at a quarter of the device memory)
    call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))

    ! now initialize the kernels

#if defined(HAVE_HIP)
    accel%debug_flag = "-g"
#elif defined(HAVE_CUDA)
    accel%debug_flag = "-lineinfo"
#elif defined(HAVE_OPENCL)
    accel%debug_flag = "-g"
#endif

    call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
    call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
    call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
    call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
    call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", &
      flags = ' -DRTYPE_DOUBLE')
    call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", &
      flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", &
      flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", &
      flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp")
    call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp")
    call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
    call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
    call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
    call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
    call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
    call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cl', "copy_complex_to_real")
    call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cl', "copy_real_to_complex")
    call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
    call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
    call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
    call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
    call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
    call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
    call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
    call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")


    call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')

    ! Define global buffers
    ! (one-element constant buffers holding 0 and 1 for real and complex)
    if(.not. accel_buffer_is_allocated(zm_0_buffer)) then
      call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
      call accel_write_buffer(zm_0_buffer, m_z0)
    end if
    if(.not. accel_buffer_is_allocated(zm_1_buffer)) then
      call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
      call accel_write_buffer(zm_1_buffer, m_z1)
    end if
    if(.not. accel_buffer_is_allocated(dm_0_buffer)) then
      call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
      call accel_write_buffer(dm_0_buffer, m_zero)
    end if
    if(.not. accel_buffer_is_allocated(dm_1_buffer)) then
      call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
      call accel_write_buffer(dm_1_buffer, m_one)
    end if


    !%Variable AccelBenchmark
    !%Type logical
    !%Default no
    !%Section Execution::Accel
    !%Description
    !% If this variable is set to yes, Octopus will run some
    !% routines to benchmark the performance of the accelerator device.
    !%End
    call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)

    call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')

    if (run_benchmark) then
    end if

    !%Variable GPUAwareMPI
    !%Type logical
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with GPU support and MPI support and if the MPI
    !% implementation is GPU-aware (i.e., it supports communication using device pointers),
    !% this switch can be set to true to use the GPU-aware MPI features. The advantage
    !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
    !% going through the host memory.
    !% The default is false, except when the configure switch --enable-cudampi is set, in which
    !% case this variable is set to true.
    !%End
#ifdef HAVE_CUDA_MPI
    default = .true.
#else
    default = .false.
#endif
    call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
    if (accel%cuda_mpi) then
#ifndef HAVE_CUDA_MPI
      ! the user asked for it without detected support: warn but proceed
      call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
      call messages_warning()
#endif
      call messages_write("Using GPU-aware MPI.")
      call messages_info()
    end if


    !%Variable AllowCPUonly
    !%Type logical
    !%Section Execution::Accel
    !%Description
    !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
    !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
    !% code execution also in these cases.
    !%End
#if defined (HAVE_ACCEL)
    default = .false.
#else
    default = .true.
#endif
    call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)


    !%Variable InitializeGPUBuffers
    !%Type integer
    !%Default no
    !%Section Execution::Accel
    !%Description
    !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
    !%Option no 0
    !% Do not initialize GPU buffers.
    !%Option yes 1
    !% Initialize GPU buffers to zero.
    !%Option nan 2
    !% Initialize GPU buffers to nan.
    !%End
    call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
    if (.not. varinfo_valid_option('InitializeGPUBuffers', accel%initialize_buffers)) then
      call messages_input_error(namespace, 'InitializeGPUBuffers')
    end if


    call messages_print_with_emphasis(namespace=namespace)

    pop_sub(accel_init)

  contains

#if defined(HAVE_OPENCL)
    !> Distribute OpenCL devices over MPI ranks round-robin and report
    !! the resulting rank -> device mapping (uses host-associated
    !! base_grp, ndevices, alldevices and cl_status).
    subroutine select_device(idevice)
      integer, intent(inout) :: idevice
      integer :: irank
      character(len=256) :: device_name

      push_sub(accel_init.select_device)

      idevice = mod(base_grp%rank, ndevices)

      call base_grp%barrier()
      call messages_write('Info: CL device distribution:')
      call messages_info()
      ! ranks print one at a time, separated by barriers, to keep the output ordered
      do irank = 0, base_grp%size - 1
        if (irank == base_grp%rank) then
          call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)
          call messages_write('      MPI node ')
          call messages_write(base_grp%rank)
          call messages_write(' -> CL device ')
          call messages_write(idevice)
          call messages_write(' : '//device_name)
          call messages_info(all_nodes = .true.)
        end if
        call base_grp%barrier()
      end do

      pop_sub(accel_init.select_device)
    end subroutine select_device
#endif

    !> Print a human-readable summary of the selected device: framework,
    !! vendor, name, driver version, memory sizes and capability limits.
    subroutine device_info()
#ifdef HAVE_OPENCL
      integer(int64) :: val
#endif
#ifdef HAVE_CUDA
      integer :: version
      character(kind=c_char) :: cval_str(257)
#endif
      integer :: major, minor
      character(len=256) :: val_str

      push_sub(accel_init.device_info)

      call messages_new_line()
      call messages_write('Selected device:')
      call messages_new_line()

#ifdef HAVE_OPENCL
      call messages_write('      Framework              : OpenCL')
#endif
#ifdef HAVE_CUDA
#ifdef __HIP_PLATFORM_AMD__
      call messages_write('      Framework              : ROCm')
#else
      call messages_write('      Framework              : CUDA')
#endif
#endif
      call messages_info()

#ifdef HAVE_CUDA
      call messages_write('      Device type            : GPU', new_line = .true.)
#ifdef __HIP_PLATFORM_AMD__
      call messages_write('      Device vendor          : AMD Corporation', new_line = .true.)
#else
      call messages_write('      Device vendor          : NVIDIA Corporation', new_line = .true.)
#endif
#endif

#ifdef HAVE_OPENCL
      call clgetdeviceinfo(accel%device%cl_device, cl_device_type, val, cl_status)
      call messages_write('      Device type            :')
      select case (int(val, int32))
      case (cl_device_type_gpu)
        call messages_write(' GPU')
      case (cl_device_type_cpu)
        call messages_write(' CPU')
      case (cl_device_type_accelerator)
        call messages_write(' accelerator')
      end select
      call messages_new_line()

      call clgetdeviceinfo(accel%device%cl_device, cl_device_vendor, val_str, cl_status)
      call messages_write('      Device vendor          : '//trim(val_str))
      call messages_new_line()
#endif

#ifdef HAVE_OPENCL
      call clgetdeviceinfo(accel%device%cl_device, cl_device_name, val_str, cl_status)
#endif
#ifdef HAVE_CUDA
      ! the CUDA interface fills a C string; convert it to a Fortran string
      cval_str = c_null_char
      call cuda_device_name(accel%device%cuda_device, cval_str)
      call string_c_to_f(cval_str, val_str)
#endif
      call messages_write('      Device name            : '//trim(val_str))
      call messages_new_line()

#ifdef HAVE_CUDA
      call cuda_device_capability(accel%device%cuda_device, major, minor)
#endif
      call messages_write('      Cuda capabilities      :')
      call messages_write(major, fmt = '(i2)')
      call messages_write('.')
      call messages_write(minor, fmt = '(i1)')

      ! VERSION
#ifdef HAVE_OPENCL
      call clgetdeviceinfo(accel%device%cl_device, cl_driver_version, val_str, cl_status)
      call messages_write('      Driver version         : '//trim(val_str))
#endif
#ifdef HAVE_CUDA
      call cuda_driver_version(version)
      call messages_write('      Driver version         : ')
      call messages_write(version)
#endif
      call messages_new_line()


#ifdef HAVE_OPENCL
      call clgetdeviceinfo(accel%device%cl_device, cl_device_max_compute_units, val, cl_status)
      call messages_write('      Compute units          :')
      call messages_write(val)
      call messages_new_line()

      call clgetdeviceinfo(accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)
      call messages_write('      Clock frequency        :')
      call messages_write(val)
      call messages_write(' GHz')
      call messages_new_line()
#endif

      call messages_write('      Device memory          :')
      call messages_write(accel%global_memory_size, units=unit_megabytes)
      call messages_new_line()

      call messages_write('      Local/shared memory    :')
      call messages_write(accel%local_memory_size, units=unit_kilobytes)
      call messages_new_line()


#ifdef HAVE_OPENCL
      call clgetdeviceinfo(accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)
      call messages_write('      Max alloc size         :')
      call messages_write(val, units = unit_megabytes)
      call messages_new_line()

      call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)
      call messages_write('      Device cache           :')
      call messages_write(val, units = unit_kilobytes)
      call messages_new_line()

      call clgetdeviceinfo(accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)
      call messages_write('      Constant memory        :')
      call messages_write(val, units = unit_kilobytes)
      call messages_new_line()
#endif

      call messages_write('      Max. group/block size  :')
      call messages_write(accel%max_workgroup_size)
      call messages_new_line()


#ifdef HAVE_OPENCL
      call messages_write('      Extension cl_khr_fp64  :')
      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
      call messages_new_line()

      call messages_write('      Extension cl_amd_fp64  :')
      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
      call messages_new_line()

      call messages_write('      Extension cl_khr_int64_base_atomics :')
      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_int64_base_atomics"))
      call messages_new_line()

#endif

      call messages_info()


      pop_sub(accel_init.device_info)
    end subroutine device_info

  end subroutine accel_init
1038
1039 ! ------------------------------------------
1040#ifdef HAVE_OPENCL
1041 integer function get_platform_id(platform_name) result(platform_id)
1042 character(len=*), intent(in) :: platform_name
1043
1044 platform_id = cl_plat_invalid
1045 if (index(platform_name, 'AMD') > 0) platform_id = cl_plat_amd
1046 if (index(platform_name, 'ATI') > 0) platform_id = cl_plat_ati
1047 if (index(platform_name, 'NVIDIA') > 0) platform_id = cl_plat_nvidia
1048 if (index(platform_name, 'Intel') > 0) platform_id = cl_plat_intel
1049 end function get_platform_id
1050#endif
1051 ! ------------------------------------------
1052
  !> Finalize the accelerator subsystem.
  !!
  !! Drains and frees every block still held by the allocation cache,
  !! prints cache-usage statistics, shuts down the vendor BLAS/FFT
  !! libraries and finally releases the device context and command queue.
  !! Aborts with a fatal error if user buffers were never released.
  subroutine accel_end(namespace)
    type(namespace_t), intent(in) :: namespace

#ifdef HAVE_OPENCL
    integer :: ierr
#endif
    integer(int64) :: hits, misses
    real(real64) :: volume_hits, volume_misses
    logical :: found
    type(accel_mem_t) :: tmp

    push_sub(accel_end)

    if (accel_is_enabled()) then

      ! Release global buffers
      call accel_release_buffer(zm_0_buffer)
      call accel_release_buffer(zm_1_buffer)
      call accel_release_buffer(dm_0_buffer)
      call accel_release_buffer(dm_1_buffer)

      ! Empty the allocation cache: each cached block is handed back and
      ! freed directly through the vendor API.
      do
        call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
        if (.not. found) exit

#ifdef HAVE_OPENCL
        call clreleasememobject(tmp%mem, ierr)
        if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
#endif
#ifdef HAVE_CUDA
        call cuda_mem_free(tmp%mem)
#endif
      end do

      ! Retrieve the cache statistics before destroying the cache.
      call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)

      call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)

      call messages_new_line()
      call messages_write(' Number of allocations =')
      call messages_write(hits + misses, new_line = .true.)
      call messages_write(' Volume of allocations =')
      call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
        new_line = .true.)
      call messages_write(' Hit ratio =')
      ! Guard against division by zero when the cache was never used.
      if (hits + misses > 0) then
        call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
      else
        call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
      end if
      call messages_write('%', new_line = .true.)
      call messages_write(' Volume hit ratio =')
      if (volume_hits + volume_misses > 0) then
        call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
      else
        call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
      end if
      call messages_write('%')
      call messages_new_line()
      call messages_info()

      call messages_print_with_emphasis(namespace=namespace)
    end if

#ifdef HAVE_CLBLAS
    call clblasteardown()
#endif

#ifdef HAVE_CLFFT
    call clfftteardown()
#endif

    if (accel_is_enabled()) then
#ifdef HAVE_CUDA
      call cublas_end(accel%cublas_handle)
      if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
        call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
      end if
#endif

#ifdef HAVE_OPENCL
      call clreleasecommandqueue(accel%command_queue, ierr)

      if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseCommandQueue")
      call clreleasecontext(accel%context%cl_context, cl_status)
#endif

      ! Any buffer still accounted for at this point was leaked by a caller.
      if (buffer_alloc_count /= 0) then
        call messages_write('Accel:')
        call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
        call messages_write(' in ')
        call messages_write(buffer_alloc_count)
        call messages_write(' buffers were not deallocated.')
        call messages_fatal()
      end if

    end if

    pop_sub(accel_end)
  end subroutine accel_end
1155
1156 ! ------------------------------------------
1157
1158 integer(int64) function accel_padded_size_i8(nn) result(psize)
1159 integer(int64), intent(in) :: nn
1160
1161 integer(int64) :: modnn, bsize
1162
1163 psize = nn
1164
1165 if (accel_is_enabled()) then
1166
1167 bsize = accel_max_workgroup_size()
1168
1169 psize = nn
1170 modnn = mod(nn, bsize)
1171 if (modnn /= 0) psize = psize + bsize - modnn
1172
1173 end if
1174
1175 end function accel_padded_size_i8
1176
1177 ! ------------------------------------------
1178
1179 integer(int32) function accel_padded_size_i4(nn) result(psize)
1180 integer(int32), intent(in) :: nn
1181
1182 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
1183
1184 end function accel_padded_size_i4
1185
1186 ! ------------------------------------------
1187
1188 subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
1189 type(accel_mem_t), intent(inout) :: this
1190 integer, intent(in) :: flags
1191 type(type_t), intent(in) :: type
1192 integer, intent(in) :: size
1193 logical, optional, intent(in) :: set_zero
1194 logical, optional, intent(in) :: async
1195
1196 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero, async)
1197 end subroutine accel_create_buffer_4
1198
1199 ! ------------------------------------------
1200
  !> Allocate (or fetch from the allocation cache) a device buffer holding
  !! "size" elements of the given type, then initialize its contents
  !! according to the InitializeGPUBuffers policy or the explicit
  !! set_zero override.
  subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
    type(accel_mem_t), intent(inout) :: this
    integer, intent(in) :: flags              !< device access flags (ACCEL_MEM_*)
    type(type_t), intent(in) :: type          !< element type of the buffer
    integer(int64), intent(in) :: size        !< number of elements (not bytes)
    logical, optional, intent(in) :: set_zero !< override the global initialization policy
    logical, optional, intent(in) :: async    !< CUDA only: use the asynchronous allocator

    integer(int64) :: fsize
    logical :: found
    integer(int64) :: initialize_buffers
#ifdef HAVE_OPENCL
    integer :: ierr
#endif

    push_sub(accel_create_buffer_8)

    this%type = type
    this%size = size
    this%flags = flags
    ! fsize is the allocation size in bytes
    fsize = int(size, int64)*types_get_size(type)
    this%allocated = .true.

    if (fsize > 0) then

      ! Try to reuse a cached allocation of the same byte size first.
      call alloc_cache_get(memcache, fsize, found, this%mem)

      if (.not. found) then
#ifdef HAVE_OPENCL
        this%mem = clcreatebuffer(accel%context%cl_context, flags, fsize, ierr)
        if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateBuffer")
#endif
#ifdef HAVE_CUDA
        if(optional_default(async, .false.)) then
          call cuda_mem_alloc_async(this%mem, fsize)
        else
          call cuda_mem_alloc(this%mem, fsize)
        end if
#endif
      end if

      ! Accounting used by accel_end() to detect leaked buffers.
      buffer_alloc_count = buffer_alloc_count + 1
      allocated_mem = allocated_mem + fsize

    end if

    ! set_zero, when present, takes precedence over accel%initialize_buffers.
    if (present(set_zero)) then
      initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
    else
      initialize_buffers = accel%initialize_buffers
    end if
    select case (initialize_buffers)
    case (option__initializegpubuffers__yes)
      ! fill every byte with 0x00
      call accel_set_buffer_to(this, type, int(z'00', int8), size)
    case (option__initializegpubuffers__nan)
      ! fill every byte with 0xFF (a NaN pattern for floating-point data)
      call accel_set_buffer_to(this, type, int(z'FF', int8), size)
    end select

    pop_sub(accel_create_buffer_8)
  end subroutine accel_create_buffer_8
1261
1262 ! ------------------------------------------
1263
1264 subroutine accel_release_buffer(this, async)
1265 type(accel_mem_t), intent(inout) :: this
1266 logical, optional, intent(in) :: async
1267
1268#ifdef HAVE_OPENCL
1269 integer :: ierr
1270#endif
1271 logical :: put
1272 integer(int64) :: fsize
1273
1274 push_sub(accel_release_buffer)
1275
1276 if (this%size > 0) then
1277
1278 fsize = int(this%size, int64)*types_get_size(this%type)
1279
1280 call alloc_cache_put(memcache, fsize, this%mem, put)
1281
1282 if (.not. put) then
1283#ifdef HAVE_OPENCL
1284 call clreleasememobject(this%mem, ierr)
1285 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1286#endif
1287#ifdef HAVE_CUDA
1288 if (optional_default(async, .false.)) then
1289 call cuda_mem_free_async(this%mem)
1290 else
1291 call cuda_mem_free(this%mem)
1292 end if
1293#endif
1294 end if
1295
1296 buffer_alloc_count = buffer_alloc_count - 1
1297 allocated_mem = allocated_mem + fsize
1298
1299 end if
1300
1301 this%size = 0
1302 this%flags = 0
1304 this%allocated = .false.
1305
1306 pop_sub(accel_release_buffer)
1307 end subroutine accel_release_buffer
1308
1309 ! ------------------------------------------------------
1310
1311 ! Check if the temporary buffers are the right size, if not reallocate them
1312 subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
1313 type(accel_mem_t), intent(inout) :: buffer
1314 integer, intent(in) :: flags
1315 type(type_t), intent(in) :: type
1316 integer, intent(in) :: required_size
1317 logical, intent(in) :: set_zero
1318 logical, optional, intent(in) :: async
1319
1320 push_sub(accel_ensure_buffer_size)
1321
1322
1323 if (accel_buffer_is_allocated(buffer) .and. buffer%size < required_size) then
1324 call accel_release_buffer(buffer, async=optional_default(async, .false.))
1325 end if
1326
1327 if (.not. accel_buffer_is_allocated(buffer)) then
1328 call accel_create_buffer(buffer, flags, type, required_size, set_zero=set_zero, async=optional_default(async, .false.))
1329 end if
1330
1332 end subroutine accel_ensure_buffer_size
1333
1334 ! ------------------------------------------
1335
  !> Returns .true. while the descriptor holds a live device allocation
  !! (set by accel_create_buffer_8, cleared by accel_release_buffer).
  logical pure function accel_buffer_is_allocated(this) result(allocated)
    type(accel_mem_t), intent(in) :: this

    allocated = this%allocated
  end function accel_buffer_is_allocated
1341
1342 ! -----------------------------------------
1343
1344 subroutine accel_finish()
1345#ifdef HAVE_OPENCL
1346 integer :: ierr
1347#endif
1348
1349 ! no push_sub, called too frequently
1350
1351 if (accel_is_enabled()) then
1352#ifdef HAVE_OPENCL
1353 call clfinish(accel%command_queue, ierr)
1354 if (ierr /= cl_success) call opencl_print_error(ierr, 'clFinish')
1355#endif
1356#ifdef HAVE_CUDA
1358#endif
1359 end if
1360 end subroutine accel_finish
1361
1362 ! ------------------------------------------
1363
  !> Bind a device buffer as kernel argument number narg.
  !! The buffer must already be allocated; this only records/sets the
  !! argument, it does not launch anything.
  subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
    type(accel_kernel_t), intent(inout) :: kernel
    integer, intent(in) :: narg
    type(accel_mem_t), intent(in) :: buffer

#ifdef HAVE_OPENCL
    integer :: ierr
#endif

    assert(accel_buffer_is_allocated(buffer))

    ! no push_sub, called too frequently
#ifdef HAVE_OPENCL
    call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clSetKernelArg_buf")
#endif

#ifdef HAVE_CUDA
    ! CUDA has no per-argument API call; the pointer is stored in the
    ! kernel's argument list and used at launch time.
    call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
#endif

  end subroutine accel_set_kernel_arg_buffer
1386
1387 ! ------------------------------------------
1388
1389 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
1390 type(accel_kernel_t), intent(inout) :: kernel
1391 integer, intent(in) :: narg
1392 type(type_t), intent(in) :: type
1393 integer, intent(in) :: size
1394
1395#ifdef HAVE_OPENCL
1396 integer :: ierr
1397#endif
1398 integer(int64) :: size_in_bytes
1401
1402
1403 size_in_bytes = int(size, int64)*types_get_size(type)
1404
1405 if (size_in_bytes > accel%local_memory_size) then
1406 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " Kb"
1407 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " Kb"
1408 call messages_fatal(2)
1409 else if (size_in_bytes <= 0) then
1410 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
1411 call messages_fatal(1)
1412 end if
1413
1414#ifdef HAVE_CUDA
1415 kernel%cuda_shared_mem = size_in_bytes
1416#endif
1417
1418#ifdef HAVE_OPENCL
1419 call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
1420 if (ierr /= cl_success) call opencl_print_error(ierr, "set_kernel_arg_local")
1421#endif
1422
1425
1426 ! ------------------------------------------
1427
1428 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
1429 type(accel_kernel_t), intent(inout) :: kernel
1430 integer(int64), intent(in) :: globalsizes(:)
1431 integer(int64), intent(in) :: localsizes(:)
1432
1433 integer :: dim
1434#ifdef HAVE_OPENCL
1435 integer :: ierr
1436#endif
1437 integer(int64) :: gsizes(1:3)
1438 integer(int64) :: lsizes(1:3)
1439
1440 ! no push_sub, called too frequently
1442 ! cuda needs all dimensions
1443 gsizes = 1
1444 lsizes = 1
1445
1446 dim = ubound(globalsizes, dim=1)
1447
1448 assert(dim == ubound(localsizes, dim=1))
1449
1450 ! if one size is zero, there is nothing to do
1451 if (any(globalsizes == 0)) return
1452
1453 assert(all(localsizes > 0))
1454 assert(all(localsizes <= accel_max_workgroup_size()))
1455 assert(all(mod(globalsizes, localsizes) == 0))
1456
1457 gsizes(1:dim) = globalsizes(1:dim)
1458 lsizes(1:dim) = localsizes(1:dim)
1459
1460#ifdef HAVE_OPENCL
1461 call clenqueuendrangekernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1462 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1463#endif
1464
1465#ifdef HAVE_CUDA
1466 ! Maximum dimension of a block
1467 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
1468 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1469 message(2) = "The following conditions should be fulfilled:"
1470 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
1471 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
1472 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
1473 message(6) = "This is an internal error, please contact the developers."
1474 call messages_fatal(6)
1475 end if
1476
1477
1478 ! Maximum number of threads per block
1479 if (product(lsizes) > accel_max_workgroup_size()) then
1480 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1481 message(2) = "The following condition should be fulfilled:"
1482 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1483 message(4) = "This is an internal error, please contact the developers."
1485 end if
1486
1487 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1488
1489 ! Maximum dimensions of the grid of thread block
1490 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1491 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1492 message(2) = "The following conditions should be fulfilled:"
1493 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1494 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1495 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1496 message(6) = "This is an internal error, please contact the developers."
1497 call messages_fatal(6)
1498 end if
1500 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1501
1502 kernel%cuda_shared_mem = 0
1503#endif
1504
1505 end subroutine accel_kernel_run_8
1506
1507 ! -----------------------------------------------
1508
1509 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1510 type(accel_kernel_t), intent(inout) :: kernel
1511 integer, intent(in) :: globalsizes(:)
1512 integer, intent(in) :: localsizes(:)
1513
1514 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1515
1516 end subroutine accel_kernel_run_4
1517
1518 ! -----------------------------------------------
1519
  !> Largest work-group (OpenCL) / thread-block (CUDA) size supported by
  !! the selected device, as queried at accel_init time.
  integer pure function accel_max_workgroup_size() result(max_workgroup_size)
    max_workgroup_size = accel%max_workgroup_size
  end function accel_max_workgroup_size
1523
1524 ! -----------------------------------------------
1525
1526 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1527 type(accel_kernel_t), intent(inout) :: kernel
1528
1529#ifdef HAVE_OPENCL
1530 integer(int64) :: workgroup_size8
1531 integer :: ierr
1532#endif
1533#ifdef HAVE_CUDA
1534 integer :: max_workgroup_size
1535#endif
1536
1537 workgroup_size = 0
1538
1539#ifdef HAVE_OPENCL
1540 call clgetkernelworkgroupinfo(kernel%kernel, accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
1541 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1542 workgroup_size = workgroup_size8
1543#endif
1544
1545#ifdef HAVE_CUDA
1546 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1547 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1548 write(message(1), "(A, I5, A)") "A kernel can use only less threads per block (", workgroup_size, ")", &
1549 "than available on the device (", accel%max_workgroup_size, ")"
1550 call messages_info(1)
1551 end if
1552 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1553 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1554 workgroup_size = 256
1555 ! make sure we do not use more threads per block than available for this kernel
1556 workgroup_size = min(workgroup_size, max_workgroup_size)
1557#endif
1558
1559 end function accel_kernel_workgroup_size
1560
1561 ! -----------------------------------------------
1562
1563#ifdef HAVE_OPENCL
  !> Compile the OpenCL source file "filename" into a cl_program.
  !! The source is pulled in via an #include line, compiled with
  !! aggressive math optimizations plus any caller-supplied flags, and
  !! the build log is printed before aborting on a compilation failure.
  subroutine opencl_build_program(prog, filename, flags)
    type(cl_program), intent(inout) :: prog
    character(len=*), intent(in) :: filename          !< path of the .cl source file
    character(len=*), optional, intent(in) :: flags   !< extra compiler flags

    character(len = 1000) :: string
    character(len = 256) :: share_string
    integer :: ierr, ierrlog, iunit, irec, newlen

    push_sub(opencl_build_program)

    ! The program source is a single #include of the requested file.
    string = '#include "'//trim(filename)//'"'

    call messages_write("Building CL program '"//trim(filename)//"'.")
    call messages_info(debug_only=.true.)

    prog = clcreateprogramwithsource(accel%context%cl_context, trim(string), ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateProgramWithSource")

    ! build the compilation flags
    string='-w'
    ! full optimization
    string=trim(string)//' -cl-denorms-are-zero'
    ! The following flag gives an error with the Xeon Phi
    ! string=trim(string)//' -cl-strict-aliasing'
    string=trim(string)//' -cl-mad-enable'
    string=trim(string)//' -cl-unsafe-math-optimizations'
    string=trim(string)//' -cl-finite-math-only'
    string=trim(string)//' -cl-fast-relaxed-math'

    ! Include path for the OpenCL headers shipped with the code.
    share_string='-I'//trim(conf%share)//'/opencl/'

    ! Double precision is mandatory; enable whichever extension provides it.
    if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
      string = trim(string)//' -DEXT_KHR_FP64'
    else if (f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
      string = trim(string)//' -DEXT_AMD_FP64'
    else
      call messages_write('Octopus requires an OpenCL device with double-precision support.')
      call messages_fatal()
    end if

    if (accel_use_shared_mem()) then
      string = trim(string)//' -DSHARED_MEM'
    end if

    if (present(flags)) then
      string = trim(string)//' '//trim(flags)
    end if

    call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
    call messages_write(' '//trim(share_string)//"'.")
    call messages_info(debug_only=.true.)

    string = trim(string)//' '//trim(share_string)

    call clbuildprogram(prog, trim(string), ierr)

    ! On failure, retrieve and print the compiler's build log before aborting.
    if(ierr /= cl_success) then
      call clgetprogrambuildinfo(prog, accel%device%cl_device, cl_program_build_log, string, ierrlog)
      if (ierrlog /= cl_success) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")

      ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
      newlen = scan(string, achar(010), back = .true.) - 1
      if (newlen >= 0) string = string(1:newlen)

      if (len(trim(string)) > 0) write(stderr, '(a)') trim(string)

      call opencl_print_error(ierr, "clBuildProgram")
    end if

    pop_sub(opencl_build_program)
  end subroutine opencl_build_program
1637
1638 ! -----------------------------------------------
1639#ifdef HAVE_OPENCL
1640 subroutine opencl_release_program(prog)
1641 type(cl_program), intent(inout) :: prog
1642
1643 integer :: ierr
1644
1645 push_sub(opencl_release_program)
1646
1647 call clreleaseprogram(prog, ierr)
1648 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseProgram")
1650 pop_sub(opencl_release_program)
1651 end subroutine opencl_release_program
1652#endif
1653
1654 ! -----------------------------------------------
1655
1656#ifdef HAVE_OPENCL
1657 subroutine opencl_release_kernel(prog)
1658 type(cl_kernel), intent(inout) :: prog
1659
1660 integer :: ierr
1661
1662 push_sub(opencl_release_kernel)
1663
1664#ifdef HAVE_OPENCL
1665 call clreleasekernel(prog, ierr)
1666 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseKernel")
1667#endif
1669 pop_sub(opencl_release_kernel)
1670 end subroutine opencl_release_kernel
1671#endif
1672
1673#ifdef HAVE_OPENCL
1674 ! -----------------------------------------------
1675 subroutine opencl_create_kernel(kernel, prog, name)
1676 type(cl_kernel), intent(inout) :: kernel
1677 type(cl_program), intent(inout) :: prog
1678 character(len=*), intent(in) :: name
1679
1680 integer :: ierr
1681
1682 push_sub(opencl_create_kernel)
1683 call profiling_in("CL_BUILD_KERNEL", exclude = .true.)
1684
1685#ifdef HAVE_OPENCL
1686 kernel = clcreatekernel(prog, name, ierr)
1687 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateKernel")
1688#endif
1689
1690 call profiling_out("CL_BUILD_KERNEL")
1691 pop_sub(opencl_create_kernel)
1692 end subroutine opencl_create_kernel
1693#endif
1694
1695 ! ------------------------------------------------
1696#ifdef HAVE_OPENCL
  !> Translate an OpenCL status code into its symbolic name and abort
  !! with a fatal message identifying the failing call ("name").
  !! This subroutine does not return.
  subroutine opencl_print_error(ierr, name)
    integer, intent(in) :: ierr          !< OpenCL status code
    character(len=*), intent(in) :: name !< the API call that failed

    character(len=40) :: errcode

    push_sub(opencl_print_error)

    ! Map every standard OpenCL 1.x status code to its symbolic name.
    select case (ierr)
    case (cl_success); errcode = 'CL_SUCCESS '
    case (cl_device_not_found); errcode = 'CL_DEVICE_NOT_FOUND '
    case (cl_device_not_available); errcode = 'CL_DEVICE_NOT_AVAILABLE '
    case (cl_compiler_not_available); errcode = 'CL_COMPILER_NOT_AVAILABLE '
    case (cl_mem_object_allocation_failure); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
    case (cl_out_of_resources); errcode = 'CL_OUT_OF_RESOURCES '
    case (cl_out_of_host_memory); errcode = 'CL_OUT_OF_HOST_MEMORY '
    case (cl_profiling_info_not_available); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
    case (cl_mem_copy_overlap); errcode = 'CL_MEM_COPY_OVERLAP '
    case (cl_image_format_mismatch); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
    case (cl_image_format_not_supported); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
    case (cl_build_program_failure); errcode = 'CL_BUILD_PROGRAM_FAILURE '
    case (cl_map_failure); errcode = 'CL_MAP_FAILURE '
    case (cl_invalid_value); errcode = 'CL_INVALID_VALUE '
    case (cl_invalid_device_type); errcode = 'CL_INVALID_DEVICE_TYPE '
    case (cl_invalid_platform); errcode = 'CL_INVALID_PLATFORM '
    case (cl_invalid_device); errcode = 'CL_INVALID_DEVICE '
    case (cl_invalid_context); errcode = 'CL_INVALID_CONTEXT '
    case (cl_invalid_queue_properties); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
    case (cl_invalid_command_queue); errcode = 'CL_INVALID_COMMAND_QUEUE '
    case (cl_invalid_host_ptr); errcode = 'CL_INVALID_HOST_PTR '
    case (cl_invalid_mem_object); errcode = 'CL_INVALID_MEM_OBJECT '
    case (cl_invalid_image_format_descriptor); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
    case (cl_invalid_image_size); errcode = 'CL_INVALID_IMAGE_SIZE '
    case (cl_invalid_sampler); errcode = 'CL_INVALID_SAMPLER '
    case (cl_invalid_binary); errcode = 'CL_INVALID_BINARY '
    case (cl_invalid_build_options); errcode = 'CL_INVALID_BUILD_OPTIONS '
    case (cl_invalid_program); errcode = 'CL_INVALID_PROGRAM '
    case (cl_invalid_program_executable); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
    case (cl_invalid_kernel_name); errcode = 'CL_INVALID_KERNEL_NAME '
    case (cl_invalid_kernel_definition); errcode = 'CL_INVALID_KERNEL_DEFINITION '
    case (cl_invalid_kernel); errcode = 'CL_INVALID_KERNEL '
    case (cl_invalid_arg_index); errcode = 'CL_INVALID_ARG_INDEX '
    case (cl_invalid_arg_value); errcode = 'CL_INVALID_ARG_VALUE '
    case (cl_invalid_arg_size); errcode = 'CL_INVALID_ARG_SIZE '
    case (cl_invalid_kernel_args); errcode = 'CL_INVALID_KERNEL_ARGS '
    case (cl_invalid_work_dimension); errcode = 'CL_INVALID_WORK_DIMENSION '
    case (cl_invalid_work_group_size); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
    case (cl_invalid_work_item_size); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
    case (cl_invalid_global_offset); errcode = 'CL_INVALID_GLOBAL_OFFSET '
    case (cl_invalid_event_wait_list); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
    case (cl_invalid_event); errcode = 'CL_INVALID_EVENT '
    case (cl_invalid_operation); errcode = 'CL_INVALID_OPERATION '
    case (cl_invalid_gl_object); errcode = 'CL_INVALID_GL_OBJECT '
    case (cl_invalid_buffer_size); errcode = 'CL_INVALID_BUFFER_SIZE '
    case (cl_invalid_mip_level); errcode = 'CL_INVALID_MIP_LEVEL '
    case (cl_invalid_global_work_size); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
    case (cl_platform_not_found_khr); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
    case default
      ! Unknown code: embed the raw integer in the message.
      write(errcode, '(i10)') ierr
      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
    end select

    message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
    call messages_fatal(1)

    pop_sub(opencl_print_error)
  end subroutine opencl_print_error
1765
1766 ! ----------------------------------------------------
1767
1768 subroutine clblas_print_error(ierr, name)
1769 integer, intent(in) :: ierr
1770 character(len=*), intent(in) :: name
1771
1772 character(len=40) :: errcode
1773
1774 push_sub(clblas_print_error)
1775#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
1776 select case (ierr)
1777 case (clblassuccess); errcode = 'clblasSuccess'
1778 case (clblasinvalidvalue); errcode = 'clblasInvalidValue'
1779 case (clblasinvalidcommandqueue); errcode = 'clblasInvalidCommandQueue'
1780 case (clblasinvalidcontext); errcode = 'clblasInvalidContext'
1781 case (clblasinvalidmemobject); errcode = 'clblasInvalidMemObject'
1782 case (clblasinvaliddevice); errcode = 'clblasInvalidDevice'
1783 case (clblasinvalideventwaitlist); errcode = 'clblasInvalidEventWaitList'
1784 case (clblasoutofresources); errcode = 'clblasOutOfResources'
1785 case (clblasoutofhostmemory); errcode = 'clblasOutOfHostMemory'
1786 case (clblasinvalidoperation); errcode = 'clblasInvalidOperation'
1787 case (clblascompilernotavailable); errcode = 'clblasCompilerNotAvailable'
1788 case (clblasbuildprogramfailure); errcode = 'clblasBuildProgramFailure'
1789 case (clblasnotimplemented); errcode = 'clblasNotImplemented'
1790 case (clblasnotinitialized); errcode = 'clblasNotInitialized'
1791 case (clblasinvalidmata); errcode = 'clblasInvalidMatA'
1792 case (clblasinvalidmatb); errcode = 'clblasInvalidMatB'
1793 case (clblasinvalidmatc); errcode = 'clblasInvalidMatC'
1794 case (clblasinvalidvecx); errcode = 'clblasInvalidVecX'
1795 case (clblasinvalidvecy); errcode = 'clblasInvalidVecY'
1796 case (clblasinvaliddim); errcode = 'clblasInvalidDim'
1797 case (clblasinvalidleaddima); errcode = 'clblasInvalidLeadDimA'
1798 case (clblasinvalidleaddimb); errcode = 'clblasInvalidLeadDimB'
1799 case (clblasinvalidleaddimc); errcode = 'clblasInvalidLeadDimC'
1800 case (clblasinvalidincx); errcode = 'clblasInvalidIncX'
1801 case (clblasinvalidincy); errcode = 'clblasInvalidIncY'
1802 case (clblasinsufficientmemmata); errcode = 'clblasInsufficientMemMatA'
1803 case (clblasinsufficientmemmatb); errcode = 'clblasInsufficientMemMatB'
1804 case (clblasinsufficientmemmatc); errcode = 'clblasInsufficientMemMatC'
1805 case (clblasinsufficientmemvecx); errcode = 'clblasInsufficientMemVecX'
1806 case (clblasinsufficientmemvecy); errcode = 'clblasInsufficientMemVecY'
1807#ifdef HAVE_CLBLAST
1808 case (clblastinsufficientmemorytemp); errcode = 'clblastInsufficientMemoryTemp'
1809 case (clblastinvalidbatchcount); errcode = 'clblastInvalidBatchCount'
1810 case (clblastinvalidoverridekernel); errcode = 'clblastInvalidOverrideKernel'
1811 case (clblastmissingoverrideparameter); errcode = 'clblastMissingOverrideParameter'
1812 case (clblastinvalidlocalmemusage); errcode = 'clblastInvalidLocalMemUsage'
1813 case (clblastnohalfprecision); errcode = 'clblastNoHalfPrecision'
1814 case (clblastnodoubleprecision); errcode = 'clblastNoDoublePrecision'
1815 case (clblastinvalidvectorscalar); errcode = 'clblastInvalidVectorScalar'
1816 case (clblastinsufficientmemoryscalar); errcode = 'clblastInsufficientMemoryScalar'
1817 case (clblastdatabaseerror); errcode = 'clblastDatabaseError'
1818 case (clblastunknownerror); errcode = 'clblastUnknownError'
1819 case (clblastunexpectederror); errcode = 'clblastUnexpectedError'
1820#endif
1821
1822 case default
1823 write(errcode, '(i10)') ierr
1824 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1825 end select
1826#endif
1827
1828 message(1) = 'Error in calling clblas routine '//trim(name)//' : '//trim(errcode)
1829 call messages_fatal(1)
1830
1831 pop_sub(clblas_print_error)
1832 end subroutine clblas_print_error
1833
1834 ! ----------------------------------------------------
1835 subroutine clfft_print_error(ierr, name)
1836 integer, intent(in) :: ierr
1837 character(len=*), intent(in) :: name
1838
1839 character(len=40) :: errcode
1840
1841 push_sub(clfft_print_error)
1842#ifdef HAVE_CLFFT
1843 select case (ierr)
1844 case (clfft_invalid_global_work_size); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1845 case (clfft_invalid_mip_level); errcode = 'CLFFT_INVALID_MIP_LEVEL'
1846 case (clfft_invalid_buffer_size); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
1847 case (clfft_invalid_gl_object); errcode = 'CLFFT_INVALID_GL_OBJECT'
1848 case (clfft_invalid_operation); errcode = 'CLFFT_INVALID_OPERATION'
1849 case (clfft_invalid_event); errcode = 'CLFFT_INVALID_EVENT'
1850 case (clfft_invalid_event_wait_list); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
1851 case (clfft_invalid_global_offset); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
1852 case (clfft_invalid_work_item_size); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
1853 case (clfft_invalid_work_group_size); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
1854 case (clfft_invalid_work_dimension); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
1855 case (clfft_invalid_kernel_args); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
1856 case (clfft_invalid_arg_size); errcode = 'CLFFT_INVALID_ARG_SIZE'
1857 case (clfft_invalid_arg_value); errcode = 'CLFFT_INVALID_ARG_VALUE'
1858 case (clfft_invalid_arg_index); errcode = 'CLFFT_INVALID_ARG_INDEX'
1859 case (clfft_invalid_kernel); errcode = 'CLFFT_INVALID_KERNEL'
1860 case (clfft_invalid_kernel_definition); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
1861 case (clfft_invalid_kernel_name); errcode = 'CLFFT_INVALID_KERNEL_NAME'
1862 case (clfft_invalid_program_executable); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1863 case (clfft_invalid_program); errcode = 'CLFFT_INVALID_PROGRAM'
1864 case (clfft_invalid_build_options); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
1865 case (clfft_invalid_binary); errcode = 'CLFFT_INVALID_BINARY'
1866 case (clfft_invalid_sampler); errcode = 'CLFFT_INVALID_SAMPLER'
1867 case (clfft_invalid_image_size); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
1868 case (clfft_invalid_image_format_descriptor); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1869 case (clfft_invalid_mem_object); errcode = 'CLFFT_INVALID_MEM_OBJECT'
1870 case (clfft_invalid_host_ptr); errcode = 'CLFFT_INVALID_HOST_PTR'
1871 case (clfft_invalid_command_queue); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
1872 case (clfft_invalid_queue_properties); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
1873 case (clfft_invalid_context); errcode = 'CLFFT_INVALID_CONTEXT'
1874 case (clfft_invalid_device); errcode = 'CLFFT_INVALID_DEVICE'
1875 case (clfft_invalid_platform); errcode = 'CLFFT_INVALID_PLATFORM'
1876 case (clfft_invalid_device_type); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
1877 case (clfft_invalid_value); errcode = 'CLFFT_INVALID_VALUE'
1878 case (clfft_map_failure); errcode = 'CLFFT_MAP_FAILURE'
1879 case (clfft_build_program_failure); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
1880 case (clfft_image_format_not_supported); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1881 case (clfft_image_format_mismatch); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
1882 case (clfft_mem_copy_overlap); errcode = 'CLFFT_MEM_COPY_OVERLAP'
1883 case (clfft_profiling_info_not_available); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1884 case (clfft_out_of_host_memory); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
1885 case (clfft_out_of_resources); errcode = 'CLFFT_OUT_OF_RESOURCES'
1886 case (clfft_mem_object_allocation_failure); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1887 case (clfft_compiler_not_available); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
1888 case (clfft_device_not_available); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
1889 case (clfft_device_not_found); errcode = 'CLFFT_DEVICE_NOT_FOUND'
1890 case (clfft_success); errcode = 'CLFFT_SUCCESS'
1891 case (clfft_bugcheck); errcode = 'CLFFT_BUGCHECK'
1892 case (clfft_notimplemented); errcode = 'CLFFT_NOTIMPLEMENTED'
1893 case (clfft_file_not_found); errcode = 'CLFFT_FILE_NOT_FOUND'
1894 case (clfft_file_create_failure); errcode = 'CLFFT_FILE_CREATE_FAILURE'
1895 case (clfft_version_mismatch); errcode = 'CLFFT_VERSION_MISMATCH'
1896 case (clfft_invalid_plan); errcode = 'CLFFT_INVALID_PLAN'
1897 case (clfft_device_no_double); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
1898 case (clfft_endstatus); errcode = 'CLFFT_ENDSTATUS'
1899 case default
1900 write(errcode, '(i10)') ierr
1901 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1902 end select
1903#endif
1904
1905 message(1) = 'clfft '//trim(name)//' '//trim(errcode)
1906 call messages_fatal(1)
1907
1908 pop_sub(clfft_print_error)
1909 end subroutine clfft_print_error
1910
1911 ! ----------------------------------------------------
1912
1913#ifdef HAVE_OPENCL
1914 logical function f90_cl_device_has_extension(device, extension) result(has)
1915 type(cl_device_id), intent(inout) :: device
1916 character(len=*), intent(in) :: extension
1918 integer :: cl_status
1919 character(len=2048) :: all_extensions
1920
1921#ifdef HAVE_OPENCL
1922 call clgetdeviceinfo(device, cl_device_extensions, all_extensions, cl_status)
1923#endif
1924
1925 has = index(all_extensions, extension) /= 0
1926
1927 end function f90_cl_device_has_extension
1928#endif
1929
1930 ! ----------------------------------------------------
1931
  !> Fill @p nval elements of a device buffer with the byte value @p val.
  !!
  !! @p nval and @p offset are counted in elements of @p type and are
  !! converted to bytes internally. A zero @p nval, or an @p offset past
  !! the end of the buffer, makes the routine return silently. Unless
  !! @p async is present and true, the routine blocks until the device
  !! operation has completed.
  subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
    type(accel_mem_t), intent(inout) :: buffer
    type(type_t), intent(in) :: type
    integer(int8), intent(in) :: val
    integer(int64), intent(in) :: nval
    integer(int64), optional, intent(in) :: offset
    logical, optional, intent(in) :: async

#ifdef HAVE_OPENCL
    integer :: ierr
#endif
    integer(int64) :: nval_, offset_, type_size

    push_sub(accel_set_buffer_to)

    ! Nothing to do for an empty fill.
    if (nval == 0) then
      pop_sub(accel_set_buffer_to)
      return
    end if
    assert(nval > 0)

    if (present(offset)) then
      assert(offset >= 0)
      ! An offset beyond the buffer end is treated as a silent no-op.
      if(offset > buffer%size) then
        pop_sub(accel_set_buffer_to)
        return
      end if
    end if

    ! Convert element counts to byte counts.
    type_size = types_get_size(type)

    nval_ = nval*type_size

    offset_ = 0_int64
    if (present(offset)) offset_ = offset*type_size

#ifdef HAVE_OPENCL
    call clenqueuefillbuffer(accel%command_queue, buffer%mem, val, offset_, nval_, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "clEnqueueFillBuffer")
#else
    call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
#endif
    ! Synchronize unless the caller explicitly requested an async fill.
    if(.not. optional_default(async, .false.)) call accel_finish()

    pop_sub(accel_set_buffer_to)
  end subroutine accel_set_buffer_to
1978
1979 ! ----------------------------------------------------
1980
  !> Set @p nval elements of a device buffer (counted in units of
  !! @p type) to zero, optionally starting at element @p offset.
  !! Thin wrapper around accel_set_buffer_to with a zero byte pattern.
  subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
    type(accel_mem_t), intent(inout) :: buffer
    type(type_t), intent(in) :: type
    integer(int64), intent(in) :: nval
    integer(int64), optional, intent(in) :: offset
    logical, optional, intent(in) :: async


    ! An all-zero byte pattern zeroes every supported numeric type.
    call accel_set_buffer_to(buffer, type, int(z'00', int8), nval, offset, async)

  end subroutine accel_set_buffer_to_zero_i8
1994
1995 ! ----------------------------------------------------
1996
  !> 32-bit integer overload of accel_set_buffer_to_zero_i8: widens
  !! @p nval (and @p offset, if given) to int64 and forwards the call.
  subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
    type(accel_mem_t), intent(inout) :: buffer
    type(type_t), intent(in) :: type
    integer(int32), intent(in) :: nval
    integer(int32), optional, intent(in) :: offset
    logical, optional, intent(in) :: async


    ! int() cannot be applied to an absent optional argument,
    ! hence the explicit branch on present(offset).
    if (present(offset)) then
      call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
    else
      call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
    end if

  end subroutine accel_set_buffer_to_zero_i4
2014
2015 ! ----------------------------------------------------
2016
  !> Benchmark host<->device transfer bandwidth over a range of buffer
  !! sizes and print a table: buffer size [MiB], read and write
  !! bandwidth [MiB/s]. Sizes double each iteration up to ~50M elements.
  subroutine opencl_check_bandwidth()
    integer :: itime
    integer, parameter :: times = 10   ! repetitions averaged per size
    integer :: size
    real(real64) :: time, stime
    real(real64) :: read_bw, write_bw
    type(accel_mem_t) :: buff
    real(real64), allocatable :: data(:)

    call messages_new_line()
    call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
    call messages_new_line()
    call messages_info()

    call messages_write(' Buffer size Read bw Write bw')
    call messages_new_line()
    call messages_write(' [MiB] [MiB/s] [MiB/s]')
    call messages_info()

    size = 15000
    do
      safe_allocate(data(1:size))
      call accel_create_buffer(buff, accel_mem_read_write, type_float, size)

      ! Time host-to-device writes, synchronizing after every transfer.
      stime = loct_clock()
      do itime = 1, times
        call accel_write_buffer(buff, size, data)
        call accel_finish()
      end do
      time = (loct_clock() - stime)/real(times, real64)
      ! Factor 8: each real64 element is 8 bytes.
      write_bw = real(size, real64) *8.0_real64/time

      ! Time device-to-host reads.
      ! NOTE(review): here accel_finish is called once after the loop,
      ! unlike the write loop above which syncs per iteration — confirm
      ! this asymmetry is intentional for the two timings.
      stime = loct_clock()
      do itime = 1, times
        call accel_read_buffer(buff, size, data)
      end do
      call accel_finish()

      time = (loct_clock() - stime)/real(times, real64)
      read_bw = real(size, real64) *8.0_real64/time

      call messages_write(size*8.0_real64/1024.0_real64**2)
      call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
      call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
      call messages_info()


      safe_deallocate_a(data)

      ! Double the buffer size for the next round.
      size = int(size*2.0)

      if (size > 50000000) exit
    end do
  end subroutine opencl_check_bandwidth
2073
2074 ! ----------------------------------------------------
2075
2076 logical pure function accel_use_shared_mem() result(use_shared_mem)
2077
2078 use_shared_mem = accel%shared_mem
2079
2080 end function accel_use_shared_mem
2081
2082 !------------------------------------------------------------
2083
  !> Initialize the global linked list of lazily-built kernels and the
  !! CUDA module cache.
  subroutine accel_kernel_global_init()

    push_sub(accel_kernel_global_init)

    ! Start with an empty kernel list.
    nullify(head)

    call cuda_module_map_init(accel%module_map)

  end subroutine accel_kernel_global_init
2094
2095 !------------------------------------------------------------
2096
  !> Tear down the global kernel list and, when acceleration is enabled,
  !! release the CUDA module cache.
  subroutine accel_kernel_global_end()
    type(accel_kernel_t), pointer :: next_head

    push_sub(accel_kernel_global_end)

    ! Walk the linked list of registered kernels until it is empty.
    ! NOTE(review): per-node finalization is not visible in this view —
    ! verify each node is released (accel_kernel_end) before advancing.
    do
      if (.not. associated(head)) exit
      next_head => head%next
      head => next_head
    end do

    if (accel_is_enabled()) then
      call cuda_module_map_end(accel%module_map)
    end if

  end subroutine accel_kernel_global_end
2115
2116 !------------------------------------------------------------
2117
  !> Compile @p kernel_name from @p file_name for the active device
  !! (CUDA or OpenCL build path selected at compile time) and store the
  !! handles in @p this. Optional extra compiler @p flags are appended.
  subroutine accel_kernel_build(this, file_name, kernel_name, flags)
    type(accel_kernel_t), intent(inout) :: this
    character(len=*), intent(in) :: file_name
    character(len=*), intent(in) :: kernel_name
    character(len=*), optional, intent(in) :: flags

#ifdef HAVE_OPENCL
    type(cl_program) :: prog
#endif
#ifdef HAVE_CUDA
    character(len=1000) :: all_flags
#endif

    push_sub(accel_kernel_build)
    call profiling_in("ACCEL_COMPILE", exclude = .true.)

#ifdef HAVE_CUDA
    ! Kernel sources live in the shared opencl/ directory; add the
    ! configured debug flags as well.
    all_flags = '-I'//trim(conf%share)//'/opencl/'//" "//trim(accel%debug_flag)

    if (accel_use_shared_mem()) then
      all_flags = trim(all_flags)//' -DSHARED_MEM'
    end if

    if (present(flags)) then
      all_flags = trim(all_flags)//' '//trim(flags)
    end if

    ! The module map caches compiled modules across kernels.
    call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
      string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))

    call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
    call cuda_alloc_arg_array(this%arguments)

    this%cuda_shared_mem = 0
#endif

#ifdef HAVE_OPENCL
    call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
    call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
    ! The program object is no longer needed once the kernel is created.
    call opencl_release_program(prog)
#endif

    this%initialized = .true.
    this%kernel_name = trim(kernel_name)

    call profiling_out("ACCEL_COMPILE")

    pop_sub(accel_kernel_build)
  end subroutine accel_kernel_build
2168
2169 !------------------------------------------------------------
2170
  !> Release the device resources held by a kernel object and mark it
  !! uninitialized so it can be rebuilt by accel_kernel_start_call.
  subroutine accel_kernel_end(this)
    type(accel_kernel_t), intent(inout) :: this
#ifdef HAVE_OPENCL
    integer :: ierr
#endif


#ifdef HAVE_CUDA
    call cuda_free_arg_array(this%arguments)
    call cuda_release_kernel(this%cuda_kernel)
    ! modules are not released here, since they are not associated to a kernel
#endif

#ifdef HAVE_OPENCL
    call clreleasekernel(this%kernel, ierr)
    if (ierr /= cl_success) call opencl_print_error(ierr, "release_kernel")
#endif
    this%initialized = .false.

    pop_sub(accel_kernel_end)
  end subroutine accel_kernel_end
2193
2194 !------------------------------------------------------------
2195
  !> Lazily build a kernel on first use: if @p this is not yet
  !! initialized, compile it via accel_kernel_build and push it onto the
  !! global kernel list so accel_kernel_global_end can clean it up.
  subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
    type(accel_kernel_t), target, intent(inout) :: this
    character(len=*), intent(in) :: file_name
    character(len=*), intent(in) :: kernel_name
    character(len=*), optional, intent(in) :: flags

    push_sub(accel_kernel_start_call)

    if (.not. this%initialized) then
      call accel_kernel_build(this, file_name, kernel_name, flags)
      ! Prepend to the global list of built kernels.
      this%next => head
      head => this
    end if

  end subroutine accel_kernel_start_call
2212
2213 !--------------------------------------------------------------
2214
2215 integer(int64) pure function accel_global_memory_size() result(size)
2216
2217 size = accel%global_memory_size
2218
2219 end function accel_global_memory_size
2220
2221 !--------------------------------------------------------------
2222
2223 integer(int64) pure function accel_local_memory_size() result(size)
2224
2225 size = accel%local_memory_size
2226
2228
2229 !--------------------------------------------------------------
2230
2231 integer pure function accel_max_size_per_dim(dim) result(size)
2232 integer, intent(in) :: dim
2233
2234 size = 0
2235#ifdef HAVE_OPENCL
2236 size = 32768 ! Setting here arbitrarily higher dimensions to 32768, as 2**30 leads to a
2237 ! value of zero when multiplied by 2048 and converted to integer 4.
2238 if (dim == 1) size = 2**30
2239#endif
2240#ifdef HAVE_CUDA
2241 size = 32768
2242 if (dim == 1) size = 2**30
2243#endif
2244 end function accel_max_size_per_dim
2245
2246 ! ------------------------------------------------------
2247
  !> Select the CUDA stream identified by @p stream_number as current and
  !! point the cuBLAS handle at it. No-op when acceleration is disabled
  !! or CUDA support is not compiled in.
  subroutine accel_set_stream(stream_number)
    integer, intent(in) :: stream_number

    push_sub(accel_set_stream)

    if (accel_is_enabled()) then
#ifdef HAVE_CUDA
      call cuda_set_stream(accel%cuda_stream, stream_number)
      ! Keep cuBLAS operating on the same stream as the kernels.
      call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
#endif
    end if

    pop_sub(accel_set_stream)
  end subroutine accel_set_stream
2262
2263 ! ------------------------------------------------------
2264
  !> Retrieve the number of the currently selected CUDA stream.
  !! The argument is intent(inout) on purpose: when acceleration is
  !! disabled or CUDA is not compiled in, the caller's value is left
  !! untouched.
  subroutine accel_get_stream(stream_number)
    integer, intent(inout) :: stream_number

    push_sub(accel_get_stream)

    if (accel_is_enabled()) then
#ifdef HAVE_CUDA
      call cuda_get_stream(stream_number)
#endif
    end if

    pop_sub(accel_get_stream)
  end subroutine accel_get_stream
2278
2279 ! ------------------------------------------------------
2280
2283
2284 if (accel_is_enabled()) then
2285#ifdef HAVE_CUDA
2286 call cuda_synchronize_all_streams()
2287#endif
2288 end if
2289
2291 end subroutine accel_synchronize_all_streams
2292
2293 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2294 type(c_ptr), intent(in) :: buffer
2295 integer(int64), intent(in) :: offset
2296 type(c_ptr) :: buffer_offset
2297
2299#ifdef HAVE_CUDA
2300 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
2301#else
2302 ! this is needed to make the compiler happy for non-GPU compilations
2303 buffer_offset = buffer
2304#endif
2307
2308 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2309 type(c_ptr), intent(in) :: buffer
2310 integer(int64), intent(in) :: offset
2311 type(c_ptr) :: buffer_offset
2312
2314#ifdef HAVE_CUDA
2315 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
2316#else
2317 ! this is needed to make the compiler happy for non-GPU compilations
2318 buffer_offset = buffer
2319#endif
2322
  !> Release the device resources associated with a raw pointer obtained
  !! from the accel layer. No-op without CUDA support.
  subroutine accel_clean_pointer(buffer)
    type(c_ptr), intent(in) :: buffer

    push_sub(accel_clean_pointer)
#ifdef HAVE_CUDA
    call cuda_clean_pointer(buffer)
#endif
    pop_sub(accel_clean_pointer)
  end subroutine accel_clean_pointer
  !> Get unfolded size: some kernels (e.g. projectors) unfold the array
  !! across warps as an optimization. Compute the grid and thread-block
  !! sizes for that unfolded layout from the logical @p size.
  subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
    integer(int64), intent(in) :: size
    integer(int64), intent(out) :: grid_size
    integer(int64), intent(out) :: thread_block_size

    push_sub(accel_get_unfolded_size)
#ifdef __HIP_PLATFORM_AMD__
    ! not benefitial for AMD chips
    grid_size = size
    thread_block_size = size
#else
    ! One warp per logical element: the grid grows by the warp size and
    ! each thread block covers exactly one warp.
    grid_size = size * accel%warp_size
    thread_block_size = accel%warp_size
#endif
  end subroutine accel_get_unfolded_size
2352
2353#include "undef.F90"
2354#include "real.F90"
2355#include "accel_inc.F90"
2356
2357#include "undef.F90"
2358#include "complex.F90"
2359#include "accel_inc.F90"
2360
2361#include "undef.F90"
2362#include "integer.F90"
2363#include "accel_inc.F90"
2364
2365#include "undef.F90"
2366#include "integer8.F90"
2367#include "accel_inc.F90"
2368
2369end module accel_oct_m
2370
2371!! Local Variables:
2372!! mode: f90
2373!! coding: utf-8
2374!! End:
subroutine device_info()
Definition: accel.F90:688
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4345
integer, parameter opencl_accelerator
Definition: accel.F90:394
type(accel_kernel_t), target, save, public kernel_density_real
Definition: accel.F90:292
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2878
integer, parameter opencl_default
Definition: accel.F90:394
type(accel_kernel_t), target, save, public kernel_vpsi_complex
Definition: accel.F90:279
type(accel_kernel_t), target, save, public dkernel_batch_axpy
Definition: accel.F90:300
subroutine, public accel_clean_pointer(buffer)
Definition: accel.F90:1500
subroutine accel_kernel_global_end()
Definition: accel.F90:1325
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2422
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
Definition: accel.F90:1513
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4085
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3113
pure logical function, public accel_allow_cpu_only()
Definition: accel.F90:429
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2065
logical pure function, public accel_use_shared_mem()
Definition: accel.F90:1304
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2787
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2923
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3867
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1801
type(accel_kernel_t), target, save, public kernel_vpsi_spinors
Definition: accel.F90:280
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4232
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2551
subroutine zaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2320
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1887
type(accel_kernel_t), target, save, public kernel_ghost_reorder
Definition: accel.F90:291
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3622
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2609
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4256
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3245
type(accel_kernel_t), target, save, public zkernel_batch_axpy
Definition: accel.F90:301
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2640
integer, parameter cl_plat_nvidia
Definition: accel.F90:401
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3094
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2950
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3168
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1381
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3144
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3672
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3463
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2484
integer, parameter cl_plat_ati
Definition: accel.F90:401
subroutine, public accel_get_stream(stream_number)
Definition: accel.F90:1442
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
Definition: accel.F90:854
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2683
integer(int64) pure function, public accel_global_memory_size()
Definition: accel.F90:1400
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3817
type(accel_kernel_t), target, save, public zkernel_ax_function_py
Definition: accel.F90:303
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1868
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1669
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2770
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2178
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3362
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
Definition: accel.F90:1027
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2156
integer(int64) function accel_padded_size_i8(nn)
Definition: accel.F90:824
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3996
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1611
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3645
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2664
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3577
subroutine, public accel_finish()
Definition: accel.F90:990
subroutine opencl_check_bandwidth()
Definition: accel.F90:1245
subroutine accel_kernel_global_init()
Definition: accel.F90:1312
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2741
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2372
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3509
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4368
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
Definition: accel.F90:958
type(accel_kernel_t), target, save, public zzmul
Definition: accel.F90:307
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
Definition: accel.F90:1160
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4215
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1650
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1942
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2590
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1918
subroutine laccel_write_buffer_single(this, data, async)
Definition: accel.F90:3765
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2089
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3424
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2446
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2465
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
Definition: accel.F90:1209
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2855
logical pure function, public accel_buffer_is_allocated(this)
Definition: accel.F90:982
integer, parameter, public accel_mem_read_write
Definition: accel.F90:196
subroutine, public clfft_print_error(ierr, name)
Definition: accel.F90:1142
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2201
subroutine accel_kernel_end(this)
Definition: accel.F90:1366
type(accel_kernel_t), target, save, public dkernel_ax_function_py
Definition: accel.F90:302
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3968
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2811
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1700
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4147
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1470
subroutine iaccel_write_buffer_single(this, data, async)
Definition: accel.F90:3042
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3533
integer pure function, public accel_max_size_per_dim(dim)
Definition: accel.F90:1416
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3273
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1829
type(accel_kernel_t), target, save, public dzmul
Definition: accel.F90:306
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3405
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3891
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2391
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4109
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4323
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3555
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2523
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2048
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3312
subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
Definition: accel.F90:1057
type(accel_kernel_t), target, save, public kernel_vpsi_spinors_complex
Definition: accel.F90:281
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
Definition: accel.F90:1346
subroutine, public accel_init(base_grp, namespace)
Definition: accel.F90:439
subroutine, public accel_end(namespace)
Definition: accel.F90:747
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3778
subroutine, public accel_synchronize_all_streams()
Definition: accel.F90:1458
subroutine, public accel_set_stream(stream_number)
Definition: accel.F90:1425
subroutine, public accel_release_buffer(this, async)
Definition: accel.F90:920
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4054
type(accel_kernel_t), target, save, public zunpack
Definition: accel.F90:290
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2228
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4300
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3386
integer, parameter cl_plat_amd
Definition: accel.F90:401
integer(int32) function accel_padded_size_i4(nn)
Definition: accel.F90:845
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
Definition: accel.F90:1225
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3910
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3600
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3206
pure logical function, public accel_is_enabled()
Definition: accel.F90:419
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1724
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1762
integer, parameter cl_plat_intel
Definition: accel.F90:401
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3187
integer, parameter, public accel_mem_write_only
Definition: accel.F90:196
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2019
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4128
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1961
type(accel_kernel_t), target, save, public kernel_vpsi
Definition: accel.F90:278
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2133
subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
Definition: accel.F90:1093
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4395
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3331
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3836
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3929
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2833
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1485
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4186
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:1598
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1743
integer function, public accel_kernel_workgroup_size(kernel)
Definition: accel.F90:1110
integer, parameter opencl_cpu
Definition: accel.F90:394
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2333
subroutine, public clblas_print_error(ierr, name)
Definition: accel.F90:1127
type(accel_t), public accel
Definition: accel.F90:271
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4278
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3492
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1980
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3055
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2900
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
Definition: accel.F90:867
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4035
type(accel_kernel_t), target, save, public dunpack
Definition: accel.F90:289
integer(int64) pure function, public accel_local_memory_size()
Definition: accel.F90:1408
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:1010
integer pure function, public accel_max_workgroup_size()
Definition: accel.F90:1104
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2702
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2111
type(accel_kernel_t), pointer head
Definition: accel.F90:413
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
Definition: global.F90:191
complex(real64), parameter, public m_z0
Definition: global.F90:201
complex(real64), parameter, public m_z1
Definition: global.F90:202
real(real64), parameter, public m_one
Definition: global.F90:192
System information (time, memory, sysname)
Definition: loct.F90:117
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
Definition: loct.F90:258
subroutine, public loct_sysname(name)
Definition: loct.F90:332
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:898
character(len=512), private msg
Definition: messages.F90:167
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:525
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1023
subroutine, public messages_new_line()
Definition: messages.F90:1112
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410
subroutine, public messages_input_error(namespace, var, details, row, column)
Definition: messages.F90:691
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:594
subroutine, public profiling_out(label)
Increment out counter and sum up difference between entry and exit time.
Definition: profiling.F90:631
subroutine, public profiling_in(label, exclude)
Increment in counter and save entry time.
Definition: profiling.F90:554
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_cmplx
Definition: types.F90:136
integer pure function, public types_get_size(this)
Definition: types.F90:154
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)