Octopus
accel.F90
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and Cuda support at the same time"
23#endif
24
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
26#define HAVE_ACCEL 1
27#endif
28
29module accel_oct_m
30 use alloc_cache_oct_m
31#ifdef HAVE_OPENCL
32 use cl
33#endif
34#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
35 use clblas_oct_m
36#endif
37 use cuda_oct_m
38#ifdef HAVE_CLFFT
39 use clfft
40#endif
41 use debug_oct_m
42 use global_oct_m
43 use iso_c_binding, only: c_size_t
44 use, intrinsic :: iso_fortran_env
45 use loct_oct_m
46 use math_oct_m
47 use messages_oct_m
48 use mpi_oct_m
49 use namespace_oct_m
50 use types_oct_m
51 use parser_oct_m
52 use profiling_oct_m
53 use unit_system_oct_m
54
55 implicit none
56
57 private
58
59 public :: &
64 accel_t, &
67 accel_init, &
68 accel_end, &
100
101#ifdef HAVE_OPENCL
102 integer, public, parameter :: &
103 ACCEL_MEM_READ_ONLY = cl_mem_read_only, &
104 accel_mem_read_write = cl_mem_read_write, &
105 accel_mem_write_only = cl_mem_write_only
106#else
107 integer, public, parameter :: &
108 ACCEL_MEM_READ_ONLY = 0, &
109 accel_mem_read_write = 1, &
110 accel_mem_write_only = 2
111#endif
112
113 type accel_context_t
114 ! Components are public by default
115#ifdef HAVE_OPENCL
116 type(cl_context) :: cl_context
117#elif defined(HAVE_CUDA)
118 type(c_ptr) :: cuda_context
119#else
120 integer :: dummy
121#endif
122 end type accel_context_t
123
124 type accel_device_t
125 ! Components are public by default
126#ifdef HAVE_OPENCL
127 type(cl_device_id) :: cl_device
128#elif defined(HAVE_CUDA)
129 type(c_ptr) :: cuda_device
130#else
131 integer :: dummy
132#endif
133 end type accel_device_t
134
135 type accel_t
136 ! Components are public by default
137 type(accel_context_t) :: context
138 type(accel_device_t) :: device
139#ifdef HAVE_OPENCL
140 type(cl_command_queue) :: command_queue
141#endif
142 type(c_ptr) :: cublas_handle
143 type(c_ptr) :: cuda_stream
144 type(c_ptr) :: module_map
145 integer :: max_workgroup_size
146 integer(int64) :: local_memory_size
147 integer(int64) :: global_memory_size
148 logical :: enabled
149 logical :: allow_CPU_only
150 logical :: shared_mem
151 logical :: cuda_mpi
152 integer :: warp_size
153 logical :: initialize_buffers
154 character(len=32) :: debug_flag
155 integer(int64) :: max_block_dim(3)
156 integer(int64) :: max_grid_dim(3)
157 end type accel_t
158
159 type accel_mem_t
160 ! Components are public by default
161#ifdef HAVE_OPENCL
162 type(cl_mem) :: mem
163#else
164 type(c_ptr) :: mem
165#endif
166 integer(c_size_t) :: size = 0
167 type(type_t) :: type
168 integer :: flags = 0
169 logical :: allocated = .false.
170 end type accel_mem_t
171
172 type accel_kernel_t
173 ! Components are public by default
174#ifdef HAVE_OPENCL
175 type(cl_kernel) :: kernel
176#endif
177#ifdef HAVE_CUDA
178 type(c_ptr) :: cuda_kernel
179 type(c_ptr) :: cuda_module
180 type(c_ptr) :: arguments
181#endif
182 integer(int64) :: cuda_shared_mem
183 logical :: initialized = .false.
184 type(accel_kernel_t), pointer :: next
185 integer :: arg_count
186 character(len=128) :: kernel_name
188
189 type(accel_t), public :: accel
190
191 ! Global variables defined on device
192 type(accel_mem_t), public, save :: zm_0_buffer, zm_1_buffer
193 type(accel_mem_t), public, save :: dm_0_buffer, dm_1_buffer
194
195 ! the kernels
196 type(accel_kernel_t), public, target, save :: kernel_vpsi
197 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
198 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
199 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
200 type(accel_kernel_t), public, target, save :: kernel_daxpy
201 type(accel_kernel_t), public, target, save :: kernel_zaxpy
202 type(accel_kernel_t), public, target, save :: kernel_copy
203 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
204 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
205 type(accel_kernel_t), public, target, save :: dpack
206 type(accel_kernel_t), public, target, save :: zpack
207 type(accel_kernel_t), public, target, save :: dunpack
208 type(accel_kernel_t), public, target, save :: zunpack
209 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
210 type(accel_kernel_t), public, target, save :: kernel_density_real
211 type(accel_kernel_t), public, target, save :: kernel_density_complex
212 type(accel_kernel_t), public, target, save :: kernel_density_spinors
213 type(accel_kernel_t), public, target, save :: kernel_phase
214 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
215 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
216 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
217 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
218 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
219 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
220 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
221 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
222 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
223 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
224 type(accel_kernel_t), public, target, save :: dzmul
225 type(accel_kernel_t), public, target, save :: zzmul
226 type(accel_kernel_t), public, target, save :: set_one
228 ! kernels used locally
229 type(accel_kernel_t), target, save :: set_zero
230 type(accel_kernel_t), target, save :: set_zero_int
232 interface accel_padded_size
233 module procedure accel_padded_size_i4, accel_padded_size_i8
234 end interface accel_padded_size
240 interface accel_kernel_run
241 module procedure accel_kernel_run_4, accel_kernel_run_8
242 end interface accel_kernel_run
243
246 end interface accel_set_buffer_to_zero
248 interface accel_write_buffer
262 end interface accel_write_buffer
273 end interface accel_read_buffer
275 interface accel_set_kernel_arg
276 module procedure &
301 module procedure &
309 module procedure &
316
317 integer, parameter :: &
318 OPENCL_GPU = -1, &
319 opencl_cpu = -2, &
320 opencl_accelerator = -3, &
321 opencl_default = -4
322
323
324 integer, parameter :: &
325 CL_PLAT_INVALID = -1, &
326 cl_plat_amd = -2, &
327 cl_plat_nvidia = -3, &
328 cl_plat_ati = -4, &
329 cl_plat_intel = -5
330
331 ! a "convenience" public variable
332 integer, public :: cl_status
334 integer :: buffer_alloc_count
335 integer(int64) :: allocated_mem
336 type(accel_kernel_t), pointer :: head
337 type(alloc_cache_t) :: memcache
338
339contains
340
341 pure logical function accel_is_enabled() result(enabled)
342#ifdef HAVE_ACCEL
343 enabled = accel%enabled
344#else
345 enabled = .false.
346#endif
347 end function accel_is_enabled
348
349 ! ------------------------------------------
350
351 pure logical function accel_allow_cpu_only() result(allow)
352#ifdef HAVE_ACCEL
353 allow = accel%allow_CPU_only
354#else
355 allow = .true.
356#endif
357 end function accel_allow_cpu_only
358
359 ! ------------------------------------------
361 subroutine accel_init(base_grp, namespace)
362 type(mpi_grp_t), intent(inout) :: base_grp
363 type(namespace_t), intent(in) :: namespace
364
365 logical :: disable, default, run_benchmark
366 integer :: idevice, iplatform
367#ifdef HAVE_OPENCL
368 integer :: device_type
369 integer :: cl_status, idev
370 integer :: ndevices, ret_devices, nplatforms, iplat
371 character(len=256) :: device_name
372 type(cl_platform_id) :: platform_id
373 type(cl_program) :: prog
374 type(cl_platform_id), allocatable :: allplatforms(:)
375 type(cl_device_id), allocatable :: alldevices(:)
376 integer :: max_work_item_dimensions
377 integer(int64), allocatable :: max_work_item_sizes(:)
378#endif
379#ifdef HAVE_CUDA
380 integer :: dim
381#ifdef HAVE_MPI
382 character(len=256) :: sys_name
383#endif
384#endif
386 push_sub(accel_init)
387
388 buffer_alloc_count = 0
389
390 !%Variable DisableAccel
391 !%Type logical
392 !%Default yes
393 !%Section Execution::Accel
394 !%Description
395 !% If Octopus was compiled with OpenCL or CUDA support, it will
396 !% try to initialize and use an accelerator device. By setting this
397 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even if it is available.
398 !%End
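  ! Illustrative usage (editorial note, not part of the original source): in an
  ! Octopus input file this would read, e.g.,
  !   DisableAccel = yes
  ! which forces a CPU-only run even when the binary was built with CUDA or OpenCL support.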
399 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
400#ifdef HAVE_ACCEL
401 default = .false.
402#else
403 default = .true.
404#endif
405 call parse_variable(namespace, 'DisableAccel', default, disable)
406 accel%enabled = .not. disable
407
408#ifndef HAVE_ACCEL
409 if (accel%enabled) then
410 message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
411 call messages_fatal(1)
412 end if
413#endif
414
415 if (.not. accel_is_enabled()) then
416 pop_sub(accel_init)
417 return
418 end if
420 !%Variable AccelPlatform
421 !%Type integer
422 !%Default 0
423 !%Section Execution::Accel
424 !%Description
425 !% This variable selects the OpenCL platform that Octopus will
426 !% use. You can give an explicit platform number or use one of
427 !% the options that select a particular vendor
428 !% implementation. Platform 0 is used by default.
429 !%
430 !% This variable has no effect for CUDA.
431 !%Option amd -2
432 !% Use the AMD OpenCL platform.
433 !%Option nvidia -3
434 !% Use the Nvidia OpenCL platform.
435 !%Option ati -4
436 !% Use the ATI (old AMD) OpenCL platform.
437 !%Option intel -5
438 !% Use the Intel OpenCL platform.
439 !%End
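  ! Illustrative usage (editorial note, assuming standard Octopus input syntax):
  !   AccelPlatform = nvidia
  ! selects the Nvidia OpenCL platform, while an explicit index such as
  !   AccelPlatform = 1
  ! selects the second platform in the list printed during initialization below.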
440 call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
441
442 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
443
444 !%Variable AccelDevice
445 !%Type integer
446 !%Default gpu
447 !%Section Execution::Accel
448 !%Description
449 !% This variable selects the OpenCL or CUDA accelerator device
450 !% that Octopus will use. You can specify one of the options below
451 !% or a numerical id to select a specific device.
452 !%
453 !% Values >= 0 select the device to be used. In the case of MPI-enabled runs,
454 !% devices are distributed in a round-robin fashion, starting at this value.
455 !%Option gpu -1
456 !% If available, Octopus will use a GPU.
457 !%Option cpu -2
458 !% If available, Octopus will use a CPU (only for OpenCL).
459 !%Option accelerator -3
460 !% If available, Octopus will use an accelerator (only for OpenCL).
461 !%Option accel_default -4
462 !% Octopus will use the default device specified by the
463 !% implementation.
464 !%End
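  ! Illustrative example (editorial note, not part of the original source): with
  ! AccelDevice = gpu and two GPUs per node, the round-robin distribution assigns
  ! MPI ranks 0 and 2 to device 0 and ranks 1 and 3 to device 1; in the OpenCL path
  ! this is done in select_device via mod(rank, ndevices), in the CUDA path the rank
  ! is passed to cuda_init for the same purpose.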
465 call parse_variable(namespace, 'AccelDevice', opencl_gpu, idevice)
466
467 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
468
469 if (idevice < opencl_default) then
470 call messages_write('Invalid AccelDevice')
471 call messages_fatal()
472 end if
473
474 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
475
476#ifdef HAVE_CUDA
477 if (idevice<0) idevice = 0
478 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
479 idevice, base_grp%rank)
480#ifdef HAVE_MPI
481 call loct_sysname(sys_name)
482 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
483 " on ", trim(sys_name)
484 call messages_info(1, all_nodes = .true.)
485#endif
486
487 ! shared memory is supported by our cuda interface
488 accel%shared_mem = .true.
489
490 call cublas_init(accel%cublas_handle, accel%cuda_stream)
491#endif
492
493#ifdef HAVE_OPENCL
494 call profiling_in('CL_INIT')
495
496 call clgetplatformids(nplatforms, cl_status)
497 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
498
499 safe_allocate(allplatforms(1:nplatforms))
500
501 call clgetplatformids(allplatforms, iplat, cl_status)
502 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
503
504 call messages_write('Info: Available CL platforms: ')
505 call messages_write(nplatforms)
506 call messages_info()
507
508 do iplat = 1, nplatforms
509
510 call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)
511
512 if (iplatform < 0) then
513 if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
514 end if
515
516 if (iplatform == iplat - 1) then
517 call messages_write(' * Platform ')
518 else
519 call messages_write(' Platform ')
520 end if
521
522 call messages_write(iplat - 1)
523 call messages_write(' : '//device_name)
524 call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)
525 call messages_write(' ('//trim(device_name)//')')
526 call messages_info()
527 end do
528
529 call messages_info()
530
531 if (iplatform >= nplatforms .or. iplatform < 0) then
532 call messages_write('Requested CL platform does not exist')
533 if (iplatform > 0) then
534 call messages_write('(platform = ')
535 call messages_write(iplatform)
536 call messages_write(').')
537 end if
538 call messages_fatal()
539 end if
540
541 platform_id = allplatforms(iplatform + 1)
542
543 safe_deallocate_a(allplatforms)
544
545 call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)
546
547 call messages_write('Info: Available CL devices: ')
548 call messages_write(ndevices)
549 call messages_info()
550
551 safe_allocate(alldevices(1:ndevices))
552
553 ! list all devices
554
555 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
556
557 do idev = 1, ndevices
558 call messages_write(' Device ')
559 call messages_write(idev - 1)
560 call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)
561 call messages_write(' : '//device_name)
562 call messages_info()
563 end do
564
565 select case (idevice)
566 case (opencl_gpu)
567 device_type = cl_device_type_gpu
568 case (opencl_cpu)
569 device_type = cl_device_type_cpu
570 case (opencl_accelerator)
571 device_type = cl_device_type_accelerator
572 case (opencl_default)
573 device_type = cl_device_type_default
574 case default
575 device_type = cl_device_type_all
576 end select
577
578 ! now get a list of the selected type
579 call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)
580
581 if (ret_devices < 1) then
582 ! we didn't find a device of the selected type, so we ask for the default device
583 call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)
584
585 if (ret_devices < 1) then
586 ! if this does not work, we ask for all devices
587 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
588 end if
589
590 if (ret_devices < 1) then
591 call messages_write('Cannot find an OpenCL device')
592 call messages_fatal()
593 end if
594 end if
595
596 ! the number of devices can be smaller
597 ndevices = ret_devices
598
599 if (idevice < 0) then
600 if (base_grp%size > 1) then
601 ! with MPI we have to select the device so multiple GPUs in one
602 ! node are correctly distributed
603 call select_device(idevice)
604 else
605 idevice = 0
606 end if
607 end if
608
609 if (idevice >= ndevices) then
610 call messages_write('Requested CL device does not exist (device = ')
611 call messages_write(idevice)
612 call messages_write(', platform = ')
613 call messages_write(iplatform)
614 call messages_write(').')
615 call messages_fatal()
616 end if
617
618 accel%device%cl_device = alldevices(idevice + 1)
619
620 ! create the context
621 accel%context%cl_context = clcreatecontext(platform_id, accel%device%cl_device, cl_status)
622 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateContext")
623
624 safe_deallocate_a(alldevices)
625
626 accel%command_queue = clcreatecommandqueue(accel%context%cl_context, accel%device%cl_device, &
627 cl_queue_profiling_enable, cl_status)
628 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateCommandQueue")
629
630 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, device_type, cl_status)
631
632 select case (device_type)
633 case (cl_device_type_gpu)
634 accel%shared_mem = .true.
635 case (cl_device_type_cpu, cl_device_type_accelerator)
636 accel%shared_mem = .false.
637 case default
638 accel%shared_mem = .false.
639 end select
640
641#ifdef HAVE_CLBLAS
642 call clblassetup(cl_status)
643 if (cl_status /= clblassuccess) call clblas_print_error(cl_status, 'clblasSetup')
644#endif
645
646#ifdef HAVE_CLFFT
647 call clfftsetup(cl_status)
648 if (cl_status /= clfft_success) call clfft_print_error(cl_status, 'clfftSetup')
649#endif
650
651 call profiling_out('CL_INIT')
652#endif
653
654 ! Get some device information that we will need later
655
656 ! total memory
657#ifdef HAVE_OPENCL
658 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_size, accel%global_memory_size, cl_status)
659 call clgetdeviceinfo(accel%device%cl_device, cl_device_local_mem_size, accel%local_memory_size, cl_status)
660 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_group_size, accel%max_workgroup_size, cl_status)
661 accel%warp_size = 1
662 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_item_dimensions, max_work_item_dimensions, cl_status)
663 if (max_work_item_dimensions < 3) then
664 message(1) = "Octopus requires a device where CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS is at least 3."
665 call messages_fatal(1, only_root_writes = .true., namespace=namespace)
666 end if
667 safe_allocate(max_work_item_sizes(1:max_work_item_dimensions))
668 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_item_sizes, max_work_item_sizes(1), cl_status)
669 accel%max_block_dim(:) = max_work_item_sizes(1:3)
670 safe_deallocate_a(max_work_item_sizes)
671 ! In principle OpenCL does not set any limits on the global_work_size. It is
672 ! only limited by the available resources. Therefore we use the default
673 ! values for NVIDIA GPUs starting at CC 5.0. No idea whether these will work
674 ! generically.
675 ! https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features
676 ! -and-technical-specifications-technical-specifications-per-compute-capability
677 accel%max_grid_dim(1) = (2_int64)**31 - 1_int64
678 accel%max_grid_dim(2) = 65536_int64
679 accel%max_grid_dim(3) = 65536_int64
680#endif
681#ifdef HAVE_CUDA
682 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
683 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
684 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
685 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
686 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
687 accel%max_block_dim(1) = int(dim, int64)
688 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
689 accel%max_block_dim(2) = int(dim, int64)
690 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
691 accel%max_block_dim(3) = int(dim, int64)
692 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
693 accel%max_grid_dim(1) = int(dim, int64)
694 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
695 accel%max_grid_dim(2) = int(dim, int64)
696 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
697 accel%max_grid_dim(3) = int(dim, int64)
698#endif
699
700 if (mpi_grp_is_root(base_grp)) call device_info()
701
702 ! initialize the cache used to speed up allocations
703 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
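  ! Editorial note (illustrative): the 0.25 factor above caps the allocation cache
  ! at one quarter of the device memory, e.g. roughly 4 GiB on a 16 GiB GPU, so
  ! released buffers of a matching size can be reused without a new device allocation.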
704
705 ! now initialize the kernels
707
708#if defined(HAVE_HIP)
709 accel%debug_flag = "-g"
710#elif defined(HAVE_CUDA)
711 accel%debug_flag = "-lineinfo"
712#elif defined(HAVE_OPENCL)
713 accel%debug_flag = "-g"
714#endif
715
716 call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")
717 call accel_kernel_start_call(set_zero_int, 'set_zero.cl', "set_zero_int")
718 call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
719 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
720 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
721 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
722 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
723 call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
724 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
725 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", &
726 flags = ' -DRTYPE_DOUBLE')
727 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", &
728 flags = '-DRTYPE_COMPLEX')
729 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", &
730 flags = '-DRTYPE_DOUBLE')
731 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", &
732 flags = '-DRTYPE_COMPLEX')
733 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp")
734 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp")
735 call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
736 call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
737 call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
738 call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
739 call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
740 call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cl', "copy_complex_to_real")
741 call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cl', "copy_real_to_complex")
742 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
743 call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
744 call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
745 call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
746 call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
747 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
748 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
749 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
750
751
752 call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
753 call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
754
755 ! Define global buffers
757 call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
759 end if
761 call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
763 end if
765 call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
767 end if
769 call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
771 end if
772
773
774 !%Variable AccelBenchmark
775 !%Type logical
776 !%Default no
777 !%Section Execution::Accel
778 !%Description
779 !% If this variable is set to yes, Octopus will run some
780 !% routines to benchmark the performance of the accelerator device.
781 !%End
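  ! Illustrative usage (editorial note): setting
  !   AccelBenchmark = yes
  ! in the input file runs the device benchmark routines at startup.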
782 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
783
784 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
785
786 if (run_benchmark) then
788 end if
789
790 !%Variable GPUAwareMPI
791 !%Type logical
792 !%Section Execution::Accel
793 !%Description
794 !% If Octopus was compiled with GPU support and MPI support and if the MPI
795 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
796 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
797 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
798 !% going through the host memory.
799 !% The default is false, except when the configure switch --enable-cudampi is set, in which
800 !% case this variable is set to true.
801 !%End
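  ! Illustrative usage (editorial note): with a GPU-aware MPI library, setting
  !   GPUAwareMPI = yes
  ! enables communication directly from device buffers; when configured with
  ! --enable-cudampi this is already the default, as described above.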
802#ifdef HAVE_CUDA_MPI
803 default = .true.
804#else
805 default = .false.
806#endif
807 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
808 if (accel%cuda_mpi) then
809#ifndef HAVE_CUDA_MPI
810 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
811 call messages_warning()
812#endif
813 call messages_write("Using GPU-aware MPI.")
814 call messages_info()
815 end if
816
817
818 !%Variable AllowCPUonly
819 !%Type logical
820 !%Section Execution::Accel
821 !%Description
822 !% In order to prevent a waste of resources, the code will normally stop when the GPU is disabled due to
823 !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows
824 !% code execution to continue in these cases.
825 !%End
826#if defined (HAVE_ACCEL)
827 default = .false.
828#else
829 default = .true.
830#endif
831 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
832
833
834 !%Variable InitializeGPUBuffers
835 !%Type logical
836 !%Section Execution::Accel
837 !%Description
838 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
839 !%End
840 call parse_variable(namespace, 'InitializeGPUBuffers', .false., accel%initialize_buffers)
841
842
843 call messages_print_with_emphasis(namespace=namespace)
844
845 pop_sub(accel_init)
846
847 contains
848
849#if defined(HAVE_OPENCL)
850 subroutine select_device(idevice)
851 integer, intent(inout) :: idevice
852 integer :: irank
853 character(len=256) :: device_name
854
855 push_sub(accel_init.select_device)
856
857 idevice = mod(base_grp%rank, ndevices)
858
859 call base_grp%barrier()
860 call messages_write('Info: CL device distribution:')
861 call messages_info()
862 do irank = 0, base_grp%size - 1
863 if (irank == base_grp%rank) then
864 call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)
865 call messages_write(' MPI node ')
866 call messages_write(base_grp%rank)
867 call messages_write(' -> CL device ')
868 call messages_write(idevice)
869 call messages_write(' : '//device_name)
870 call messages_info(all_nodes = .true.)
871 end if
872 call base_grp%barrier()
873 end do
874
875 pop_sub(accel_init.select_device)
876 end subroutine select_device
877#endif
878
879 subroutine device_info()
880#ifdef HAVE_OPENCL
881 integer(int64) :: val
882#endif
883#ifdef HAVE_CUDA
884 integer :: version
885#endif
886 integer :: major, minor
887 character(len=256) :: val_str
888
889 push_sub(accel_init.device_info)
890
891 call messages_new_line()
892 call messages_write('Selected device:')
893 call messages_new_line()
894
895#ifdef HAVE_OPENCL
896 call messages_write(' Framework : OpenCL')
897#endif
898#ifdef HAVE_CUDA
899#ifdef __HIP_PLATFORM_AMD__
900 call messages_write(' Framework : ROCm')
901#else
902 call messages_write(' Framework : CUDA')
903#endif
904#endif
905 call messages_info()
906
907#ifdef HAVE_CUDA
908 call messages_write(' Device type : GPU', new_line = .true.)
909#ifdef __HIP_PLATFORM_AMD__
910 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
911#else
912 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
913#endif
914#endif
915
916#ifdef HAVE_OPENCL
917 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, val, cl_status)
918 call messages_write(' Device type :')
919 select case (int(val, int32))
920 case (cl_device_type_gpu)
921 call messages_write(' GPU')
922 case (cl_device_type_cpu)
923 call messages_write(' CPU')
924 case (cl_device_type_accelerator)
925 call messages_write(' accelerator')
926 end select
927 call messages_new_line()
929 call clgetdeviceinfo(accel%device%cl_device, cl_device_vendor, val_str, cl_status)
930 call messages_write(' Device vendor : '//trim(val_str))
931 call messages_new_line()
932#endif
933
934#ifdef HAVE_OPENCL
935 call clgetdeviceinfo(accel%device%cl_device, cl_device_name, val_str, cl_status)
936#endif
937#ifdef HAVE_CUDA
938 call cuda_device_name(accel%device%cuda_device, val_str)
939#endif
940 call messages_write(' Device name : '//trim(val_str))
941 call messages_new_line()
942
943#ifdef HAVE_CUDA
944 call cuda_device_capability(accel%device%cuda_device, major, minor)
945 call messages_write(' Cuda capabilities :')
946 call messages_write(major, fmt = '(i2)')
947 call messages_write('.')
948 call messages_write(minor, fmt = '(i1)')
949 call messages_new_line()
950#endif
951
952 ! VERSION
953#ifdef HAVE_OPENCL
954 call clgetdeviceinfo(accel%device%cl_device, cl_driver_version, val_str, cl_status)
955 call messages_write(' Driver version : '//trim(val_str))
956#endif
957#ifdef HAVE_CUDA
958 call cuda_driver_version(version)
959 call messages_write(' Driver version : ')
960 call messages_write(version)
961#endif
962 call messages_new_line()
963
964
965#ifdef HAVE_OPENCL
966 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_compute_units, val, cl_status)
967 call messages_write(' Compute units :')
968 call messages_write(val)
969 call messages_new_line()
970
971 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)
972 call messages_write(' Clock frequency :')
973 call messages_write(val)
974 call messages_write(' MHz')
975 call messages_new_line()
976#endif
977
978 call messages_write(' Device memory :')
979 call messages_write(accel%global_memory_size, units=unit_megabytes)
980 call messages_new_line()
981
982 call messages_write(' Local/shared memory :')
983 call messages_write(accel%local_memory_size, units=unit_kilobytes)
984 call messages_new_line()
985
986
987#ifdef HAVE_OPENCL
988 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)
989 call messages_write(' Max alloc size :')
990 call messages_write(val, units = unit_megabytes)
991 call messages_new_line()
992
993 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)
994 call messages_write(' Device cache :')
995 call messages_write(val, units = unit_kilobytes)
996 call messages_new_line()
997
998 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)
999 call messages_write(' Constant memory :')
1000 call messages_write(val, units = unit_kilobytes)
1001 call messages_new_line()
1002#endif
1003
1004 call messages_write(' Max. group/block size :')
1005 call messages_write(accel%max_workgroup_size)
1006 call messages_new_line()
1007
1008
1009#ifdef HAVE_OPENCL
1010 call messages_write(' Extension cl_khr_fp64 :')
1011 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
1012 call messages_new_line()
1013
1014 call messages_write(' Extension cl_amd_fp64 :')
1015 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
1016 call messages_new_line()
1017
1018 call messages_write(' Extension cl_khr_int64_base_atomics :')
1019 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_int64_base_atomics"))
1020 call messages_new_line()
1021
1022#endif
1023
1024 call messages_info()
1025
1026
1027 pop_sub(accel_init.device_info)
1028 end subroutine device_info
1029
1030 end subroutine accel_init
1031
1032 ! ------------------------------------------
1033#ifdef HAVE_OPENCL
1034 integer function get_platform_id(platform_name) result(platform_id)
1035 character(len=*), intent(in) :: platform_name
1036
1037 platform_id = cl_plat_invalid
1038 if (index(platform_name, 'AMD') > 0) platform_id = cl_plat_amd
1039 if (index(platform_name, 'ATI') > 0) platform_id = cl_plat_ati
1040 if (index(platform_name, 'NVIDIA') > 0) platform_id = cl_plat_nvidia
1041 if (index(platform_name, 'Intel') > 0) platform_id = cl_plat_intel
1042 end function get_platform_id
1043#endif
1044 ! ------------------------------------------
1045
1046 subroutine accel_end(namespace)
1047 type(namespace_t), intent(in) :: namespace
1048
1049#ifdef HAVE_OPENCL
1050 integer :: ierr
1051#endif
1052 integer(int64) :: hits, misses
1053 real(real64) :: volume_hits, volume_misses
1054 logical :: found
1055 type(accel_mem_t) :: tmp
1056
1057 push_sub(accel_end)
1058
1059 if (accel_is_enabled()) then
1060
1061 ! Release global buffers
1062 call accel_release_buffer(zm_0_buffer)
1063 call accel_release_buffer(zm_1_buffer)
1064 call accel_release_buffer(dm_0_buffer)
1065 call accel_release_buffer(dm_1_buffer)
1066
1067 do
1068 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
1069 if (.not. found) exit
1070
1071#ifdef HAVE_OPENCL
1072 call clreleasememobject(tmp%mem, ierr)
1073 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1074#endif
1075#ifdef HAVE_CUDA
1076 call cuda_mem_free(tmp%mem)
1077#endif
1078 end do
1079
1080 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
1081
1082 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
1083
1084 call messages_new_line()
1085 call messages_write(' Number of allocations =')
1086 call messages_write(hits + misses, new_line = .true.)
1087 call messages_write(' Volume of allocations =')
1088 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
1089 new_line = .true.)
1090 call messages_write(' Hit ratio =')
1091 if (hits + misses > 0) then
1092 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
1093 else
1094 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1095 end if
1096 call messages_write('%', new_line = .true.)
1097 call messages_write(' Volume hit ratio =')
1098 if (volume_hits + volume_misses > 0) then
1099 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
1100 else
1101 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1102 end if
1103 call messages_write('%')
1104 call messages_new_line()
1105 call messages_info()
1106
1107 call messages_print_with_emphasis(namespace=namespace)
1108 end if
1109
1111
1112#ifdef HAVE_CLBLAS
1113 call clblasteardown()
1114#endif
1115
1116#ifdef HAVE_CLFFT
1117 call clfftteardown()
1118#endif
1119
1120 if (accel_is_enabled()) then
1121#ifdef HAVE_CUDA
1122 call cublas_end(accel%cublas_handle)
1123 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
1124 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
1125 end if
1126#endif
1127
1128#ifdef HAVE_OPENCL
1129 call clreleasecommandqueue(accel%command_queue, ierr)
1130
1131 if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseCommandQueue")
1132 call clreleasecontext(accel%context%cl_context, cl_status)
1133#endif
1134
1135 if (buffer_alloc_count /= 0) then
1136 call messages_write('Accel:')
1137 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
1138 call messages_write(' in ')
1139 call messages_write(buffer_alloc_count)
1140 call messages_write(' buffers were not deallocated.')
1141 call messages_fatal()
1142 end if
1143
1144 end if
1145
1146 pop_sub(accel_end)
1147 end subroutine accel_end
1148
1149 ! ------------------------------------------
1150
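 ! Worked example (editorial note, not part of the original source): this function
 ! rounds nn up to the next multiple of the maximum workgroup size. Assuming
 ! accel_max_workgroup_size() = 256 and nn = 1000, mod(1000, 256) = 232, so
 ! psize = 1000 + 256 - 232 = 1024.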
1151 integer(int64) function accel_padded_size_i8(nn) result(psize)
1152 integer(int64), intent(in) :: nn
1153
1154 integer(int64) :: modnn, bsize
1155
1156 psize = nn
1157
1158 if (accel_is_enabled()) then
1159
1160 bsize = accel_max_workgroup_size()
1161
1162 psize = nn
1163 modnn = mod(nn, bsize)
1164 if (modnn /= 0) psize = psize + bsize - modnn
1165
1166 end if
1167
1168 end function accel_padded_size_i8
1169
1170 ! ------------------------------------------
1171
1172 integer(int32) function accel_padded_size_i4(nn) result(psize)
1173 integer(int32), intent(in) :: nn
1174
1175 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
1176
1177 end function accel_padded_size_i4
1178
1179 ! ------------------------------------------
1180
1181 subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
1182 type(accel_mem_t), intent(inout) :: this
1183 integer, intent(in) :: flags
1184 type(type_t), intent(in) :: type
1185 integer, intent(in) :: size
1186 logical, optional, intent(in) :: set_zero
1187 logical, optional, intent(in) :: async
1188
1189 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero, async)
1190 end subroutine accel_create_buffer_4
1191
1192 ! ------------------------------------------
1193
1194 subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
1195 type(accel_mem_t), intent(inout) :: this
1196 integer, intent(in) :: flags
1197 type(type_t), intent(in) :: type
1198 integer(int64), intent(in) :: size
1199 logical, optional, intent(in) :: set_zero
1200 logical, optional, intent(in) :: async
1201
1202 integer(int64) :: fsize
1203 logical :: found
1204#ifdef HAVE_OPENCL
1205 integer :: ierr
1206#endif
1207
1208 push_sub(accel_create_buffer_8)
1209
1210 this%type = type
1211 this%size = size
1212 this%flags = flags
1213 fsize = int(size, int64)*types_get_size(type)
1214 this%allocated = .true.
1215
1216 if (fsize > 0) then
1217
1218 call alloc_cache_get(memcache, fsize, found, this%mem)
1219
1220 if (.not. found) then
1221#ifdef HAVE_OPENCL
1222 this%mem = clcreatebuffer(accel%context%cl_context, flags, fsize, ierr)
1223 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateBuffer")
1224#endif
1225#ifdef HAVE_CUDA
1226 if(optional_default(async, .false.)) then
1227 call cuda_mem_alloc_async(this%mem, fsize)
1228 else
1229 call cuda_mem_alloc(this%mem, fsize)
1230 end if
1231#endif
1232 end if
1233
1234 buffer_alloc_count = buffer_alloc_count + 1
1235 allocated_mem = allocated_mem + fsize
1236
1237 end if
1238
1239 if(optional_default(set_zero, accel%initialize_buffers)) then
1240 if(optional_default(async, .false.)) then
1241 call cuda_mem_set_async(this%mem, 0, fsize)
1242 else
1243 call accel_set_buffer_to_zero_i8(this, type, size)
1244 end if
1245 endif
1246
1247 pop_sub(accel_create_buffer_8)
1248 end subroutine accel_create_buffer_8
1249
1250 ! ------------------------------------------
1251
1252 subroutine accel_release_buffer(this, async)
1253 type(accel_mem_t), intent(inout) :: this
1254 logical, optional, intent(in) :: async
1255
1256#ifdef HAVE_OPENCL
1257 integer :: ierr
1258#endif
1259 logical :: put
1260 integer(int64) :: fsize
1261
1262 push_sub(accel_release_buffer)
1263
1264 if (this%size > 0) then
1265
1266 fsize = int(this%size, int64)*types_get_size(this%type)
1267
1268 call alloc_cache_put(memcache, fsize, this%mem, put)
1269
1270 if (.not. put) then
1271#ifdef HAVE_OPENCL
1272 call clreleasememobject(this%mem, ierr)
1273 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1274#endif
1275#ifdef HAVE_CUDA
1276 if (optional_default(async, .false.)) then
1277 call cuda_mem_free_async(this%mem)
1278 else
1279 call cuda_mem_free(this%mem)
1280 end if
1281#endif
1282 end if
1283
1284 buffer_alloc_count = buffer_alloc_count - 1
1285 allocated_mem = allocated_mem - fsize
1286
1287 end if
1288
1289 this%size = 0
1290 this%flags = 0
1291
1292 this%allocated = .false.
1293
1294 pop_sub(accel_release_buffer)
1295 end subroutine accel_release_buffer
1296
1297 ! ------------------------------------------------------
1298
1299 ! Check if the temporary buffer is the right size; if not, reallocate it
1300 subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
1301 type(accel_mem_t), intent(inout) :: buffer
1302 integer, intent(in) :: flags
1303 type(type_t), intent(in) :: type
1304 integer, intent(in) :: required_size
1305 logical, intent(in) :: set_zero
1306 logical, optional, intent(in) :: async
1307
1308 push_sub(accel_ensure_buffer_size)
1309
1310 if (buffer%size < required_size) then
1311 call accel_release_buffer(buffer, async=optional_default(async, .false.))
1312 call accel_create_buffer(buffer, flags, type, required_size, set_zero=set_zero, async=optional_default(async, .false.))
1313 end if
1314
1315 pop_sub(accel_ensure_buffer_size)
1316 end subroutine accel_ensure_buffer_size
1317
1318 ! ------------------------------------------
1319
1320 logical pure function accel_buffer_is_allocated(this) result(allocated)
1321 type(accel_mem_t), intent(in) :: this
1322
1323 allocated = this%allocated
1324 end function accel_buffer_is_allocated
1325
1326 ! -----------------------------------------
1327
1328 subroutine accel_finish()
1329#ifdef HAVE_OPENCL
1330 integer :: ierr
1331#endif
1332
1333 ! no push_sub, called too frequently
1334
1335 if (accel_is_enabled()) then
1336#ifdef HAVE_OPENCL
1337 call clfinish(accel%command_queue, ierr)
1338 if (ierr /= cl_success) call opencl_print_error(ierr, 'clFinish')
1339#endif
1340#ifdef HAVE_CUDA
1342#endif
1343 end if
1344 end subroutine accel_finish
1345
1346 ! ------------------------------------------
1347
1348 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1349 type(accel_kernel_t), intent(inout) :: kernel
1350 integer, intent(in) :: narg
1351 type(accel_mem_t), intent(in) :: buffer
1352
1353#ifdef HAVE_OPENCL
1354 integer :: ierr
1355#endif
1356
1357 assert(accel_buffer_is_allocated(buffer))
1358
1359 ! no push_sub, called too frequently
1360#ifdef HAVE_OPENCL
1361 call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
1362 if (ierr /= cl_success) call opencl_print_error(ierr, "clSetKernelArg_buf")
1363#endif
1364
1365#ifdef HAVE_CUDA
1366 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1367#endif
1368
1369 end subroutine accel_set_kernel_arg_buffer
1370
1371 ! ------------------------------------------
1372
1373 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
1374 type(accel_kernel_t), intent(inout) :: kernel
1375 integer, intent(in) :: narg
1376 type(type_t), intent(in) :: type
1377 integer, intent(in) :: size
1379#ifdef HAVE_OPENCL
1380 integer :: ierr
1381#endif
1382 integer(int64) :: size_in_bytes
1383
1385
1386
1387 size_in_bytes = int(size, int64)*types_get_size(type)
1388
1389 if (size_in_bytes > accel%local_memory_size) then
1390 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " Kb"
1391 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " Kb"
1392 call messages_fatal(2)
1393 else if (size_in_bytes <= 0) then
1394 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
1395 call messages_fatal(1)
1396 end if
1397
1398#ifdef HAVE_CUDA
1399 kernel%cuda_shared_mem = size_in_bytes
1400#endif
1401
1402#ifdef HAVE_OPENCL
1403 call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
1404 if (ierr /= cl_success) call opencl_print_error(ierr, "set_kernel_arg_local")
1405#endif
1408 end subroutine accel_set_kernel_arg_local
1409
1410 ! ------------------------------------------
1411
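 ! Illustrative example (editorial note, not part of the original source): for a 1D
 ! launch with globalsizes = [8192] and localsizes = [256], the assertions below
 ! require mod(8192, 256) == 0; under CUDA the block is (256,1,1) and, after dividing
 ! the global sizes by the local sizes, the grid is (32,1,1).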
1412 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
1413 type(accel_kernel_t), intent(inout) :: kernel
1414 integer(int64), intent(in) :: globalsizes(:)
1415 integer(int64), intent(in) :: localsizes(:)
1416
1417 integer :: dim
1418#ifdef HAVE_OPENCL
1419 integer :: ierr
1420#endif
1421 integer(int64) :: gsizes(1:3)
1422 integer(int64) :: lsizes(1:3)
1423
1424 ! no push_sub, called too frequently
1425
1426 ! cuda needs all dimensions
1427 gsizes = 1
1428 lsizes = 1
1429
1430 dim = ubound(globalsizes, dim=1)
1431
1432 assert(dim == ubound(localsizes, dim=1))
1433
1434 ! if one size is zero, there is nothing to do
1435 if (any(globalsizes == 0)) return
1436
1437 assert(all(localsizes > 0))
1438 assert(all(localsizes <= accel_max_workgroup_size()))
1439 assert(all(mod(globalsizes, localsizes) == 0))
1440
1441 gsizes(1:dim) = globalsizes(1:dim)
1442 lsizes(1:dim) = localsizes(1:dim)
1443
1444#ifdef HAVE_OPENCL
1445 call clenqueuendrangekernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1446 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1447#endif
1448
1449#ifdef HAVE_CUDA
1450 ! Maximum dimension of a block
1451 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
1452 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1453 message(2) = "The following conditions should be fulfilled:"
1454 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
1455 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
1456 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
1457 message(6) = "This is an internal error, please contact the developers."
1458 call messages_fatal(6)
1459 end if
1460
1461
1462 ! Maximum number of threads per block
1463 if (product(lsizes) > accel_max_workgroup_size()) then
1464 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1465 message(2) = "The following condition should be fulfilled:"
1466 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1467 message(4) = "This is an internal error, please contact the developers."
1468 call messages_fatal(4)
1469 end if
1470
1471 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1472
1473 ! Maximum dimensions of the grid of thread blocks
1474 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1475 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1476 message(2) = "The following conditions should be fulfilled:"
1477 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1478 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1479 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1480 message(6) = "This is an internal error, please contact the developers."
1481 call messages_fatal(6)
1482 end if
1483
1484 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1485
1486 kernel%cuda_shared_mem = 0
1487#endif
1488
1489 end subroutine accel_kernel_run_8
1491 ! -----------------------------------------------
1492
1493 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1494 type(accel_kernel_t), intent(inout) :: kernel
1495 integer, intent(in) :: globalsizes(:)
1496 integer, intent(in) :: localsizes(:)
1497
1498 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1499
1500 end subroutine accel_kernel_run_4
1501
1502 ! -----------------------------------------------
1503
1504 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1505 max_workgroup_size = accel%max_workgroup_size
1506 end function accel_max_workgroup_size
1507
1508 ! -----------------------------------------------
1509
1510 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1511 type(accel_kernel_t), intent(inout) :: kernel
1512
1513#ifdef HAVE_OPENCL
1514 integer(int64) :: workgroup_size8
1515 integer :: ierr
1516#endif
1517#ifdef HAVE_CUDA
1518 integer :: max_workgroup_size
1519#endif
1520
1521 workgroup_size = 0
1522
1523#ifdef HAVE_OPENCL
1524 call clgetkernelworkgroupinfo(kernel%kernel, accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
1525 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1526 workgroup_size = workgroup_size8
1527#endif
1528
1529#ifdef HAVE_CUDA
1530 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1531 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1532 write(message(1), "(A, I5, A, I5, A)") "This kernel can use fewer threads per block (", max_workgroup_size, &
1533 ") than available on the device (", accel%max_workgroup_size, ")"
1534 call messages_info(1)
1535 end if
1536 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1537 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1538 workgroup_size = 256
1539 ! make sure we do not use more threads per block than available for this kernel
1540 workgroup_size = min(workgroup_size, max_workgroup_size)
1541#endif
1542
1543 end function accel_kernel_workgroup_size
1544
1545 ! -----------------------------------------------
1546
1547#ifdef HAVE_OPENCL
1548 subroutine opencl_build_program(prog, filename, flags)
1549 type(cl_program), intent(inout) :: prog
1550 character(len=*), intent(in) :: filename
1551 character(len=*), optional, intent(in) :: flags
1552
1553 character(len = 1000) :: string
1554 character(len = 256) :: share_string
1555 integer :: ierr, ierrlog, iunit, irec, newlen
1556
1557 push_sub(opencl_build_program)
1558
1559 string = '#include "'//trim(filename)//'"'
1560
1561 call messages_write("Building CL program '"//trim(filename)//"'.")
1562 call messages_info(debug_only=.true.)
1563
1564 prog = clcreateprogramwithsource(accel%context%cl_context, trim(string), ierr)
1565 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateProgramWithSource")
1566
1567 ! build the compilation flags
1568 string='-w'
1569 ! full optimization
1570 string=trim(string)//' -cl-denorms-are-zero'
1571 ! The following flag gives an error with the Xeon Phi
1572 ! string=trim(string)//' -cl-strict-aliasing'
1573 string=trim(string)//' -cl-mad-enable'
1574 string=trim(string)//' -cl-unsafe-math-optimizations'
1575 string=trim(string)//' -cl-finite-math-only'
1576 string=trim(string)//' -cl-fast-relaxed-math'
1577
1578 share_string='-I'//trim(conf%share)//'/opencl/'
1579
1580 if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
1581 string = trim(string)//' -DEXT_KHR_FP64'
1582 else if (f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
1583 string = trim(string)//' -DEXT_AMD_FP64'
1584 else
1585 call messages_write('Octopus requires an OpenCL device with double-precision support.')
1586 call messages_fatal()
1587 end if
1589 if (accel_use_shared_mem()) then
1590 string = trim(string)//' -DSHARED_MEM'
1591 end if
1592
1593 if (present(flags)) then
1594 string = trim(string)//' '//trim(flags)
1595 end if
1596
1597 call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
1598 call messages_write(' '//trim(share_string)//"'.")
1599 call messages_info(debug_only=.true.)
1600
1601 string = trim(string)//' '//trim(share_string)
1602
1603 call clbuildprogram(prog, trim(string), ierr)
1604
1605 if(ierr /= cl_success) then
1606 call clgetprogrambuildinfo(prog, accel%device%cl_device, cl_program_build_log, string, ierrlog)
1607 if (ierrlog /= cl_success) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
1608
1609 ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
1610 newlen = scan(string, achar(010), back = .true.) - 1
1611 if (newlen >= 0) string = string(1:newlen)
1612
1613 if (len(trim(string)) > 0) write(stderr, '(a)') trim(string)
1614
1615 call opencl_print_error(ierr, "clBuildProgram")
1616 end if
1617
1618 pop_sub(opencl_build_program)
1619 end subroutine opencl_build_program
1620#endif
1621
1622 ! -----------------------------------------------
1623#ifdef HAVE_OPENCL
1624 subroutine opencl_release_program(prog)
1625 type(cl_program), intent(inout) :: prog
1627 integer :: ierr
1628
1629 push_sub(opencl_release_program)
1630
1631 call clreleaseprogram(prog, ierr)
1632 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseProgram")
1633
1634 pop_sub(opencl_release_program)
1635 end subroutine opencl_release_program
1636#endif
1637
1638 ! -----------------------------------------------
1639
1640#ifdef HAVE_OPENCL
1641 subroutine opencl_release_kernel(prog)
1642 type(cl_kernel), intent(inout) :: prog
1643
1644 integer :: ierr
1645
1646 push_sub(opencl_release_kernel)
1647
1648#ifdef HAVE_OPENCL
1649 call clreleasekernel(prog, ierr)
1650 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseKernel")
1651#endif
1652
1653 pop_sub(opencl_release_kernel)
1654 end subroutine opencl_release_kernel
1655#endif
1656
1657#ifdef HAVE_OPENCL
1658 ! -----------------------------------------------
1659 subroutine opencl_create_kernel(kernel, prog, name)
1660 type(cl_kernel), intent(inout) :: kernel
1661 type(cl_program), intent(inout) :: prog
1662 character(len=*), intent(in) :: name
1663
1664 integer :: ierr
1665
1666 push_sub(opencl_create_kernel)
1667 call profiling_in("CL_BUILD_KERNEL", exclude = .true.)
1668
1669#ifdef HAVE_OPENCL
1670 kernel = clcreatekernel(prog, name, ierr)
1671 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateKernel")
1672#endif
1673
1674 call profiling_out("CL_BUILD_KERNEL")
1675 pop_sub(opencl_create_kernel)
1676 end subroutine opencl_create_kernel
1677#endif
1678
1679 ! ------------------------------------------------
1680#ifdef HAVE_OPENCL
1681 subroutine opencl_print_error(ierr, name)
1682 integer, intent(in) :: ierr
1683 character(len=*), intent(in) :: name
1684
1685 character(len=40) :: errcode
1686
1687 push_sub(opencl_print_error)
1688
1689 select case (ierr)
1690 case (cl_success); errcode = 'CL_SUCCESS '
1691 case (cl_device_not_found); errcode = 'CL_DEVICE_NOT_FOUND '
1692 case (cl_device_not_available); errcode = 'CL_DEVICE_NOT_AVAILABLE '
1693 case (cl_compiler_not_available); errcode = 'CL_COMPILER_NOT_AVAILABLE '
1694 case (cl_mem_object_allocation_failure); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
1695 case (cl_out_of_resources); errcode = 'CL_OUT_OF_RESOURCES '
1696 case (cl_out_of_host_memory); errcode = 'CL_OUT_OF_HOST_MEMORY '
1697 case (cl_profiling_info_not_available); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
1698 case (cl_mem_copy_overlap); errcode = 'CL_MEM_COPY_OVERLAP '
1699 case (cl_image_format_mismatch); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
1700 case (cl_image_format_not_supported); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
1701 case (cl_build_program_failure); errcode = 'CL_BUILD_PROGRAM_FAILURE '
1702 case (cl_map_failure); errcode = 'CL_MAP_FAILURE '
1703 case (cl_invalid_value); errcode = 'CL_INVALID_VALUE '
1704 case (cl_invalid_device_type); errcode = 'CL_INVALID_DEVICE_TYPE '
1705 case (cl_invalid_platform); errcode = 'CL_INVALID_PLATFORM '
1706 case (cl_invalid_device); errcode = 'CL_INVALID_DEVICE '
1707 case (cl_invalid_context); errcode = 'CL_INVALID_CONTEXT '
1708 case (cl_invalid_queue_properties); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
1709 case (cl_invalid_command_queue); errcode = 'CL_INVALID_COMMAND_QUEUE '
1710 case (cl_invalid_host_ptr); errcode = 'CL_INVALID_HOST_PTR '
1711 case (cl_invalid_mem_object); errcode = 'CL_INVALID_MEM_OBJECT '
1712 case (cl_invalid_image_format_descriptor); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
1713 case (cl_invalid_image_size); errcode = 'CL_INVALID_IMAGE_SIZE '
1714 case (cl_invalid_sampler); errcode = 'CL_INVALID_SAMPLER '
1715 case (cl_invalid_binary); errcode = 'CL_INVALID_BINARY '
1716 case (cl_invalid_build_options); errcode = 'CL_INVALID_BUILD_OPTIONS '
1717 case (cl_invalid_program); errcode = 'CL_INVALID_PROGRAM '
1718 case (cl_invalid_program_executable); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
1719 case (cl_invalid_kernel_name); errcode = 'CL_INVALID_KERNEL_NAME '
1720 case (cl_invalid_kernel_definition); errcode = 'CL_INVALID_KERNEL_DEFINITION '
1721 case (cl_invalid_kernel); errcode = 'CL_INVALID_KERNEL '
1722 case (cl_invalid_arg_index); errcode = 'CL_INVALID_ARG_INDEX '
1723 case (cl_invalid_arg_value); errcode = 'CL_INVALID_ARG_VALUE '
1724 case (cl_invalid_arg_size); errcode = 'CL_INVALID_ARG_SIZE '
1725 case (cl_invalid_kernel_args); errcode = 'CL_INVALID_KERNEL_ARGS '
1726 case (cl_invalid_work_dimension); errcode = 'CL_INVALID_WORK_DIMENSION '
1727 case (cl_invalid_work_group_size); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
1728 case (cl_invalid_work_item_size); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
1729 case (cl_invalid_global_offset); errcode = 'CL_INVALID_GLOBAL_OFFSET '
1730 case (cl_invalid_event_wait_list); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
1731 case (cl_invalid_event); errcode = 'CL_INVALID_EVENT '
1732 case (cl_invalid_operation); errcode = 'CL_INVALID_OPERATION '
1733 case (cl_invalid_gl_object); errcode = 'CL_INVALID_GL_OBJECT '
1734 case (cl_invalid_buffer_size); errcode = 'CL_INVALID_BUFFER_SIZE '
1735 case (cl_invalid_mip_level); errcode = 'CL_INVALID_MIP_LEVEL '
1736 case (cl_invalid_global_work_size); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
1737 case (cl_platform_not_found_khr); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
1738 case default
1739 write(errcode, '(i10)') ierr
1740 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1741 end select
1742
1743 message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
1744 call messages_fatal(1)
1745
1746 pop_sub(opencl_print_error)
1747 end subroutine opencl_print_error
1748#endif
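! Editorial sketch (not part of the upstream source): the typical calling
! pattern for opencl_print_error, mirroring its use with clReleaseKernel later
! in this file. The example subroutine and its dummy argument are hypothetical.
#ifdef HAVE_OPENCL
  subroutine example_release_kernel(kernel)
    type(cl_kernel), intent(inout) :: kernel

    integer :: ierr

    call clReleaseKernel(kernel, ierr)
    ! translate the raw status code into a readable name and abort
    if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseKernel")
  end subroutine example_release_kernel
#endif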
1749
1750 ! ----------------------------------------------------
1751
1752 subroutine clblas_print_error(ierr, name)
1753 integer, intent(in) :: ierr
1754 character(len=*), intent(in) :: name
1755
1756 character(len=40) :: errcode
1757
1758 push_sub(clblas_print_error)
1759#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
1760 select case (ierr)
1761 case (clblassuccess); errcode = 'clblasSuccess'
1762 case (clblasinvalidvalue); errcode = 'clblasInvalidValue'
1763 case (clblasinvalidcommandqueue); errcode = 'clblasInvalidCommandQueue'
1764 case (clblasinvalidcontext); errcode = 'clblasInvalidContext'
1765 case (clblasinvalidmemobject); errcode = 'clblasInvalidMemObject'
1766 case (clblasinvaliddevice); errcode = 'clblasInvalidDevice'
1767 case (clblasinvalideventwaitlist); errcode = 'clblasInvalidEventWaitList'
1768 case (clblasoutofresources); errcode = 'clblasOutOfResources'
1769 case (clblasoutofhostmemory); errcode = 'clblasOutOfHostMemory'
1770 case (clblasinvalidoperation); errcode = 'clblasInvalidOperation'
1771 case (clblascompilernotavailable); errcode = 'clblasCompilerNotAvailable'
1772 case (clblasbuildprogramfailure); errcode = 'clblasBuildProgramFailure'
1773 case (clblasnotimplemented); errcode = 'clblasNotImplemented'
1774 case (clblasnotinitialized); errcode = 'clblasNotInitialized'
1775 case (clblasinvalidmata); errcode = 'clblasInvalidMatA'
1776 case (clblasinvalidmatb); errcode = 'clblasInvalidMatB'
1777 case (clblasinvalidmatc); errcode = 'clblasInvalidMatC'
1778 case (clblasinvalidvecx); errcode = 'clblasInvalidVecX'
1779 case (clblasinvalidvecy); errcode = 'clblasInvalidVecY'
1780 case (clblasinvaliddim); errcode = 'clblasInvalidDim'
1781 case (clblasinvalidleaddima); errcode = 'clblasInvalidLeadDimA'
1782 case (clblasinvalidleaddimb); errcode = 'clblasInvalidLeadDimB'
1783 case (clblasinvalidleaddimc); errcode = 'clblasInvalidLeadDimC'
1784 case (clblasinvalidincx); errcode = 'clblasInvalidIncX'
1785 case (clblasinvalidincy); errcode = 'clblasInvalidIncY'
1786 case (clblasinsufficientmemmata); errcode = 'clblasInsufficientMemMatA'
1787 case (clblasinsufficientmemmatb); errcode = 'clblasInsufficientMemMatB'
1788 case (clblasinsufficientmemmatc); errcode = 'clblasInsufficientMemMatC'
1789 case (clblasinsufficientmemvecx); errcode = 'clblasInsufficientMemVecX'
1790 case (clblasinsufficientmemvecy); errcode = 'clblasInsufficientMemVecY'
1791#ifdef HAVE_CLBLAST
1792 case (clblastinsufficientmemorytemp); errcode = 'clblastInsufficientMemoryTemp'
1793 case (clblastinvalidbatchcount); errcode = 'clblastInvalidBatchCount'
1794 case (clblastinvalidoverridekernel); errcode = 'clblastInvalidOverrideKernel'
1795 case (clblastmissingoverrideparameter); errcode = 'clblastMissingOverrideParameter'
1796 case (clblastinvalidlocalmemusage); errcode = 'clblastInvalidLocalMemUsage'
1797 case (clblastnohalfprecision); errcode = 'clblastNoHalfPrecision'
1798 case (clblastnodoubleprecision); errcode = 'clblastNoDoublePrecision'
1799 case (clblastinvalidvectorscalar); errcode = 'clblastInvalidVectorScalar'
1800 case (clblastinsufficientmemoryscalar); errcode = 'clblastInsufficientMemoryScalar'
1801 case (clblastdatabaseerror); errcode = 'clblastDatabaseError'
1802 case (clblastunknownerror); errcode = 'clblastUnknownError'
1803 case (clblastunexpectederror); errcode = 'clblastUnexpectedError'
1804#endif
1805
1806 case default
1807 write(errcode, '(i10)') ierr
1808 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1809 end select
1810#endif
1811
1812 message(1) = 'Error in calling clblas routine '//trim(name)//' : '//trim(errcode)
1813 call messages_fatal(1)
1814
1815 pop_sub(clblas_print_error)
1816 end subroutine clblas_print_error
1817
1818 ! ----------------------------------------------------
1819 subroutine clfft_print_error(ierr, name)
1820 integer, intent(in) :: ierr
1821 character(len=*), intent(in) :: name
1822
1823 character(len=40) :: errcode
1824
1825 push_sub(clfft_print_error)
1826#ifdef HAVE_CLFFT
1827 select case (ierr)
1828 case (clfft_invalid_global_work_size); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1829 case (clfft_invalid_mip_level); errcode = 'CLFFT_INVALID_MIP_LEVEL'
1830 case (clfft_invalid_buffer_size); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
1831 case (clfft_invalid_gl_object); errcode = 'CLFFT_INVALID_GL_OBJECT'
1832 case (clfft_invalid_operation); errcode = 'CLFFT_INVALID_OPERATION'
1833 case (clfft_invalid_event); errcode = 'CLFFT_INVALID_EVENT'
1834 case (clfft_invalid_event_wait_list); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
1835 case (clfft_invalid_global_offset); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
1836 case (clfft_invalid_work_item_size); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
1837 case (clfft_invalid_work_group_size); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
1838 case (clfft_invalid_work_dimension); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
1839 case (clfft_invalid_kernel_args); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
1840 case (clfft_invalid_arg_size); errcode = 'CLFFT_INVALID_ARG_SIZE'
1841 case (clfft_invalid_arg_value); errcode = 'CLFFT_INVALID_ARG_VALUE'
1842 case (clfft_invalid_arg_index); errcode = 'CLFFT_INVALID_ARG_INDEX'
1843 case (clfft_invalid_kernel); errcode = 'CLFFT_INVALID_KERNEL'
1844 case (clfft_invalid_kernel_definition); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
1845 case (clfft_invalid_kernel_name); errcode = 'CLFFT_INVALID_KERNEL_NAME'
1846 case (clfft_invalid_program_executable); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1847 case (clfft_invalid_program); errcode = 'CLFFT_INVALID_PROGRAM'
1848 case (clfft_invalid_build_options); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
1849 case (clfft_invalid_binary); errcode = 'CLFFT_INVALID_BINARY'
1850 case (clfft_invalid_sampler); errcode = 'CLFFT_INVALID_SAMPLER'
1851 case (clfft_invalid_image_size); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
1852 case (clfft_invalid_image_format_descriptor); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1853 case (clfft_invalid_mem_object); errcode = 'CLFFT_INVALID_MEM_OBJECT'
1854 case (clfft_invalid_host_ptr); errcode = 'CLFFT_INVALID_HOST_PTR'
1855 case (clfft_invalid_command_queue); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
1856 case (clfft_invalid_queue_properties); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
1857 case (clfft_invalid_context); errcode = 'CLFFT_INVALID_CONTEXT'
1858 case (clfft_invalid_device); errcode = 'CLFFT_INVALID_DEVICE'
1859 case (clfft_invalid_platform); errcode = 'CLFFT_INVALID_PLATFORM'
1860 case (clfft_invalid_device_type); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
1861 case (clfft_invalid_value); errcode = 'CLFFT_INVALID_VALUE'
1862 case (clfft_map_failure); errcode = 'CLFFT_MAP_FAILURE'
1863 case (clfft_build_program_failure); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
1864 case (clfft_image_format_not_supported); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1865 case (clfft_image_format_mismatch); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
1866 case (clfft_mem_copy_overlap); errcode = 'CLFFT_MEM_COPY_OVERLAP'
1867 case (clfft_profiling_info_not_available); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1868 case (clfft_out_of_host_memory); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
1869 case (clfft_out_of_resources); errcode = 'CLFFT_OUT_OF_RESOURCES'
1870 case (clfft_mem_object_allocation_failure); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1871 case (clfft_compiler_not_available); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
1872 case (clfft_device_not_available); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
1873 case (clfft_device_not_found); errcode = 'CLFFT_DEVICE_NOT_FOUND'
1874 case (clfft_success); errcode = 'CLFFT_SUCCESS'
1875 case (clfft_bugcheck); errcode = 'CLFFT_BUGCHECK'
1876 case (clfft_notimplemented); errcode = 'CLFFT_NOTIMPLEMENTED'
1877 case (clfft_file_not_found); errcode = 'CLFFT_FILE_NOT_FOUND'
1878 case (clfft_file_create_failure); errcode = 'CLFFT_FILE_CREATE_FAILURE'
1879 case (clfft_version_mismatch); errcode = 'CLFFT_VERSION_MISMATCH'
1880 case (clfft_invalid_plan); errcode = 'CLFFT_INVALID_PLAN'
1881 case (clfft_device_no_double); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
1882 case (clfft_endstatus); errcode = 'CLFFT_ENDSTATUS'
1883 case default
1884 write(errcode, '(i10)') ierr
1885 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1886 end select
1887#endif
1888
1889 message(1) = 'clfft '//trim(name)//' '//trim(errcode)
1890 call messages_fatal(1)
1891
1892 pop_sub(clfft_print_error)
1893 end subroutine clfft_print_error
1894
1895 ! ----------------------------------------------------
1896
1897#ifdef HAVE_OPENCL
1898 logical function f90_cl_device_has_extension(device, extension) result(has)
1899 type(cl_device_id), intent(inout) :: device
1900 character(len=*), intent(in) :: extension
1901
1902 integer :: cl_status
1903 character(len=2048) :: all_extensions
1904
1905#ifdef HAVE_OPENCL
1906 call clgetdeviceinfo(device, cl_device_extensions, all_extensions, cl_status)
1907#endif
1908
1909 has = index(all_extensions, extension) /= 0
1910
1911 end function f90_cl_device_has_extension
1912#endif
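! Editorial sketch (not part of the upstream source): querying a device
! capability with f90_cl_device_has_extension. 'cl_khr_fp64' is the standard
! OpenCL extension name for double-precision support; the example subroutine
! itself is hypothetical.
#ifdef HAVE_OPENCL
  subroutine example_require_fp64(device)
    type(cl_device_id), intent(inout) :: device

    if (.not. f90_cl_device_has_extension(device, 'cl_khr_fp64')) then
      message(1) = 'The selected OpenCL device does not support double precision.'
      call messages_fatal(1)
    end if
  end subroutine example_require_fp64
#endif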
1913
1914 ! ----------------------------------------------------
1915
1916 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1917 type(accel_mem_t), intent(inout) :: buffer
1918 type(type_t), intent(in) :: type
1919 integer(int64), intent(in) :: nval
1920 integer(int64), optional, intent(in) :: offset
1921 logical, optional, intent(in) :: async
1922
1923 integer :: bsize
1924 integer(int64) :: nval_real, offset_real
1925 type(accel_kernel_t), pointer :: kernel
1926
1927 if (nval == 0) return
1928
1930
1931 nval_real = nval
1932 if (type == type_cmplx) nval_real = nval_real * 2
1933 if (present(offset)) then
1934 offset_real = offset
1935 if (type == type_cmplx) offset_real = offset_real * 2
1936 else
1937 offset_real = 0_int64
1938 end if
1939
1940 assert(nval_real > 0)
1941
1942 if (type == type_integer) then
1943 kernel => set_zero_int
1944 else
1945 kernel => set_zero
1946 end if
1947 call accel_set_kernel_arg(kernel, 0, nval_real)
1948 call accel_set_kernel_arg(kernel, 1, offset_real)
1949 call accel_set_kernel_arg(kernel, 2, buffer)
1950
1951 bsize = accel_kernel_workgroup_size(kernel)
1952
1953 call accel_kernel_run(kernel, (/ nval_real /), (/ 1_int64 /))
1954
1955 if (.not. optional_default(async, .false.)) call accel_finish()
1956
1957 pop_sub(accel_set_buffer_to_zero_i8)
1958 end subroutine accel_set_buffer_to_zero_i8
1959
1960 ! ----------------------------------------------------
1961
1962 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1963 type(accel_mem_t), intent(inout) :: buffer
1964 type(type_t), intent(in) :: type
1965 integer(int32), intent(in) :: nval
1966 integer(int32), optional, intent(in) :: offset
1967 logical, optional, intent(in) :: async
1968
1969 push_sub(accel_set_buffer_to_zero_i4)
1970
1971 if (present(offset)) then
1972 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1973 else
1974 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1975 end if
1976
1977 pop_sub(accel_set_buffer_to_zero_i4)
1978 end subroutine accel_set_buffer_to_zero_i4
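! Editorial sketch (not part of the upstream source): zeroing a freshly
! created device buffer with the int32 specific defined above (a generic
! accel_set_buffer_to_zero presumably wraps the _i4/_i8 variants). The
! subroutine and its argument names are hypothetical.
  subroutine example_zero_buffer(np)
    integer, intent(in) :: np

    type(accel_mem_t) :: buf

    call accel_create_buffer(buf, accel_mem_read_write, type_float, np)
    ! clear all np entries; pass async=.true. to skip the implied accel_finish()
    call accel_set_buffer_to_zero_i4(buf, type_float, np)
    call accel_release_buffer(buf)
  end subroutine example_zero_buffer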
1979
1980 ! ----------------------------------------------------
1981
1982 subroutine opencl_check_bandwidth()
1983 integer :: itime
1984 integer, parameter :: times = 10
1985 integer :: size
1986 real(real64) :: time, stime
1987 real(real64) :: read_bw, write_bw
1988 type(accel_mem_t) :: buff
1989 real(real64), allocatable :: data(:)
1990
1991 call messages_new_line()
1992 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1993 call messages_new_line()
1994 call messages_info()
1995
1996 call messages_write(' Buffer size Read bw Write bw')
1997 call messages_new_line()
1998 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1999 call messages_info()
2000
2001 size = 15000
2002 do
2003 safe_allocate(data(1:size))
2004 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
2005
2006 stime = loct_clock()
2007 do itime = 1, times
2008 call accel_write_buffer(buff, size, data)
2009 call accel_finish()
2010 end do
2011 time = (loct_clock() - stime)/real(times, real64)
2012
2013 write_bw = real(size, real64) *8.0_real64/time
2014
2015 stime = loct_clock()
2016 do itime = 1, times
2017 call accel_read_buffer(buff, size, data)
2018 end do
2019 call accel_finish()
2020
2021 time = (loct_clock() - stime)/real(times, real64)
2022 read_bw = real(size, real64) *8.0_real64/time
2023
2024 call messages_write(size*8.0_real64/1024.0_real64**2)
2025 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
2026 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
2027 call messages_info()
2028
2029 call accel_release_buffer(buff)
2030
2031 safe_deallocate_a(data)
2032
2033 size = int(size*2.0)
2034
2035 if (size > 50000000) exit
2036 end do
2037 end subroutine opencl_check_bandwidth
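! Editorial sketch (not part of the upstream source): the bandwidth arithmetic
! used by opencl_check_bandwidth, isolated for clarity. A buffer of nn real64
! values moves nn*8 bytes per transfer, so the rate in MiB/s is bytes per
! second divided by 1024**2. Names are hypothetical.
  real(real64) function example_bandwidth_mib(nn, seconds) result(mib_per_s)
    integer, intent(in) :: nn
    real(real64), intent(in) :: seconds

    mib_per_s = real(nn, real64)*8.0_real64/seconds/1024.0_real64**2
  end function example_bandwidth_mib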
2038
2039 ! ----------------------------------------------------
2041 logical pure function accel_use_shared_mem() result(use_shared_mem)
2042
2043 use_shared_mem = accel%shared_mem
2044
2045 end function accel_use_shared_mem
2046
2047 !------------------------------------------------------------
2048
2049 subroutine accel_kernel_global_init()
2050
2051 push_sub(accel_kernel_global_init)
2052
2053 nullify(head)
2054
2055 call cuda_module_map_init(accel%module_map)
2056
2057 pop_sub(accel_kernel_global_init)
2058 end subroutine accel_kernel_global_init
2059
2060 !------------------------------------------------------------
2061
2062 subroutine accel_kernel_global_end()
2063 type(accel_kernel_t), pointer :: next_head
2064
2065 push_sub(accel_kernel_global_end)
2066
2067 do
2068 if (.not. associated(head)) exit
2069 next_head => head%next
2070 call accel_kernel_end(head)
2071 head => next_head
2072 end do
2073
2074 if (accel_is_enabled()) then
2075 call cuda_module_map_end(accel%module_map)
2076 end if
2077
2078 pop_sub(accel_kernel_global_end)
2079 end subroutine accel_kernel_global_end
2080
2081 !------------------------------------------------------------
2082
2083 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
2084 type(accel_kernel_t), intent(inout) :: this
2085 character(len=*), intent(in) :: file_name
2086 character(len=*), intent(in) :: kernel_name
2087 character(len=*), optional, intent(in) :: flags
2088
2089#ifdef HAVE_OPENCL
2090 type(cl_program) :: prog
2091#endif
2092#ifdef HAVE_CUDA
2093 character(len=1000) :: all_flags
2094#endif
2095
2096 push_sub(accel_kernel_build)
2097
2098 call profiling_in("ACCEL_COMPILE", exclude = .true.)
2099
2100#ifdef HAVE_CUDA
2101 all_flags = '-I'//trim(conf%share)//'/opencl/'//" "//trim(accel%debug_flag)
2102
2103 if (accel_use_shared_mem()) then
2104 all_flags = trim(all_flags)//' -DSHARED_MEM'
2105 end if
2106
2107 if (present(flags)) then
2108 all_flags = trim(all_flags)//' '//trim(flags)
2109 end if
2110
2111 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
2112
2113 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
2114 call cuda_alloc_arg_array(this%arguments)
2115
2116 this%cuda_shared_mem = 0
2117#endif
2118
2119#ifdef HAVE_OPENCL
2120 call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
2121 call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
2122 call opencl_release_program(prog)
2123#endif
2124
2125 this%initialized = .true.
2126 this%kernel_name = trim(kernel_name)
2128 call profiling_out("ACCEL_COMPILE")
2129
2130 pop_sub(accel_kernel_build)
2131 end subroutine accel_kernel_build
2132
2133 !------------------------------------------------------------
2134
2135 subroutine accel_kernel_end(this)
2136 type(accel_kernel_t), intent(inout) :: this
2137#ifdef HAVE_OPENCL
2138 integer :: ierr
2139#endif
2141 push_sub(accel_kernel_end)
2142
2143#ifdef HAVE_CUDA
2144 call cuda_free_arg_array(this%arguments)
2145 call cuda_release_kernel(this%cuda_kernel)
2146 ! modules are not released here, since they are not associated with a kernel
2147#endif
2148
2149#ifdef HAVE_OPENCL
2150 call clreleasekernel(this%kernel, ierr)
2151 if (ierr /= cl_success) call opencl_print_error(ierr, "release_kernel")
2152#endif
2153 this%initialized = .false.
2154
2155 pop_sub(accel_kernel_end)
2156 end subroutine accel_kernel_end
2157
2158 !------------------------------------------------------------
2159
2160 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
2161 type(accel_kernel_t), target, intent(inout) :: this
2162 character(len=*), intent(in) :: file_name
2163 character(len=*), intent(in) :: kernel_name
2164 character(len=*), optional, intent(in) :: flags
2165
2166 push_sub(accel_kernel_start_call)
2167
2168 if (.not. this%initialized) then
2169 call accel_kernel_build(this, file_name, kernel_name, flags)
2170 this%next => head
2171 head => this
2172 end if
2173
2174 pop_sub(accel_kernel_start_call)
2175 end subroutine accel_kernel_start_call
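! Editorial sketch (not part of the upstream source): the usual life cycle of
! a kernel managed by this module, following the same sequence used by
! accel_set_buffer_to_zero_i8 above. The kernel file, kernel name, and buffer
! are hypothetical; only the accel_* calls are actual module procedures.
  subroutine example_run_kernel(buf, nn)
    type(accel_mem_t), intent(inout) :: buf
    integer(int64), intent(in) :: nn

    type(accel_kernel_t), save, target :: kernel_example

    ! compiled on first use and cached in the global kernel list
    call accel_kernel_start_call(kernel_example, 'example.cl', 'example_kernel')

    call accel_set_kernel_arg(kernel_example, 0, nn)
    call accel_set_kernel_arg(kernel_example, 1, buf)

    ! one-dimensional launch: one work item per element, local size of 1
    call accel_kernel_run(kernel_example, (/ nn /), (/ 1_int64 /))
    call accel_finish()
  end subroutine example_run_kernel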
2176
2177 !--------------------------------------------------------------
2178
2179 integer(int64) pure function accel_global_memory_size() result(size)
2180
2181 size = accel%global_memory_size
2182
2183 end function accel_global_memory_size
2184
2185 !--------------------------------------------------------------
2186
2187 integer(int64) pure function accel_local_memory_size() result(size)
2188
2189 size = accel%local_memory_size
2190
2191 end function accel_local_memory_size
2192
2193 !--------------------------------------------------------------
2194
2195 integer pure function accel_max_size_per_dim(dim) result(size)
2196 integer, intent(in) :: dim
2197
2198 size = 0
2199#ifdef HAVE_OPENCL
2200 size = 32768 ! Arbitrarily limit the higher dimensions to 32768: using 2**30 would
2201 ! wrap around to zero when multiplied by 2048 and converted to a 4-byte integer.
2202 if (dim == 1) size = 2**30
2203#endif
2204#ifdef HAVE_CUDA
2205 size = 32768
2206 if (dim == 1) size = 2**30
2207#endif
2208 end function accel_max_size_per_dim
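! Editorial sketch (not part of the upstream source): a caller can use
! accel_max_size_per_dim to decide whether a requested global size still fits
! in the first grid dimension or must be split over a second one. The function
! below is hypothetical.
  logical function example_needs_second_dim(nglobal) result(needs_split)
    integer(int64), intent(in) :: nglobal

    needs_split = nglobal > int(accel_max_size_per_dim(1), int64)
  end function example_needs_second_dim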
2209
2210 ! ------------------------------------------------------
2211
2212 subroutine accel_set_stream(stream_number)
2213 integer, intent(in) :: stream_number
2214
2215 push_sub(accel_set_stream)
2216
2217 if (accel_is_enabled()) then
2218#ifdef HAVE_CUDA
2219 call cuda_set_stream(accel%cuda_stream, stream_number)
2220 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
2221#endif
2222 end if
2223
2224 pop_sub(accel_set_stream)
2225 end subroutine accel_set_stream
2226
2227 ! ------------------------------------------------------
2228
2229 subroutine accel_get_stream(stream_number)
2230 integer, intent(inout) :: stream_number
2231
2232 push_sub(accel_get_stream)
2233
2234 if (accel_is_enabled()) then
2235#ifdef HAVE_CUDA
2236 call cuda_get_stream(stream_number)
2237#endif
2238 end if
2239
2240 pop_sub(accel_get_stream)
2241 end subroutine accel_get_stream
2242
2243 ! ------------------------------------------------------
2244
2245 subroutine accel_synchronize_all_streams()
2246 push_sub(accel_synchronize_all_streams)
2247
2248 if (accel_is_enabled()) then
2249#ifdef HAVE_CUDA
2250 call cuda_synchronize_all_streams()
2251#endif
2252 end if
2253
2254 pop_sub(accel_synchronize_all_streams)
2255 end subroutine accel_synchronize_all_streams
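! Editorial sketch (not part of the upstream source): switching CUDA streams
! around a batch of asynchronous work and restoring the previous stream
! afterwards. The stream numbering here is purely illustrative.
  subroutine example_use_stream()
    integer :: saved_stream

    saved_stream = 0
    call accel_get_stream(saved_stream)
    call accel_set_stream(saved_stream + 1)
    ! ... enqueue asynchronous kernels or transfers on the new stream ...
    call accel_set_stream(saved_stream)
    call accel_synchronize_all_streams()
  end subroutine example_use_stream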
2256
2257 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2258 type(c_ptr), intent(in) :: buffer
2259 integer(int64), intent(in) :: offset
2260 type(c_ptr) :: buffer_offset
2261
2263#ifdef HAVE_CUDA
2264 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
2265#else
2266 ! this is needed to make the compiler happy for non-GPU compilations
2267 buffer_offset = buffer
2268#endif
2269 pop_sub(daccel_get_pointer_with_offset)
2270 end function daccel_get_pointer_with_offset
2271
2272 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2273 type(c_ptr), intent(in) :: buffer
2274 integer(int64), intent(in) :: offset
2275 type(c_ptr) :: buffer_offset
2276
2277 push_sub(zaccel_get_pointer_with_offset)
2278#ifdef HAVE_CUDA
2279 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
2280#else
2281 ! this is needed to make the compiler happy for non-GPU compilations
2282 buffer_offset = buffer
2283#endif
2284 pop_sub(zaccel_get_pointer_with_offset)
2285 end function zaccel_get_pointer_with_offset
2286
2287 subroutine accel_clean_pointer(buffer)
2288 type(c_ptr), intent(in) :: buffer
2289
2290 push_sub(accel_clean_pointer)
2291#ifdef HAVE_CUDA
2292 call cuda_clean_pointer(buffer)
2293#endif
2294 pop_sub(accel_clean_pointer)
2295 end subroutine accel_clean_pointer
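! Editorial sketch (not part of the upstream source): the helpers above return
! a raw device pointer displaced by a number of elements; the z variant doubles
! the displacement internally because each complex entry stores two real64
! values. The subroutine and its arguments are hypothetical.
  subroutine example_offset_pointer(devptr, nn)
    type(c_ptr), intent(in) :: devptr
    integer(int64), intent(in) :: nn

    type(c_ptr) :: shifted

    ! pointer displaced by nn elements from the start of a real-valued device array
    shifted = daccel_get_pointer_with_offset(devptr, nn)
  end subroutine example_offset_pointer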
2296
2297 !> Get unfolded size: some kernels (e.g. projectors) unfold the array
2298 !! across warps as an optimization; this returns the corresponding
2299 !! global and local launch sizes.
2300 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
2301 integer(int64), intent(in) :: size
2302 integer(int64), intent(out) :: grid_size
2303 integer(int64), intent(out) :: thread_block_size
2304
2305 push_sub(accel_get_unfolded_size)
2306#ifdef __HIP_PLATFORM_AMD__
2307 ! not beneficial for AMD chips
2308 grid_size = size
2309 thread_block_size = size
2310#else
2311 grid_size = size * accel%warp_size
2312 thread_block_size = accel%warp_size
2313#endif
2314 pop_sub(accel_get_unfolded_size)
2315 end subroutine accel_get_unfolded_size
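! Editorial sketch (not part of the upstream source): a worked example of the
! unfolding above. Assuming a warp size of 32, a logical size of 1000 points
! becomes a grid of 32000 work items in blocks of 32 (one warp per point);
! on AMD builds both values stay at 1000. The subroutine is hypothetical.
  subroutine example_unfolded_launch(npoints)
    integer(int64), intent(in) :: npoints

    integer(int64) :: grid, tblock

    call accel_get_unfolded_size(npoints, grid, tblock)
    ! grid and tblock would then be passed as the global and local sizes,
    ! e.g. call accel_kernel_run(kernel, (/ grid /), (/ tblock /))
  end subroutine example_unfolded_launch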
2316
2317#include "undef.F90"
2318#include "real.F90"
2319#include "accel_inc.F90"
2320
2321#include "undef.F90"
2322#include "complex.F90"
2323#include "accel_inc.F90"
2324
2325#include "undef.F90"
2326#include "integer.F90"
2327#include "accel_inc.F90"
2328
2329#include "undef.F90"
2330#include "integer8.F90"
2331#include "accel_inc.F90"
2332
2333end module accel_oct_m
2334
2335!! Local Variables:
2336!! mode: f90
2337!! coding: utf-8
2338!! End: