!! Octopus -- accel.F90 (source listing; navigation text from the generated
!! documentation page replaced by this comment header)
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_CUDA)
22#define HAVE_ACCEL 1
23#endif
24
25module accel_oct_m
27 use cuda_oct_m
28 use debug_oct_m
29 use global_oct_m
30 use iso_c_binding, only: c_null_ptr, c_size_t
31 use, intrinsic :: iso_fortran_env
32 use loct_oct_m
33 use math_oct_m
35 use mpi_oct_m
37 use types_oct_m
38 use parser_oct_m
42 use string_oct_m
43
44 implicit none
45
46 private
47
48 public :: &
53 accel_t, &
56 accel_init, &
57 accel_end, &
89
90 integer, public, parameter :: &
91 ACCEL_MEM_READ_ONLY = 0, &
94
96 ! Components are public by default
97#if defined(HAVE_CUDA)
98 type(c_ptr) :: cuda_context
99#else
100 integer :: dummy
101#endif
102 end type accel_context_t
103
  !> Opaque handle for the compute device in use.
  type accel_device_t
    ! Components are public by default
#if defined(HAVE_CUDA)
    type(c_ptr) :: cuda_device   !< CUDA device handle (set by cuda_init)
#else
    integer :: dummy             !< placeholder component for builds without CUDA
#endif
  end type accel_device_t
112
  !> Global state of the acceleration framework: device and context handles,
  !! library handles, device limits queried at startup, and run-time
  !! configuration flags parsed from the input file.
  type accel_t
    ! Components are public by default
    type(accel_context_t) :: context        !< device context handle
    type(accel_device_t) :: device          !< selected device handle
    type(c_ptr) :: cublas_handle            !< cuBLAS library handle (cublas_init)
    type(c_ptr) :: cuda_stream              !< stream used for device operations
    type(c_ptr) :: module_map               !< presumably a cache of loaded kernel modules -- not used in this listing, verify
    integer :: max_block_size               !< maximum threads per block (device query)
    integer(int64) :: shared_memory_size    !< shared memory per block, bytes (device query)
    integer(int64) :: global_memory_size    !< total device memory, bytes (device query)
    logical :: enabled                      !< acceleration active for this run (DisableAccel)
    logical :: allow_CPU_only               !< allow falling back to CPU-only execution (AllowCPUonly)
    logical :: cuda_mpi                     !< use GPU-aware MPI features (GPUAwareMPI)
    integer :: warp_size                    !< device warp size (device query)
    integer(int64) :: initialize_buffers    !< InitializeGPUBuffers option value (no/yes/nan)
    character(len=32) :: debug_flag         !< compiler flag used when building kernels ("-g" or "-lineinfo")
    integer(int64) :: max_block_dim(3)      !< per-dimension block-size limits (device query)
    integer(int64) :: max_grid_dim(3)       !< per-dimension grid-size limits (device query)
  end type accel_t
132
  !> Descriptor for a block of device memory.
  type accel_mem_t
    ! Components are public by default
    type(c_ptr) :: mem                 !< raw device pointer
    integer(c_size_t) :: size = 0      !< number of elements of kind "type" (bytes = size * types_get_size(type))
    type(type_t) :: type               !< element type of the stored data
    integer :: flags = 0               !< access flags (e.g. ACCEL_MEM_READ_ONLY)
    logical :: allocated = .false.     !< whether this descriptor currently owns device memory
  end type accel_mem_t
141
  !> Handle for a compiled device kernel. Kernels are chained through the
  !! "next" pointer (see the module-level "head" variable), presumably so
  !! they can be torn down globally -- verify against the kernel cleanup code.
  type accel_kernel_t
    ! Components are public by default
#ifdef HAVE_CUDA
    type(c_ptr) :: cuda_kernel       !< CUDA function handle
    type(c_ptr) :: cuda_module       !< module the function was loaded from
    type(c_ptr) :: arguments         !< argument list filled before launch (cuda_kernel_set_arg_buffer)
#endif
    logical :: initialized = .false. !< has this kernel been built/loaded?
    type(accel_kernel_t), pointer :: next       !< next kernel in the global list
    integer :: arg_count                        !< number of arguments set so far -- TODO confirm semantics
    character(len=128) :: kernel_name           !< name used in launch-error messages
  end type accel_kernel_t
154
155 type(accel_t), public :: accel
156
157 ! Global variables defined on device
158 type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
159 type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
160
161 ! the kernels
162 type(accel_kernel_t), public, target, save :: kernel_vpsi
163 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
164 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
165 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
166 type(accel_kernel_t), public, target, save :: kernel_daxpy
167 type(accel_kernel_t), public, target, save :: kernel_zaxpy
168 type(accel_kernel_t), public, target, save :: kernel_copy
169 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
170 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
171 type(accel_kernel_t), public, target, save :: dpack
172 type(accel_kernel_t), public, target, save :: zpack
173 type(accel_kernel_t), public, target, save :: dunpack
174 type(accel_kernel_t), public, target, save :: zunpack
175 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
176 type(accel_kernel_t), public, target, save :: kernel_density_real
177 type(accel_kernel_t), public, target, save :: kernel_density_complex
178 type(accel_kernel_t), public, target, save :: kernel_density_spinors
179 type(accel_kernel_t), public, target, save :: kernel_phase
180 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
181 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
182 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
183 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
184 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
185 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
186 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
187 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
188 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
189 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
190 type(accel_kernel_t), public, target, save :: dzmul
191 type(accel_kernel_t), public, target, save :: zzmul
192
193 interface accel_grid_size
195 end interface accel_grid_size
196
200
201 interface accel_padded_size
203 end interface accel_padded_size
205 interface accel_create_buffer
207 end interface accel_create_buffer
209 interface accel_kernel_run
211 end interface accel_kernel_run
240 end interface accel_write_buffer
241
242 interface accel_read_buffer
257 end interface accel_read_buffer
260 module procedure &
284 module procedure &
290
292 module procedure &
298
299 integer :: buffer_alloc_count
300 integer(int64) :: allocated_mem
301 type(accel_kernel_t), pointer :: head
302 type(alloc_cache_t) :: memcache
303
304contains
305
306 pure logical function accel_is_enabled() result(enabled)
307#ifdef HAVE_ACCEL
308 enabled = accel%enabled
309#else
310 enabled = .false.
311#endif
312 end function accel_is_enabled
313
314 ! ------------------------------------------
315
316 pure logical function accel_allow_cpu_only() result(allow)
317#ifdef HAVE_ACCEL
318 allow = accel%allow_CPU_only
319#else
320 allow = .true.
321#endif
322 end function accel_allow_cpu_only
323
324 ! ------------------------------------------
325
  !> Initialize the GPU acceleration framework.
  !!
  !! Parses the acceleration-related input variables, creates the device
  !! context, stream and cuBLAS handle, queries the device limits, sets up
  !! the allocation cache, builds the standard kernels, and creates the
  !! small global constant buffers (0 and 1, real and complex).
  subroutine accel_init(base_grp, namespace)
    type(mpi_grp_t), intent(inout) :: base_grp   !< MPI group used to distribute devices over ranks
    type(namespace_t), intent(in) :: namespace

    logical :: disable, default, run_benchmark
    integer :: idevice
#ifdef HAVE_CUDA
    integer :: dim
#ifdef HAVE_MPI
    character(len=256) :: sys_name
#endif
#endif

    push_sub(accel_init)

    buffer_alloc_count = 0

    !%Variable DisableAccel
    !%Type logical
    !%Default yes
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with CUDA support, it will
    !% try to initialize and use an accelerator device. By setting this
    !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even if it is available.
    !%End
    call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
#ifdef HAVE_ACCEL
    default = .false.
#else
    default = .true.
#endif
    call parse_variable(namespace, 'DisableAccel', default, disable)
    accel%enabled = .not. disable

#ifndef HAVE_ACCEL
    ! requesting acceleration in a build without support is a hard error
    if (accel%enabled) then
      message(1) = 'Octopus was compiled without Cuda support.'
      call messages_fatal(1)
    end if
#endif

    if (.not. accel_is_enabled()) then
      pop_sub(accel_init)
      return
    end if

    call messages_obsolete_variable(namespace, 'AccelPlatform')
    call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')

    !%Variable AccelDevice
    !%Type integer
    !%Default 0
    !%Section Execution::Accel
    !%Description
    !% This variable selects the GPU that Octopus will use. You can specify a
    !% numerical id to select a specific device.
    !%
    !% In case of MPI enabled runs devices are distributed in a round robin fashion,
    !% starting at this value.
    !%End
    call parse_variable(namespace, 'AccelDevice', 0, idevice)

    call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')

    if (idevice < 0) then
      call messages_write('Invalid AccelDevice')
      call messages_fatal()
    end if

    call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)

#ifdef HAVE_CUDA
    if (idevice<0) idevice = 0
    ! the rank is passed so devices are assigned round-robin across ranks
    call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
      idevice, base_grp%rank)
#ifdef HAVE_MPI
    call loct_sysname(sys_name)
    write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
      " on ", trim(sys_name)
    call messages_info(1, all_nodes = .true.)
#endif

    call cublas_init(accel%cublas_handle, accel%cuda_stream)
#endif

    ! Get some device information that we will need later
#ifdef HAVE_CUDA
    call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
    call cuda_device_shared_memory(accel%device%cuda_device, accel%shared_memory_size)
    call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_block_size)
    call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
    call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
    accel%max_block_dim(1) = int(dim, int64)
    call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
    accel%max_block_dim(2) = int(dim, int64)
    call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
    accel%max_block_dim(3) = int(dim, int64)
    call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
    accel%max_grid_dim(1) = int(dim, int64)
    call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
    accel%max_grid_dim(2) = int(dim, int64)
    call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
    accel%max_grid_dim(3) = int(dim, int64)
#endif

    if (base_grp%is_root()) call device_info()

    ! initialize the cache used to speed up allocations
    ! (capacity: a quarter of the device memory)
    call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))

    ! now initialize the kernels

#if defined(HAVE_HIP)
    accel%debug_flag = "-g"
#elif defined(HAVE_CUDA)
    accel%debug_flag = "-lineinfo"
#endif

    call accel_kernel_start_call(kernel_vpsi, 'vpsi.cu', "vpsi")
    call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cu', "vpsi_complex")
    call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cu', "vpsi_spinors")
    call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cu', "vpsi_spinors_complex")
    call accel_kernel_start_call(kernel_daxpy, 'axpy.cu', "daxpy", flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(kernel_zaxpy, 'axpy.cu', "zaxpy", flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cu', "dbatch_axpy_function", &
      flags = ' -DRTYPE_DOUBLE')
    call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cu', "zbatch_axpy_function", &
      flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cu', "dbatch_ax_function_py", &
      flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cu', "zbatch_ax_function_py", &
      flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cu', "dbatch_mf_dotp")
    call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cu', "zbatch_mf_dotp")
    call accel_kernel_start_call(dpack, 'pack.cu', "dpack")
    call accel_kernel_start_call(zpack, 'pack.cu', "zpack")
    call accel_kernel_start_call(dunpack, 'pack.cu', "dunpack")
    call accel_kernel_start_call(zunpack, 'pack.cu', "zunpack")
    call accel_kernel_start_call(kernel_copy, 'copy.cu', "copy")
    call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cu', "copy_complex_to_real")
    call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cu', "copy_real_to_complex")
    call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cu', "ghost_reorder")
    call accel_kernel_start_call(kernel_density_real, 'density.cu', "density_real")
    call accel_kernel_start_call(kernel_density_complex, 'density.cu', "density_complex")
    call accel_kernel_start_call(kernel_density_spinors, 'density.cu', "density_spinors")
    call accel_kernel_start_call(kernel_phase, 'phase.cu', "phase")
    call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cu', "ddot_matrix")
    call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cu', "zdot_matrix")
    call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cu', "zdot_matrix_spinors")


    call accel_kernel_start_call(dzmul, 'mul.cu', "dzmul", flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(zzmul, 'mul.cu', "zzmul", flags = '-DRTYPE_COMPLEX')

    ! Define global buffers (device-resident constants 0 and 1)
    if(.not. accel_buffer_is_allocated(zm_0_buffer)) then
      call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
      call accel_write_buffer(zm_0_buffer, m_z0)
    end if
    if(.not. accel_buffer_is_allocated(zm_1_buffer)) then
      call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
      call accel_write_buffer(zm_1_buffer, m_z1)
    end if
    if(.not. accel_buffer_is_allocated(dm_0_buffer)) then
      call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
      call accel_write_buffer(dm_0_buffer, m_zero)
    end if
    if(.not. accel_buffer_is_allocated(dm_1_buffer)) then
      call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
      call accel_write_buffer(dm_1_buffer, m_one)
    end if


    !%Variable AccelBenchmark
    !%Type logical
    !%Default no
    !%Section Execution::Accel
    !%Description
    !% If this variable is set to yes, Octopus will run some
    !% routines to benchmark the performance of the accelerator device.
    !%End
    call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)

    call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')

    if (run_benchmark) then
      ! NOTE(review): the benchmark call is not visible in this listing --
      ! verify against the upstream source.
    end if

    !%Variable GPUAwareMPI
    !%Type logical
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with GPU support and MPI support and if the MPI
    !% implementation is GPU-aware (i.e., it supports communication using device pointers),
    !% this switch can be set to true to use the GPU-aware MPI features. The advantage
    !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
    !% going through the host memory.
    !% The default is false, except when the configure switch --enable-cudampi is set, in which
    !% case this variable is set to true.
    !%End
#ifdef HAVE_CUDA_MPI
    default = .true.
#else
    default = .false.
#endif
    call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
    if (accel%cuda_mpi) then
#ifndef HAVE_CUDA_MPI
      call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
      call messages_warning()
#endif
      call messages_write("Using GPU-aware MPI.")
      call messages_info()
    end if


    !%Variable AllowCPUonly
    !%Type logical
    !%Section Execution::Accel
    !%Description
    !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
    !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
    !% code execution also in these cases.
    !%End
#if defined (HAVE_ACCEL)
    default = .false.
#else
    default = .true.
#endif
    call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)


    !%Variable InitializeGPUBuffers
    !%Type integer
    !%Default no
    !%Section Execution::Accel
    !%Description
    !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
    !%Option no 0
    !% Do not initialize GPU buffers.
    !%Option yes 1
    !% Initialize GPU buffers to zero.
    !%Option nan 2
    !% Initialize GPU buffers to nan.
    !%End
    call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
    if (.not. varinfo_valid_option('InitializeGPUBuffers', accel%initialize_buffers)) then
      call messages_input_error(namespace, 'InitializeGPUBuffers')
    end if


    call messages_print_with_emphasis(namespace=namespace)

    pop_sub(accel_init)

  contains

    !> Print a summary of the selected device (framework, vendor, name,
    !! capabilities, driver version and memory limits) to the output.
    subroutine device_info()
#ifdef HAVE_CUDA
      integer :: version
      character(kind=c_char) :: cval_str(257)
#endif
      integer :: major, minor
      character(len=256) :: val_str

      push_sub(accel_init.device_info)

      call messages_new_line()
      call messages_write('Selected device:')
      call messages_new_line()

#ifdef HAVE_CUDA
#ifdef __HIP_PLATFORM_AMD__
      call messages_write(' Framework : ROCm')
#else
      call messages_write(' Framework : CUDA')
#endif
#endif
      call messages_info()

#ifdef HAVE_CUDA
      call messages_write(' Device type : GPU', new_line = .true.)
#ifdef __HIP_PLATFORM_AMD__
      call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
#else
      call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
#endif
#endif

#ifdef HAVE_CUDA
      ! query the device name as a C string and convert it to Fortran
      cval_str = c_null_char
      call cuda_device_name(accel%device%cuda_device, cval_str)
      call string_c_to_f(cval_str, val_str)
#endif
      ! NOTE(review): without HAVE_CUDA, val_str (and major/minor below) are
      ! used uninitialized; in practice this routine only runs on GPU builds.
      call messages_write(' Device name : '//trim(val_str))
      call messages_new_line()

#ifdef HAVE_CUDA
      call cuda_device_capability(accel%device%cuda_device, major, minor)
#endif
      call messages_write(' Cuda capabilities :')
      call messages_write(major, fmt = '(i2)')
      call messages_write('.')
      call messages_write(minor, fmt = '(i1)')
      call messages_new_line()

      ! VERSION
#ifdef HAVE_CUDA
      call cuda_driver_version(version)
      call messages_write(' Driver version : ')
      call messages_write(version)
#endif
      call messages_new_line()


      call messages_write(' Device memory :')
      call messages_write(accel%global_memory_size, units=unit_megabytes)
      call messages_new_line()

      call messages_write(' Shared memory :')
      call messages_write(accel%shared_memory_size, units=unit_kilobytes)
      call messages_new_line()

      call messages_write(' Max. block size :')
      call messages_write(accel%max_block_size)
      call messages_new_line()

      call messages_info()

      pop_sub(accel_init.device_info)
    end subroutine device_info

  end subroutine accel_init
663
664 ! ------------------------------------------
  !> Shut down the acceleration framework: release the global constant
  !! buffers, drain and report on the allocation cache, destroy the cuBLAS
  !! handle and device context, and abort if user code leaked buffers.
  subroutine accel_end(namespace)
    type(namespace_t), intent(in) :: namespace

    integer(int64) :: hits, misses
    real(real64) :: volume_hits, volume_misses
    logical :: found
    type(accel_mem_t) :: tmp

    push_sub(accel_end)

    if (accel_is_enabled()) then

      ! Release global buffers
      call accel_free_buffer(zm_0_buffer)
      call accel_free_buffer(zm_1_buffer)
      call accel_free_buffer(dm_0_buffer)
      call accel_free_buffer(dm_1_buffer)

      ! drain the allocation cache: free every block still held by it
      do
        call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
        if (.not. found) exit

#ifdef HAVE_CUDA
        call cuda_mem_free(tmp%mem)
#endif
      end do

      call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)

      ! report cache statistics
      call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)

      call messages_new_line()
      call messages_write(' Number of allocations =')
      call messages_write(hits + misses, new_line = .true.)
      call messages_write(' Volume of allocations =')
      call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
        new_line = .true.)
      call messages_write(' Hit ratio =')
      if (hits + misses > 0) then
        call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
      else
        call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
      end if
      call messages_write('%', new_line = .true.)
      call messages_write(' Volume hit ratio =')
      if (volume_hits + volume_misses > 0) then
        call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
      else
        call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
      end if
      call messages_write('%')
      call messages_new_line()
      call messages_info()

      call messages_print_with_emphasis(namespace=namespace)
    end if

    ! NOTE(review): a statement (kernel teardown?) appears to be missing from
    ! this listing at this point -- verify against the upstream source.

    if (accel_is_enabled()) then
#ifdef HAVE_CUDA
      call cublas_end(accel%cublas_handle)
      if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
        call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
      end if
#endif

      ! any buffer not freed by its owner is a programming error
      if (buffer_alloc_count /= 0) then
        call messages_write('Accel:')
        call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
        call messages_write(' in ')
        call messages_write(buffer_alloc_count)
        call messages_write(' buffers were not deallocated.')
        call messages_fatal()
      end if

    end if

    pop_sub(accel_end)
  end subroutine accel_end
745
746 ! ------------------------------------------
747
749 subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
750 integer(int64), intent(in) :: n(:)
751 integer(int64), intent(in) :: blocksizes(:)
752 integer(int64), intent(out) :: gridsizes(:)
753
754 integer :: dim, i
755
756 dim = ubound(n, dim=1)
757 assert(dim == ubound(blocksizes, dim=1))
758 assert(dim == ubound(gridsizes, dim=1))
759
760 do i = 1, dim
761 gridsizes(i) = (n(i) + blocksizes(i) - 1_int64) / blocksizes(i)
762 gridsizes(i) = min(gridsizes(i), accel%max_grid_dim(i))
763 end do
764 end subroutine accel_grid_size_array_i8
765
766 ! ------------------------------------------
767
769 subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
770 integer, intent(in) :: n(:)
771 integer, intent(in) :: blocksizes(:)
772 integer, intent(out) :: gridsizes(:)
773
774 integer(int64) :: gridsizes64(size(gridsizes))
775
776 call accel_grid_size_array_i8(int(n(:), int64), int(blocksizes(:), int64), gridsizes64)
777
778 gridsizes = int(gridsizes64, int32)
779 end subroutine accel_grid_size_array_i4
780
781 ! ------------------------------------------
782
784 subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
785 integer(int64), intent(in) :: n
786 integer(int64), intent(in) :: blocksizes
787 integer(int64), intent(out) :: gridsizes
788
789 integer(int64) :: temp(1)
791 call accel_grid_size_array_i8( (/n/), (/blocksizes/), temp)
792
793 gridsizes = temp(1)
794 end subroutine accel_grid_size_i8
795
796 ! ------------------------------------------
797
799 subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
800 integer, intent(in) :: n
801 integer, intent(in) :: blocksizes
802 integer, intent(out) :: gridsizes
803
804 integer(int64) :: temp(1)
805
806 call accel_grid_size_array_i8(int((/n/), int64), int((/blocksizes/), int64), temp)
807
808 gridsizes = int(temp(1), int32)
809 end subroutine accel_grid_size_i4
811! ------------------------------------------
812
  !> Build 3D grid/block sizes for kernels that process data in packs:
  !! the first dimension carries pack_size threads, and n is folded into
  !! the second (and, when it would exceed the per-dimension limit, the
  !! third) grid dimension.
  subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
    integer(int64), intent(in) :: n              !< number of points to cover
    integer(int64), intent(in) :: pack_size      !< threads reserved along dimension 1
    integer(int64), dimension(3), intent(out) :: gridsizes
    integer(int64), dimension(3), intent(out) :: blocksizes
    type(accel_kernel_t), optional, intent(inout) :: kernel

    integer(int64) :: bsize, dim2, dim3
    integer(int64), dimension(3) :: nn

    ! threads available along dimension 2, after reserving pack_size for
    ! dimension 1; uses the kernel-specific block size when a kernel is given
    if(present(kernel)) then
      bsize = accel_kernel_block_size(kernel)/pack_size
    else
      bsize = accel_max_block_size()/pack_size
    end if

    ! split n across dimensions 2 and 3 so that dimension 2 stays within
    ! the device limit (accel_max_size_per_dim -- defined elsewhere, assumed
    ! to return the per-dimension grid limit; verify)
    dim3 = n/(accel_max_size_per_dim(2)*bsize) + 1
    dim2 = min(accel_max_size_per_dim(2)*bsize, pad(n, bsize))

    nn = (/pack_size, dim2, dim3/)
    blocksizes = (/pack_size, bsize, 1_int64/)

    call accel_grid_size(nn, blocksizes, gridsizes)
  end subroutine accel_grid_size_extend_dim_i8
841 ! ------------------------------------------
842
846 subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
847 integer, intent(in) :: n
848 integer, intent(in) :: pack_size
849 integer, dimension(3), intent(out) :: gridsizes
850 integer, dimension(3), intent(out) :: blocksizes
851 type(accel_kernel_t), optional, intent(inout) :: kernel
852
853 integer(int64) :: gridsizes64(3), blocksizes64(3)
854
855 call accel_grid_size_extend_dim_i8(int(n, int64), int(pack_size, int64), &
856 gridsizes64, blocksizes64, kernel=kernel)
858 gridsizes = int(gridsizes64, int32)
859 blocksizes = int(blocksizes64, int32)
860 end subroutine accel_grid_size_extend_dim_i4
861
862 ! ------------------------------------------
863
864 integer(int64) function accel_padded_size_i8(nn) result(psize)
865 integer(int64), intent(in) :: nn
866
867 integer(int64) :: modnn, bsize
868
869 psize = nn
870
871 if (accel_is_enabled()) then
872
873 bsize = accel_max_block_size()
874
875 psize = nn
876 modnn = mod(nn, bsize)
877 if (modnn /= 0) psize = psize + bsize - modnn
878
879 end if
880
881 end function accel_padded_size_i8
882
883 ! ------------------------------------------
884
885 integer(int32) function accel_padded_size_i4(nn) result(psize)
886 integer(int32), intent(in) :: nn
888 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
889
890 end function accel_padded_size_i4
891
892 ! ------------------------------------------
893
894 subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
895 type(accel_mem_t), intent(inout) :: this
896 integer, intent(in) :: flags
897 type(type_t), intent(in) :: type
898 integer, intent(in) :: size
899 logical, optional, intent(in) :: set_zero
900 logical, optional, intent(in) :: async
901
902 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero, async)
903 end subroutine accel_create_buffer_4
904
905 ! ------------------------------------------
906
  !> Allocate a device buffer of "size" elements of the given type.
  !! The allocation cache is consulted first; only on a cache miss is new
  !! device memory allocated. Depending on set_zero / InitializeGPUBuffers,
  !! the new buffer may be filled with 0x00 or 0xFF bytes.
  subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
    type(accel_mem_t), intent(inout) :: this
    integer, intent(in) :: flags               !< access flags (e.g. ACCEL_MEM_READ_ONLY)
    type(type_t), intent(in) :: type           !< element type
    integer(int64), intent(in) :: size         !< number of elements
    logical, optional, intent(in) :: set_zero  !< override InitializeGPUBuffers for this buffer
    logical, optional, intent(in) :: async     !< use the asynchronous allocation path

    integer(int64) :: fsize
    logical :: found
    integer(int64) :: initialize_buffers

    push_sub(accel_create_buffer_8)

    this%type = type
    this%size = size
    this%flags = flags
    ! size in bytes
    fsize = int(size, int64)*types_get_size(type)
    this%allocated = .true.

    if (fsize > 0) then

      ! try to reuse a cached allocation of the same byte size
      call alloc_cache_get(memcache, fsize, found, this%mem)

      if (.not. found) then
#ifdef HAVE_CUDA
        if(optional_default(async, .false.)) then
          call cuda_mem_alloc_async(this%mem, fsize)
        else
          call cuda_mem_alloc(this%mem, fsize)
        end if
#endif
      end if

      ! bookkeeping for the leak check in accel_end
      buffer_alloc_count = buffer_alloc_count + 1
      allocated_mem = allocated_mem + fsize

    end if

    ! an explicit set_zero argument overrides the global input-file setting
    if (present(set_zero)) then
      initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
    else
      initialize_buffers = accel%initialize_buffers
    end if
    select case (initialize_buffers)
    case (option__initializegpubuffers__yes)
      call accel_set_buffer_to(this, type, int(z'00', int8), size)
    case (option__initializegpubuffers__nan)
      ! 0xFF in every byte yields NaN for IEEE floating-point data
      call accel_set_buffer_to(this, type, int(z'FF', int8), size)
    end select

    pop_sub(accel_create_buffer_8)
  end subroutine accel_create_buffer_8
960
961 ! ------------------------------------------
962
963 subroutine accel_free_buffer(this, async)
964 type(accel_mem_t), intent(inout) :: this
965 logical, optional, intent(in) :: async
966
967 logical :: put
968 integer(int64) :: fsize
969
970 push_sub(accel_free_buffer)
971
972 if (this%size > 0) then
973
974 fsize = int(this%size, int64)*types_get_size(this%type)
975
976 call alloc_cache_put(memcache, fsize, this%mem, put)
977
978 if (.not. put) then
979#ifdef HAVE_CUDA
980 if (optional_default(async, .false.)) then
981 call cuda_mem_free_async(this%mem)
982 else
983 call cuda_mem_free(this%mem)
984 end if
985#endif
986 end if
987
988 buffer_alloc_count = buffer_alloc_count - 1
989 allocated_mem = allocated_mem + fsize
990
991 end if
992
993 this%size = 0
994 this%flags = 0
995
996 this%allocated = .false.
997
998 pop_sub(accel_free_buffer)
999 end subroutine accel_free_buffer
1000
1001 ! ------------------------------------------
1002
1007 subroutine accel_detach_buffer(this)
1008 type(accel_mem_t), intent(inout) :: this
1009
1010 push_sub(accel_detach_buffer)
1011
1012 this%mem = c_null_ptr
1013 this%size = 0
1014 this%type = type_none
1015 this%flags = 0
1016 this%allocated = .false.
1017
1018 pop_sub(accel_detach_buffer)
1019 end subroutine accel_detach_buffer
1020
1021 ! ------------------------------------------------------
1022
1023 ! Check if the temporary buffers are the right size, if not reallocate them
1024 subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
1025 type(accel_mem_t), intent(inout) :: buffer
1026 integer, intent(in) :: flags
1027 type(type_t), intent(in) :: type
1028 integer, intent(in) :: required_size
1029 logical, intent(in) :: set_zero
1030 logical, optional, intent(in) :: async
1031
1032 push_sub(accel_ensure_buffer_size)
1033
1034
1035 if (accel_buffer_is_allocated(buffer) .and. buffer%size < required_size) then
1036 call accel_free_buffer(buffer, async=optional_default(async, .false.))
1037 end if
1038
1039 if (.not. accel_buffer_is_allocated(buffer)) then
1040 call accel_create_buffer(buffer, flags, type, required_size, set_zero=set_zero, async=optional_default(async, .false.))
1041 end if
1042
1044 end subroutine accel_ensure_buffer_size
1045
1046 ! ------------------------------------------
1047
  !> Whether this buffer descriptor currently refers to allocated device memory.
  logical pure function accel_buffer_is_allocated(this) result(allocated)
    type(accel_mem_t), intent(in) :: this

    allocated = this%allocated
  end function accel_buffer_is_allocated
1053
1054 ! -----------------------------------------
1055
  !> Wait for outstanding device operations to complete; a no-op when
  !! acceleration is disabled.
  subroutine accel_finish()
    ! no push_sub, called too frequently

    if (accel_is_enabled()) then
#ifdef HAVE_CUDA
      ! NOTE(review): the synchronization call (presumably a CUDA context or
      ! stream synchronize) is not visible in this listing -- verify against
      ! the upstream source.
#endif
    end if
  end subroutine accel_finish
1066 ! ------------------------------------------
1067
  !> Bind a device buffer as argument number "narg" of the given kernel's
  !! launch argument list.
  subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
    type(accel_kernel_t), intent(inout) :: kernel
    integer, intent(in) :: narg               !< position of the argument
    type(accel_mem_t), intent(in) :: buffer   !< must own device memory

    ! binding an unallocated buffer would pass a dangling device pointer
    assert(accel_buffer_is_allocated(buffer))

    ! no push_sub, called too frequently
#ifdef HAVE_CUDA
    call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
#endif

  end subroutine accel_set_kernel_arg_buffer
1081
1082 ! ------------------------------------------
1083
!> Launch an accelerator kernel with 8-byte integer grid and block sizes.
!! The sizes are padded to three dimensions, validated against the device
!! limits (block dimensions, threads per block, grid dimensions, and the
!! optional dynamic shared memory request), and the kernel is launched.
!! Returns without launching if any grid dimension is zero.
1090 subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
1091 type(accel_kernel_t), intent(inout) :: kernel
1092 integer(int64), intent(in) :: gridsizes(:)
1093 integer(int64), intent(in) :: blocksizes(:)
1094 integer(int64), optional, intent(in) :: shared_memory_size
1095
1096 integer :: dim
1097 integer(int64) :: gsizes(1:3)
1098 integer(int64) :: bsizes(1:3)
1099
1100 ! no push_sub, called too frequently
1101
1102 ! CUDA needs all dimensions
1103 gsizes = 1
1104 bsizes = 1
1105
1106 dim = ubound(gridsizes, dim=1)
1107
! grid and block size arrays must have the same rank
1108 assert(dim == ubound(blocksizes, dim=1))
1110 ! if one size is zero, there is nothing to do
1111 if (any(gridsizes == 0)) return
1112
1113 assert(all(blocksizes > 0))
1114
1115 gsizes(1:dim) = gridsizes(1:dim)
1116 bsizes(1:dim) = blocksizes(1:dim)
1117
1118#ifdef HAVE_CUDA
1119 ! Maximum dimension of a block
1120 if (any(bsizes(1:3) > accel%max_block_dim(1:3))) then
1121 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1122 message(2) = "The following conditions should be fulfilled:"
1123 write(message(3), "(A, I8, A, I8)") "Dim 1: ", bsizes(1), " <= ", accel%max_block_dim(1)
1124 write(message(4), "(A, I8, A, I8)") "Dim 2: ", bsizes(2), " <= ", accel%max_block_dim(2)
1125 write(message(5), "(A, I8, A, I8)") "Dim 3: ", bsizes(3), " <= ", accel%max_block_dim(3)
1126 message(6) = "This is an internal error, please contact the developers."
1127 call messages_fatal(6)
1128 end if
1129
1130
1131 ! Maximum number of threads per block
1132 if (product(bsizes) > accel_max_block_size()) then
1133 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1134 message(2) = "The following condition should be fulfilled:"
1135 write(message(3), "(I8, A, I8)") product(bsizes), " <= ", accel_max_block_size()
1136 message(4) = "This is an internal error, please contact the developers."
1137 call messages_fatal(4)
1138 end if
1139
1140 ! Maximum dimensions of the grid of thread block
1141 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1142 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1143 message(2) = "The following conditions should be fulfilled:"
1144 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1145 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1146 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1147 message(6) = "This is an internal error, please contact the developers."
1148 call messages_fatal(6)
1149 end if
1150
! validate the requested dynamic shared memory against the device limit
1151 if(present(shared_memory_size)) then
1152
1153 if (shared_memory_size > accel%shared_memory_size) then
1154 message(1) = "Shared memory too large in kernel "//trim(kernel%kernel_name)
1155 message(2) = "The following condition should be fulfilled:"
1156 message(3) = "Requested shared memory <= Available shared memory"
1157 write(message(4), '(a,f12.6,a)') "Requested shared memory: ", real(shared_memory_size, real64) /1024.0, " Kb"
1158 write(message(5), '(a,f12.6,a)') "Available shared memory: ", real(accel%shared_memory_size, real64) /1024.0, " Kb"
1159 message(6) = "This is an internal error, please contact the developers."
1160 call messages_fatal(6)
1161 else if (shared_memory_size <= 0) then
1162 message(1) = "Invalid shared memory size in kernel "//trim(kernel%kernel_name)
1163 write(message(2), '(a,f12.6,a)') "Shared memory size requested: ", real(shared_memory_size, real64) /1024.0, " Kb"
1164 message(3) = "This is an internal error, please contact the developers."
1165 call messages_fatal(3)
1166 end if
1167 end if
1168
! launch: an absent shared_memory_size means zero bytes of dynamic shared memory
1169 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), bsizes(1), &
1170 optional_default(shared_memory_size, 0_int64), kernel%arguments)
1171#endif
1172
1173 end subroutine accel_kernel_run_8
1174
1175 ! -----------------------------------------------
1176
1182
!> Run a kernel with 4-byte integer sizes: converts the grid and block
!! sizes to int64 and forwards to accel_kernel_run_8.
1183 subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
1184 type(accel_kernel_t), intent(inout) :: kernel
1185 integer, intent(in) :: gridsizes(:)
1186 integer, intent(in) :: blocksizes(:)
1187 integer(int64), optional, intent(in) :: shared_memory_size
1188
1189 call accel_kernel_run_8(kernel, int(gridsizes, int64), int(blocksizes, int64), shared_memory_size)
1190
1191 end subroutine accel_kernel_run_4
1192
1193 ! -----------------------------------------------
1194
1195 integer pure function accel_max_block_size() result(max_block_size)
1196 max_block_size = accel%max_block_size
1197 end function accel_max_block_size
1198
1199 ! -----------------------------------------------
1201 integer function accel_kernel_block_size(kernel) result(block_size)
1202 type(accel_kernel_t), intent(inout) :: kernel
1203
1204#ifdef HAVE_CUDA
1205 integer :: max_block_size
1206#endif
1207
1208 block_size = 0
1209
1210#ifdef HAVE_CUDA
1211 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_block_size)
1212 if (debug%info .and. max_block_size /= accel%max_block_size) then
1213 write(message(1), "(A, I5, A)") "A kernel can use only less threads per block (", max_block_size, ")", &
1214 "than available on the device (", accel%max_block_size, ")"
1215 call messages_info(1)
1216 end if
1217
1218 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1219 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1220 block_size = 256
1221
1222 ! make sure we do not use more threads per block than available for this kernel
1223 block_size = min(block_size, max_block_size)
1224#endif
1225
1226 end function accel_kernel_block_size
1227
1228 ! ----------------------------------------------------
1229
!> Fill nval elements of a device buffer with the byte value val,
!! optionally starting at element offset 'offset'. Unless async is
!! present and .true., the routine synchronizes with the device before
!! returning.
1230 subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
1231 type(accel_mem_t), intent(inout) :: buffer
1232 type(type_t), intent(in) :: type
1233 integer(int8), intent(in) :: val
1234 integer(int64), intent(in) :: nval
1235 integer(int64), optional, intent(in) :: offset
1236 logical, optional, intent(in) :: async
1237
1238 integer(int64) :: nval_, offset_, type_size
1239
1240 push_sub(accel_set_buffer_to)
! nothing to do for an empty fill
1242 if (nval == 0) then
1243 pop_sub(accel_set_buffer_to)
1244 return
1245 end if
1246 assert(nval > 0)
1247
1248 if (present(offset)) then
1249 assert(offset >= 0)
! an offset beyond the buffer end silently skips the fill
1250 if(offset > buffer%size) then
1251 pop_sub(accel_set_buffer_to)
1252 return
1253 end if
1254 end if
1255
! convert the element count and offset into bytes
1256 type_size = types_get_size(type)
1258 nval_ = nval*type_size
1259
1260 offset_ = 0_int64
1261 if (present(offset)) offset_ = offset*type_size
1262
! NOTE(review): unlike most device calls in this file, this one is not wrapped
! in #ifdef HAVE_CUDA -- presumably a stub exists for non-GPU builds; confirm.
1263 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1264 if(.not. optional_default(async, .false.)) call accel_finish()
1265
1266 pop_sub(accel_set_buffer_to)
1267 end subroutine accel_set_buffer_to
1268
1269 ! ----------------------------------------------------
1270
1271 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1272 type(accel_mem_t), intent(inout) :: buffer
1273 type(type_t), intent(in) :: type
1274 integer(int64), intent(in) :: nval
1275 integer(int64), optional, intent(in) :: offset
1276 logical, optional, intent(in) :: async
1279
1280 call accel_set_buffer_to(buffer, type, int(z'00', int8), nval, offset, async)
1281
1283 end subroutine accel_set_buffer_to_zero_i8
1284
1285 ! ----------------------------------------------------
1286
1287 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1288 type(accel_mem_t), intent(inout) :: buffer
1289 type(type_t), intent(in) :: type
1290 integer(int32), intent(in) :: nval
1291 integer(int32), optional, intent(in) :: offset
1292 logical, optional, intent(in) :: async
1293
1295
1296 if (present(offset)) then
1297 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1298 else
1299 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1300 end if
1301
1303 end subroutine accel_set_buffer_to_zero_i4
1304
1305 ! ----------------------------------------------------
1306
!> Benchmark host<->device transfer bandwidth and print a table of read
!! and write rates. The buffer size starts at 15000 real64 elements and
!! is doubled each iteration until it exceeds 50000000 elements.
1307 subroutine accel_check_bandwidth()
1308 integer :: itime
1309 integer, parameter :: times = 10
1310 integer :: size
1311 real(real64) :: time, stime
1312 real(real64) :: read_bw, write_bw
1313 type(accel_mem_t) :: buff
1314 real(real64), allocatable :: data(:)
1315
1316 call messages_new_line()
1317 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1318 call messages_new_line()
1319 call messages_info()
1320
1321 call messages_write(' Buffer size Read bw Write bw')
1322 call messages_new_line()
1323 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1324 call messages_info()
1325
1326 size = 15000
1327 do
! data is never initialized: only the transfer rate matters, not the values
1328 safe_allocate(data(1:size))
1329 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1330
! time host-to-device copies, synchronizing after every transfer
1331 stime = loct_clock()
1332 do itime = 1, times
1333 call accel_write_buffer(buff, size, data)
1334 call accel_finish()
1335 end do
1336 time = (loct_clock() - stime)/real(times, real64)
1337
! 8 bytes per real64 element
1338 write_bw = real(size, real64) *8.0_real64/time
1339
! time device-to-host copies; note: only one synchronization after the loop
1340 stime = loct_clock()
1341 do itime = 1, times
1342 call accel_read_buffer(buff, size, data)
1343 end do
1344 call accel_finish()
1345
1346 time = (loct_clock() - stime)/real(times, real64)
1347 read_bw = real(size, real64) *8.0_real64/time
1348
1349 call messages_write(size*8.0_real64/1024.0_real64**2)
1350 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1351 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1352 call messages_info()
1353
1354 call accel_free_buffer(buff)
1355
1356 safe_deallocate_a(data)
1357
! double the buffer size for the next round
1358 size = int(size*2.0)
1359
1360 if (size > 50000000) exit
1361 end do
1362 end subroutine accel_check_bandwidth
1363
1364 !------------------------------------------------------------
1365
1366 subroutine accel_kernel_global_init()
1367
1368 push_sub(accel_kernel_global_init)
1369
1370 nullify(head)
1371
1372 call cuda_module_map_init(accel%module_map)
1373
1375 end subroutine accel_kernel_global_init
1376
1377 !------------------------------------------------------------
1378
1379 subroutine accel_kernel_global_end()
1380 type(accel_kernel_t), pointer :: next_head
1381
1382 push_sub(accel_kernel_global_end)
1383
1384 do
1385 if (.not. associated(head)) exit
1386 next_head => head%next
1388 head => next_head
1389 end do
1390
1391 if (accel_is_enabled()) then
1392 call cuda_module_map_end(accel%module_map)
1393 end if
1396 end subroutine accel_kernel_global_end
1397
1398 !------------------------------------------------------------
1399
!> Compile the kernel kernel_name from source file file_name and store the
!! resulting handles in this. Optional extra compiler flags can be passed
!! via flags; compilation time is accounted under the ACCEL_COMPILE
!! profiling region.
1400 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1401 type(accel_kernel_t), intent(inout) :: this
1402 character(len=*), intent(in) :: file_name
1403 character(len=*), intent(in) :: kernel_name
1404 character(len=*), optional, intent(in) :: flags
1405
1406#ifdef HAVE_CUDA
1407 character(len=1000) :: all_flags
1408#endif
1409
1410 push_sub(accel_kernel_build)
1411
1412 call profiling_in("ACCEL_COMPILE", exclude = .true.)
1413
1414#ifdef HAVE_CUDA
! include path of the shipped kernel sources plus the global debug flag
1415 all_flags = '-I'//trim(conf%share)//'/kernels/'//" "//trim(accel%debug_flag)
1416
1417 if (present(flags)) then
1418 all_flags = trim(all_flags)//' '//trim(flags)
1419 end if
1420
! compile (or fetch from the module map cache) and create the kernel handle
1421 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
1422 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1423
1424 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1425 call cuda_alloc_arg_array(this%arguments)
1426#endif
1427
1428 this%initialized = .true.
1429 this%kernel_name = trim(kernel_name)
1430
1431 call profiling_out("ACCEL_COMPILE")
1432
1433 pop_sub(accel_kernel_build)
1434 end subroutine accel_kernel_build
1435
1436 !------------------------------------------------------------
1437
!> Release the resources held by a kernel object: its argument array and
!! the kernel handle. The compiled module is deliberately kept (see the
!! comment below) and marked uninitialized afterwards.
1438 subroutine accel_kernel_end(this)
1439 type(accel_kernel_t), intent(inout) :: this
1440
1441 push_sub(accel_kernel_end)
1442
1443#ifdef HAVE_CUDA
1444 call cuda_free_arg_array(this%arguments)
1445 call cuda_release_kernel(this%cuda_kernel)
1446 ! modules are not released here, since they are not associated to a kernel
1447#endif
1448
1449 this%initialized = .false.
1450
1451 pop_sub(accel_kernel_end)
1452 end subroutine accel_kernel_end
1453
1454 !------------------------------------------------------------
1455
1456 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
1457 type(accel_kernel_t), target, intent(inout) :: this
1458 character(len=*), intent(in) :: file_name
1459 character(len=*), intent(in) :: kernel_name
1460 character(len=*), optional, intent(in) :: flags
1461
1462 push_sub(accel_kernel_start_call)
1463
1464 if (.not. this%initialized) then
1465 call accel_kernel_build(this, file_name, kernel_name, flags)
1466 this%next => head
1467 head => this
1468 end if
1469
1471 end subroutine accel_kernel_start_call
1472
1473 !--------------------------------------------------------------
1474
1475 integer(int64) pure function accel_global_memory_size() result(size)
1476
1477 size = accel%global_memory_size
1478
1479 end function accel_global_memory_size
1480
1481 !--------------------------------------------------------------
1482
1483 integer(int64) pure function accel_shared_memory_size() result(size)
1484
1485 size = accel%shared_memory_size
1486
1487 end function accel_shared_memory_size
1488 !--------------------------------------------------------------
1489
1490 integer pure function accel_max_size_per_dim(dim) result(size)
1491 integer, intent(in) :: dim
1492
1493 size = 0
1494#ifdef HAVE_CUDA
1495 size = 32768
1496 if (dim == 1) size = 2**30
1497#endif
1498 end function accel_max_size_per_dim
1499
1500 ! ------------------------------------------------------
1501
!> Select device stream stream_number as the current stream and attach
!! the cuBLAS handle to it. A no-op when acceleration is disabled.
1502 subroutine accel_set_stream(stream_number)
1503 integer, intent(in) :: stream_number
1504
1505 push_sub(accel_set_stream)
1506
1507 if (accel_is_enabled()) then
1508#ifdef HAVE_CUDA
! switch the stream, then make cuBLAS operate on the same stream
1509 call cuda_set_stream(accel%cuda_stream, stream_number)
1510 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
1511#endif
1512 end if
1513
1514 pop_sub(accel_set_stream)
1515 end subroutine accel_set_stream
1516
1517 ! ------------------------------------------------------
!> Retrieve the number of the currently selected device stream.
!! When acceleration is disabled, stream_number is left unchanged
!! (hence the intent(inout)).
1519 subroutine accel_get_stream(stream_number)
1520 integer, intent(inout) :: stream_number
1521
1522 push_sub(accel_get_stream)
1523
1524 if (accel_is_enabled()) then
1525#ifdef HAVE_CUDA
1526 call cuda_get_stream(stream_number)
1527#endif
1528 end if
1529
1530 pop_sub(accel_get_stream)
1531 end subroutine accel_get_stream
1532
1533 ! ------------------------------------------------------
1534
1537
1538 if (accel_is_enabled()) then
1539#ifdef HAVE_CUDA
1540 call cuda_synchronize_all_streams()
1541#endif
1542 end if
1543
1545 end subroutine accel_synchronize_all_streams
1547 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1548 type(c_ptr), intent(in) :: buffer
1549 integer(int64), intent(in) :: offset
1550 type(c_ptr) :: buffer_offset
1551
1553#ifdef HAVE_CUDA
1554 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1555#else
1556 ! this is needed to make the compiler happy for non-GPU compilations
1557 buffer_offset = buffer
1558#endif
1561
1562 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1563 type(c_ptr), intent(in) :: buffer
1564 integer(int64), intent(in) :: offset
1565 type(c_ptr) :: buffer_offset
1566
1568#ifdef HAVE_CUDA
1569 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1570#else
1571 ! this is needed to make the compiler happy for non-GPU compilations
1572 buffer_offset = buffer
1573#endif
1576
!> Clean up a device pointer -- presumably one obtained from
!! [dz]accel_get_pointer_with_offset; confirm against cuda_clean_pointer.
1577 subroutine accel_clean_pointer(buffer)
1578 type(c_ptr), intent(in) :: buffer
1579
1580 push_sub(accel_clean_pointer)
1581#ifdef HAVE_CUDA
1582 call cuda_clean_pointer(buffer)
1583#endif
1584 pop_sub(accel_clean_pointer)
1585 end subroutine accel_clean_pointer
1586
1590 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
1591 integer(int64), intent(in) :: size
1592 integer(int64), intent(out) :: grid_size
1593 integer(int64), intent(out) :: thread_block_size
1594
1595 push_sub(accel_get_unfolded_size)
1596#ifdef __HIP_PLATFORM_AMD__
1597 ! not benefitial for AMD chips
1598 grid_size = 1_int64
1599 thread_block_size = size
1600#else
1601 grid_size = size
1602 thread_block_size = accel%warp_size
1603#endif
1605 end subroutine accel_get_unfolded_size
1606
1607#include "undef.F90"
1608#include "real.F90"
1609#include "accel_inc.F90"
1610
1611#include "undef.F90"
1612#include "complex.F90"
1613#include "accel_inc.F90"
1614
1615#include "undef.F90"
1616#include "integer.F90"
1617#include "accel_inc.F90"
1618
1619#include "undef.F90"
1620#include "integer8.F90"
1621#include "accel_inc.F90"
1622
1623end module accel_oct_m
1624
1625!! Local Variables:
1626!! mode: f90
1627!! coding: utf-8
1628!! End:
subroutine device_info()
Definition: accel.F90:650
subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
Definition: accel.F90:841
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5871
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3658
subroutine, public accel_clean_pointer(buffer)
Definition: accel.F90:1534
subroutine accel_kernel_global_end()
Definition: accel.F90:1350
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2830
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
Definition: accel.F90:1547
subroutine laccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5671
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5426
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3894
pure logical function, public accel_allow_cpu_only()
Definition: accel.F90:412
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2472
subroutine zaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3480
subroutine daccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2190
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3567
subroutine laccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5450
type(accel_kernel_t), target, save, public dkernel_batch_dotp
Definition: accel.F90:283
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3703
subroutine daccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:1948
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5021
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1920
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5758
subroutine zaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3313
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3145
integer function, public accel_kernel_block_size(kernel)
Definition: accel.F90:1188
subroutine zaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2727
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2109
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4775
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3204
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5782
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4110
type(accel_kernel_t), target, save, public zkernel_batch_axpy
Definition: accel.F90:280
subroutine zaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3109
subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
Definition: accel.F90:791
subroutine laccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5266
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3235
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3875
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3730
subroutine laccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5099
subroutine, public accel_free_buffer(this, async)
Definition: accel.F90:1005
subroutine daccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:1759
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4033
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1413
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3925
subroutine zaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2880
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4825
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4515
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2976
subroutine, public accel_get_stream(stream_number)
Definition: accel.F90:1476
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
Definition: accel.F90:936
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3362
subroutine iaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4003
integer(int64) pure function, public accel_global_memory_size()
Definition: accel.F90:1432
subroutine daccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2164
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4971
type(accel_kernel_t), target, save, public zkernel_ax_function_py
Definition: accel.F90:282
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2090
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1704
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3551
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2585
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4330
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2563
integer(int64) function accel_padded_size_i8(nn)
Definition: accel.F90:906
subroutine accel_check_bandwidth()
Definition: accel.F90:1278
subroutine iaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4138
subroutine daccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2419
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:5336
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1645
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4798
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3343
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4730
subroutine, public accel_finish()
Definition: accel.F90:1098
subroutine laccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5234
subroutine accel_kernel_global_init()
Definition: accel.F90:1337
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3420
subroutine daccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2218
subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 4-byte integer sizes.
Definition: accel.F90:1170
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2780
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4662
subroutine laccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5705
subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for the kernels that relies on the batch size (pack_size) and the...
Definition: accel.F90:888
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5894
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
Definition: accel.F90:1066
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
Definition: accel.F90:1201
subroutine, public accel_detach_buffer(this)
Clear a buffer handle without freeing device memory.
Definition: accel.F90:1049
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:5742
subroutine iaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3975
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1685
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2248
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3185
subroutine iaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4170
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2140
subroutine laccel_write_buffer_single(this, data, async)
Definition: accel.F90:4918
subroutine laccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5504
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2496
subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 8-byte integer sizes.
Definition: accel.F90:1132
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4476
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2938
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2957
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
Definition: accel.F90:1242
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3635
logical pure function, public accel_buffer_is_allocated(this)
Definition: accel.F90:1090
integer, parameter, public accel_mem_read_write
Definition: accel.F90:185
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2608
subroutine accel_kernel_end(this)
Definition: accel.F90:1395
type(accel_kernel_t), target, save, public dkernel_ax_function_py
Definition: accel.F90:281
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5206
subroutine zaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3075
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3591
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1735
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5572
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1504
subroutine iaccel_write_buffer_single(this, data, async)
Definition: accel.F90:3822
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4686
integer pure function, public accel_max_size_per_dim(dim)
Definition: accel.F90:1447
subroutine zaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3259
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:4240
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2050
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4457
subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
Definition: accel.F90:811
subroutine iaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4575
subroutine laccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5045
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:5129
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2799
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:5534
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5849
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4708
subroutine zaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2854
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3015
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2456
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4280
subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for the kernels that relies on the batch size (pack_size) and the...
Definition: accel.F90:858
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
Definition: accel.F90:1371
subroutine, public accel_init(base_grp, namespace)
Definition: accel.F90:422
subroutine, public accel_end(namespace)
Definition: accel.F90:707
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:4931
subroutine daccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:1813
subroutine zaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3285
subroutine, public accel_synchronize_all_streams()
Definition: accel.F90:1492
subroutine, public accel_set_stream(stream_number)
Definition: accel.F90:1459
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:5395
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2635
subroutine iaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4543
subroutine iaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4354
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5826
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4438
subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
Definition: accel.F90:826
subroutine iaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4204
integer(int32) function accel_padded_size_i4(nn)
Definition: accel.F90:927
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
Definition: accel.F90:1258
subroutine daccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2014
type(accel_kernel_t), target, save, public zkernel_batch_dotp
Definition: accel.F90:284
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:5148
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4753
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4071
pure logical function, public accel_is_enabled()
Definition: accel.F90:402
subroutine zaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3514
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1843
subroutine iaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3949
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1881
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4052
integer, parameter, public accel_mem_write_only
Definition: accel.F90:185
subroutine zaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3043
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2325
subroutine laccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5300
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:5553
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2267
subroutine laccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5639
subroutine daccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:1785
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2540
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5921
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4299
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5071
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4990
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:1980
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5167
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3613
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1519
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4609
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5611
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:1632
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1862
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2740
type(accel_t), public accel
Definition: accel.F90:250
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5804
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4408
integer(int64) pure function, public accel_shared_memory_size()
Definition: accel.F90:1440
integer pure function, public accel_max_block_size()
Definition: accel.F90:1182
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4646
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2286
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4380
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3835
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3680
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
Definition: accel.F90:949
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:5376
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2385
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2353
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:1110
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5476
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3381
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2908
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2518
type(accel_kernel_t), pointer head
Definition: accel.F90:396
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3448
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
Definition: global.F90:191
complex(real64), parameter, public m_z0
Definition: global.F90:201
complex(real64), parameter, public m_z1
Definition: global.F90:202
real(real64), parameter, public m_one
Definition: global.F90:192
System information (time, memory, sysname)
Definition: loct.F90:117
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
Definition: loct.F90:258
subroutine, public loct_sysname(name)
Definition: loct.F90:332
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:898
character(len=512), private msg
Definition: messages.F90:167
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:525
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1023
subroutine, public messages_new_line()
Definition: messages.F90:1112
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410
subroutine, public messages_input_error(namespace, var, details, row, column)
Definition: messages.F90:691
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:594
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_cmplx
Definition: types.F90:136
type(type_t), public type_none
Definition: types.F90:134
integer pure function, public types_get_size(this)
Definition: types.F90:154
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)