Octopus — accel.F90
(Doxygen source listing of this file; some original lines are elided from the listing.)
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_CUDA)
22#define HAVE_ACCEL 1
23#endif
24
25module accel_oct_m
27 use cuda_oct_m
28 use debug_oct_m
29 use global_oct_m
30 use iso_c_binding, only: c_size_t
31 use, intrinsic :: iso_fortran_env
32 use loct_oct_m
33 use math_oct_m
35 use mpi_oct_m
37 use types_oct_m
38 use parser_oct_m
42 use string_oct_m
43
44 implicit none
45
46 private
47
48 public :: &
53 accel_t, &
56 accel_init, &
57 accel_end, &
88
89 integer, public, parameter :: &
90 ACCEL_MEM_READ_ONLY = 0, &
93
95 ! Components are public by default
96#if defined(HAVE_CUDA)
97 type(c_ptr) :: cuda_context
98#else
99 integer :: dummy
100#endif
101 end type accel_context_t
102
  !> Opaque handle for the accelerator device. With CUDA support it wraps
  !! the C pointer returned by the CUDA driver; otherwise it is a placeholder.
103 type accel_device_t
104 ! Components are public by default
105#if defined(HAVE_CUDA)
106 type(c_ptr) :: cuda_device
107#else
  ! dummy component so the type is never empty when built without CUDA
108 integer :: dummy
109#endif
110 end type accel_device_t
111
  !> Global state of the acceleration subsystem: device/context handles,
  !! library handles, device limits queried at init time, and user options.
112 type accel_t
113 ! Components are public by default
114 type(accel_context_t) :: context
115 type(accel_device_t) :: device
116 type(c_ptr) :: cublas_handle
117 type(c_ptr) :: cuda_stream
118 type(c_ptr) :: module_map
119 integer :: max_block_size !< max threads per block reported by the device
120 integer(int64) :: shared_memory_size
121 integer(int64) :: global_memory_size
122 logical :: enabled !< .false. when DisableAccel is set
123 logical :: allow_CPU_only
124 logical :: cuda_mpi !< use GPU-aware MPI features
125 integer :: warp_size
126 integer(int64) :: initialize_buffers !< InitializeGPUBuffers option value
127 character(len=32) :: debug_flag
128 integer(int64) :: max_block_dim(3)
129 integer(int64) :: max_grid_dim(3)
130 end type accel_t
131
  !> A buffer allocated in device memory: raw pointer, element count,
  !! element type and the flags it was created with.
132 type accel_mem_t
133 ! Components are public by default
134 type(c_ptr) :: mem
135 integer(c_size_t) :: size = 0
136 type(type_t) :: type
137 integer :: flags = 0
  ! tracks allocation state; see accel_buffer_is_allocated
138 logical :: allocated = .false.
139 end type accel_mem_t
140
  !> A compiled device kernel. Kernels are kept in a linked list
  !! (see the 'next' pointer and the module-level 'head').
141 type accel_kernel_t
142 ! Components are public by default
143#ifdef HAVE_CUDA
144 type(c_ptr) :: cuda_kernel
145 type(c_ptr) :: cuda_module
146 type(c_ptr) :: arguments
147#endif
148 logical :: initialized = .false.
149 type(accel_kernel_t), pointer :: next
  ! number of arguments set so far; kernel_name is used in error messages
150 integer :: arg_count
151 character(len=128) :: kernel_name
152 end type accel_kernel_t
153
154 type(accel_t), public :: accel
155
156 ! Global variables defined on device
157 type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
158 type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
159
160 ! the kernels
161 type(accel_kernel_t), public, target, save :: kernel_vpsi
162 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
163 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
164 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
165 type(accel_kernel_t), public, target, save :: kernel_daxpy
166 type(accel_kernel_t), public, target, save :: kernel_zaxpy
167 type(accel_kernel_t), public, target, save :: kernel_copy
168 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
169 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
170 type(accel_kernel_t), public, target, save :: dpack
171 type(accel_kernel_t), public, target, save :: zpack
172 type(accel_kernel_t), public, target, save :: dunpack
173 type(accel_kernel_t), public, target, save :: zunpack
174 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
175 type(accel_kernel_t), public, target, save :: kernel_density_real
176 type(accel_kernel_t), public, target, save :: kernel_density_complex
177 type(accel_kernel_t), public, target, save :: kernel_density_spinors
178 type(accel_kernel_t), public, target, save :: kernel_phase
179 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
180 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
181 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
182 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
183 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
184 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
185 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
186 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
187 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
188 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
189 type(accel_kernel_t), public, target, save :: dzmul
190 type(accel_kernel_t), public, target, save :: zzmul
191
192 interface accel_grid_size
194 end interface accel_grid_size
195
199
200 interface accel_padded_size
202 end interface accel_padded_size
204 interface accel_create_buffer
206 end interface accel_create_buffer
208 interface accel_kernel_run
210 end interface accel_kernel_run
239 end interface accel_write_buffer
240
241 interface accel_read_buffer
256 end interface accel_read_buffer
259 module procedure &
283 module procedure &
289
291 module procedure &
297
298 integer :: buffer_alloc_count
299 integer(int64) :: allocated_mem
300 type(accel_kernel_t), pointer :: head
301 type(alloc_cache_t) :: memcache
302
303contains
304
  !> Returns whether GPU acceleration is active for this run.
  !! Compiled without accelerator support, this is a compile-time .false.
305 pure logical function accel_is_enabled() result(enabled)
306#ifdef HAVE_ACCEL
307 enabled = accel%enabled
308#else
309 enabled = .false.
310#endif
311 end function accel_is_enabled
312
313 ! ------------------------------------------
314
  !> Returns whether execution may continue on the CPU when GPU paths are
  !! unavailable. Always .true. in builds without accelerator support.
315 pure logical function accel_allow_cpu_only() result(allow)
316#ifdef HAVE_ACCEL
317 allow = accel%allow_CPU_only
318#else
319 allow = .true.
320#endif
321 end function accel_allow_cpu_only
322
323 ! ------------------------------------------
324
  !> Initialize the GPU acceleration subsystem: parse the Accel* input
  !! variables, create the CUDA/HIP context and cuBLAS handle, query device
  !! limits, set up the allocation cache, load the compute kernels and
  !! create the global constant buffers (0 and 1 in real and complex form).
  !! Returns early (disabled) when DisableAccel is set.
  ! NOTE(review): this listing has elided source lines (gaps in the embedded
  ! numbering, e.g. the body of the run_benchmark branch); comments below
  ! describe only the visible code.
325 subroutine accel_init(base_grp, namespace)
326 type(mpi_grp_t), intent(inout) :: base_grp
327 type(namespace_t), intent(in) :: namespace
328
329 logical :: disable, default, run_benchmark
330 integer :: idevice
331#ifdef HAVE_CUDA
332 integer :: dim
333#ifdef HAVE_MPI
334 character(len=256) :: sys_name
335#endif
336#endif
337
338 push_sub(accel_init)
339
340 buffer_alloc_count = 0
341
342 !%Variable DisableAccel
343 !%Type logical
344 !%Default yes
345 !%Section Execution::Accel
346 !%Description
347 !% If Octopus was compiled with CUDA support, it will
348 !% try to initialize and use an accelerator device. By setting this
349 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
350 !%End
351 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
352#ifdef HAVE_ACCEL
353 default = .false.
354#else
355 default = .true.
356#endif
357 call parse_variable(namespace, 'DisableAccel', default, disable)
358 accel%enabled = .not. disable
359
  ! requesting acceleration in a build without it is a fatal error
360#ifndef HAVE_ACCEL
361 if (accel%enabled) then
362 message(1) = 'Octopus was compiled without Cuda support.'
363 call messages_fatal(1)
364 end if
365#endif
366
367 if (.not. accel_is_enabled()) then
368 pop_sub(accel_init)
369 return
370 end if
371
372 call messages_obsolete_variable(namespace, 'AccelPlatform')
373 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
374
375 !%Variable AccelDevice
376 !%Type integer
377 !%Default 0
378 !%Section Execution::Accel
379 !%Description
380 !% This variable selects the GPU that Octopus will use. You can specify a
381 !% numerical id to select a specific device.
382 !%
383 !% In case of MPI enabled runs devices are distributed in a round robin fashion,
384 !% starting at this value.
385 !%End
386 call parse_variable(namespace, 'AccelDevice', 0, idevice)
387
388 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
389
390 if (idevice < 0) then
391 call messages_write('Invalid AccelDevice')
392 call messages_fatal()
393 end if
395 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
  ! create the device context and stream; the MPI rank is passed so devices
  ! can be distributed round-robin across ranks
397#ifdef HAVE_CUDA
398 if (idevice<0) idevice = 0
399 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
400 idevice, base_grp%rank)
401#ifdef HAVE_MPI
402 call loct_sysname(sys_name)
403 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
404 " on ", trim(sys_name)
405 call messages_info(1, all_nodes = .true.)
406#endif
407
408 call cublas_init(accel%cublas_handle, accel%cuda_stream)
409#endif
411
412 ! Get some device information that we will need later
413#ifdef HAVE_CUDA
414 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
415 call cuda_device_shared_memory(accel%device%cuda_device, accel%shared_memory_size)
416 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_block_size)
417 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
418 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
419 accel%max_block_dim(1) = int(dim, int64)
420 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
421 accel%max_block_dim(2) = int(dim, int64)
422 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
423 accel%max_block_dim(3) = int(dim, int64)
424 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
425 accel%max_grid_dim(1) = int(dim, int64)
426 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
427 accel%max_grid_dim(2) = int(dim, int64)
428 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
429 accel%max_grid_dim(3) = int(dim, int64)
430#endif
431
432 if (base_grp%is_root()) call device_info()
433
  ! cache is sized to a quarter of the device's total memory
434 ! initialize the cache used to speed up allocations
435 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
436
437 ! now initialize the kernels
439
440#if defined(HAVE_HIP)
441 accel%debug_flag = "-g"
442#elif defined(HAVE_CUDA)
443 accel%debug_flag = "-lineinfo"
444#endif
445
446 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cu', "vpsi")
447 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cu', "vpsi_complex")
448 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cu', "vpsi_spinors")
449 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cu', "vpsi_spinors_complex")
450 call accel_kernel_start_call(kernel_daxpy, 'axpy.cu', "daxpy", flags = '-DRTYPE_DOUBLE')
451 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cu', "zaxpy", flags = '-DRTYPE_COMPLEX')
452 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cu', "dbatch_axpy_function", &
453 flags = ' -DRTYPE_DOUBLE')
454 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cu', "zbatch_axpy_function", &
455 flags = '-DRTYPE_COMPLEX')
456 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cu', "dbatch_ax_function_py", &
457 flags = '-DRTYPE_DOUBLE')
458 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cu', "zbatch_ax_function_py", &
459 flags = '-DRTYPE_COMPLEX')
460 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cu', "dbatch_mf_dotp")
461 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cu', "zbatch_mf_dotp")
462 call accel_kernel_start_call(dpack, 'pack.cu', "dpack")
463 call accel_kernel_start_call(zpack, 'pack.cu', "zpack")
464 call accel_kernel_start_call(dunpack, 'pack.cu', "dunpack")
465 call accel_kernel_start_call(zunpack, 'pack.cu', "zunpack")
466 call accel_kernel_start_call(kernel_copy, 'copy.cu', "copy")
467 call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cu', "copy_complex_to_real")
468 call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cu', "copy_real_to_complex")
469 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cu', "ghost_reorder")
470 call accel_kernel_start_call(kernel_density_real, 'density.cu', "density_real")
471 call accel_kernel_start_call(kernel_density_complex, 'density.cu', "density_complex")
472 call accel_kernel_start_call(kernel_density_spinors, 'density.cu', "density_spinors")
473 call accel_kernel_start_call(kernel_phase, 'phase.cu', "phase")
474 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cu', "ddot_matrix")
475 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cu', "zdot_matrix")
476 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cu', "zdot_matrix_spinors")
477
478
479 call accel_kernel_start_call(dzmul, 'mul.cu', "dzmul", flags = '-DRTYPE_DOUBLE')
480 call accel_kernel_start_call(zzmul, 'mul.cu', "zzmul", flags = '-DRTYPE_COMPLEX')
481
  ! single-element constant buffers holding 0 and 1 for BLAS-style calls
482 ! Define global buffers
483 if(.not. accel_buffer_is_allocated(zm_0_buffer)) then
484 call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
485 call accel_write_buffer(zm_0_buffer, m_z0)
486 end if
487 if(.not. accel_buffer_is_allocated(zm_1_buffer)) then
488 call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
489 call accel_write_buffer(zm_1_buffer, m_z1)
490 end if
491 if(.not. accel_buffer_is_allocated(dm_0_buffer)) then
492 call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
493 call accel_write_buffer(dm_0_buffer, m_zero)
494 end if
495 if(.not. accel_buffer_is_allocated(dm_1_buffer)) then
496 call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
497 call accel_write_buffer(dm_1_buffer, m_one)
498 end if
499
500
501 !%Variable AccelBenchmark
502 !%Type logical
503 !%Default no
504 !%Section Execution::Accel
505 !%Description
506 !% If this variable is set to yes, Octopus will run some
507 !% routines to benchmark the performance of the accelerator device.
508 !%End
509 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
510
511 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
512
  ! NOTE(review): the benchmark call inside this branch is elided in this listing
513 if (run_benchmark) then
515 end if
516
517 !%Variable GPUAwareMPI
518 !%Type logical
519 !%Section Execution::Accel
520 !%Description
521 !% If Octopus was compiled with GPU support and MPI support and if the MPI
522 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
523 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
524 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
525 !% going through the host memory.
526 !% The default is false, except when the configure switch --enable-cudampi is set, in which
527 !% case this variable is set to true.
528 !%End
529#ifdef HAVE_CUDA_MPI
530 default = .true.
531#else
532 default = .false.
533#endif
534 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
535 if (accel%cuda_mpi) then
  ! warn (but proceed) when GPU-aware MPI is requested without detected support
536#ifndef HAVE_CUDA_MPI
537 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
538 call messages_warning()
539#endif
540 call messages_write("Using GPU-aware MPI.")
541 call messages_info()
542 end if
543
544
545 !%Variable AllowCPUonly
546 !%Type logical
547 !%Section Execution::Accel
548 !%Description
549 !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
550 !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
551 !% code execution also in these cases.
552 !%End
553#if defined (HAVE_ACCEL)
554 default = .false.
555#else
556 default = .true.
557#endif
558 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
559
560
561 !%Variable InitializeGPUBuffers
562 !%Type integer
563 !%Default no
564 !%Section Execution::Accel
565 !%Description
566 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
567 !%Option no 0
568 !% Do not initialize GPU buffers.
569 !%Option yes 1
570 !% Initialize GPU buffers to zero.
571 !%Option nan 2
572 !% Initialize GPU buffers to nan.
573 !%End
574 call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
575 if (.not. varinfo_valid_option('InitializeGPUBuffers', accel%initialize_buffers)) then
576 call messages_input_error(namespace, 'InitializeGPUBuffers')
577 end if
578
579
580 call messages_print_with_emphasis(namespace=namespace)
581
582 pop_sub(accel_init)
583
584 contains
585
  !> Print framework, vendor, name, capability and memory limits of the
  !! selected device (called on the root rank only).
586 subroutine device_info()
587#ifdef HAVE_CUDA
588 integer :: version
589 character(kind=c_char) :: cval_str(257)
590#endif
  ! NOTE(review): without HAVE_CUDA, major/minor and val_str appear to be
  ! used below without being set — presumably this path is never reached
  ! in such builds; confirm against the full source.
591 integer :: major, minor
592 character(len=256) :: val_str
593
594 push_sub(accel_init.device_info)
595
596 call messages_new_line()
597 call messages_write('Selected device:')
598 call messages_new_line()
599
600#ifdef HAVE_CUDA
601#ifdef __HIP_PLATFORM_AMD__
602 call messages_write(' Framework : ROCm')
603#else
604 call messages_write(' Framework : CUDA')
605#endif
606#endif
607 call messages_info()
608
609#ifdef HAVE_CUDA
610 call messages_write(' Device type : GPU', new_line = .true.)
611#ifdef __HIP_PLATFORM_AMD__
612 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
613#else
614 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
615#endif
616#endif
617
618#ifdef HAVE_CUDA
619 cval_str = c_null_char
620 call cuda_device_name(accel%device%cuda_device, cval_str)
621 call string_c_to_f(cval_str, val_str)
622#endif
623 call messages_write(' Device name : '//trim(val_str))
624 call messages_new_line()
625
626#ifdef HAVE_CUDA
627 call cuda_device_capability(accel%device%cuda_device, major, minor)
628#endif
629 call messages_write(' Cuda capabilities :')
630 call messages_write(major, fmt = '(i2)')
631 call messages_write('.')
632 call messages_write(minor, fmt = '(i1)')
633 call messages_new_line()
634
635 ! VERSION
636#ifdef HAVE_CUDA
637 call cuda_driver_version(version)
638 call messages_write(' Driver version : ')
639 call messages_write(version)
640#endif
641 call messages_new_line()
642
643
644 call messages_write(' Device memory :')
645 call messages_write(accel%global_memory_size, units=unit_megabytes)
646 call messages_new_line()
647
648 call messages_write(' Shared memory :')
649 call messages_write(accel%shared_memory_size, units=unit_kilobytes)
650 call messages_new_line()
651
652 call messages_write(' Max. block size :')
653 call messages_write(accel%max_block_size)
654 call messages_new_line()
655
656 call messages_info()
657
658 pop_sub(accel_init.device_info)
659 end subroutine device_info
660
661 end subroutine accel_init
662
663 ! ------------------------------------------
  !> Shut down the acceleration subsystem: free the global constant buffers,
  !! drain and report the allocation cache, tear down cuBLAS and the device
  !! context, and abort with a fatal error if any buffers leaked.
664 subroutine accel_end(namespace)
665 type(namespace_t), intent(in) :: namespace
666
667 integer(int64) :: hits, misses
668 real(real64) :: volume_hits, volume_misses
669 logical :: found
670 type(accel_mem_t) :: tmp
671
672 push_sub(accel_end)
673
674 if (accel_is_enabled()) then
675
676 ! Release global buffers
677 call accel_free_buffer(zm_0_buffer)
678 call accel_free_buffer(zm_1_buffer)
679 call accel_free_buffer(dm_0_buffer)
680 call accel_free_buffer(dm_1_buffer)
681
  ! drain every cached allocation and hand it back to the device
682 do
683 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
684 if (.not. found) exit
685
686#ifdef HAVE_CUDA
687 call cuda_mem_free(tmp%mem)
688#endif
689 end do
690
691 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
692
  ! report cache statistics: totals plus hit ratios by count and by volume
693 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
694
695 call messages_new_line()
696 call messages_write(' Number of allocations =')
697 call messages_write(hits + misses, new_line = .true.)
698 call messages_write(' Volume of allocations =')
699 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
700 new_line = .true.)
701 call messages_write(' Hit ratio =')
702 if (hits + misses > 0) then
703 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
704 else
705 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
706 end if
707 call messages_write('%', new_line = .true.)
708 call messages_write(' Volume hit ratio =')
709 if (volume_hits + volume_misses > 0) then
710 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
711 else
712 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
713 end if
714 call messages_write('%')
715 call messages_new_line()
716 call messages_info()
717
718 call messages_print_with_emphasis(namespace=namespace)
719 end if
720
722
723 if (accel_is_enabled()) then
724#ifdef HAVE_CUDA
725 call cublas_end(accel%cublas_handle)
726 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
727 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
728 end if
729#endif
730
  ! a non-zero live-buffer count at shutdown is treated as a programming error
731 if (buffer_alloc_count /= 0) then
732 call messages_write('Accel:')
733 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
734 call messages_write(' in ')
735 call messages_write(buffer_alloc_count)
736 call messages_write(' buffers were not deallocated.')
737 call messages_fatal()
738 end if
739
740 end if
741
742 pop_sub(accel_end)
743 end subroutine accel_end
744
745 ! ------------------------------------------
746
748 subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
749 integer(int64), intent(in) :: n(:)
750 integer(int64), intent(in) :: blocksizes(:)
751 integer(int64), intent(out) :: gridsizes(:)
752
753 integer :: dim, i
754
755 dim = ubound(n, dim=1)
756 assert(dim == ubound(blocksizes, dim=1))
757 assert(dim == ubound(gridsizes, dim=1))
758
759 do i = 1, dim
760 gridsizes(i) = (n(i) + blocksizes(i) - 1_int64) / blocksizes(i)
761 gridsizes(i) = min(gridsizes(i), accel%max_grid_dim(i))
762 end do
763 end subroutine accel_grid_size_array_i8
764
765 ! ------------------------------------------
766
768 subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
769 integer, intent(in) :: n(:)
770 integer, intent(in) :: blocksizes(:)
771 integer, intent(out) :: gridsizes(:)
772
773 integer(int64) :: gridsizes64(size(gridsizes))
774
775 call accel_grid_size_array_i8(int(n(:), int64), int(blocksizes(:), int64), gridsizes64)
776
777 gridsizes = int(gridsizes64, int32)
778 end subroutine accel_grid_size_array_i4
779
780 ! ------------------------------------------
781
783 subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
784 integer(int64), intent(in) :: n
785 integer(int64), intent(in) :: blocksizes
786 integer(int64), intent(out) :: gridsizes
787
788 integer(int64) :: temp(1)
790 call accel_grid_size_array_i8( (/n/), (/blocksizes/), temp)
791
792 gridsizes = temp(1)
793 end subroutine accel_grid_size_i8
794
795 ! ------------------------------------------
796
798 subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
799 integer, intent(in) :: n
800 integer, intent(in) :: blocksizes
801 integer, intent(out) :: gridsizes
802
803 integer(int64) :: temp(1)
804
805 call accel_grid_size_array_i8(int((/n/), int64), int((/blocksizes/), int64), temp)
806
807 gridsizes = int(temp(1), int32)
808 end subroutine accel_grid_size_i4
810! ------------------------------------------
811
815 subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
816 integer(int64), intent(in) :: n
817 integer(int64), intent(in) :: pack_size
818 integer(int64), dimension(3), intent(out) :: gridsizes
819 integer(int64), dimension(3), intent(out) :: blocksizes
820 type(accel_kernel_t), optional, intent(inout) :: kernel
821
822 integer(int64) :: bsize, dim2, dim3
823 integer(int64), dimension(3) :: nn
825 if(present(kernel)) then
826 bsize = accel_kernel_block_size(kernel)/pack_size
827 else
828 bsize = accel_max_block_size()/pack_size
829 end if
830
831 dim3 = n/(accel_max_size_per_dim(2)*bsize) + 1
832 dim2 = min(accel_max_size_per_dim(2)*bsize, pad(n, bsize))
833
834 nn = (/pack_size, dim2, dim3/)
835 blocksizes = (/pack_size, bsize, 1_int64/)
836
837 call accel_grid_size(nn, blocksizes, gridsizes)
838 end subroutine accel_grid_size_extend_dim_i8
840 ! ------------------------------------------
841
845 subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
846 integer, intent(in) :: n
847 integer, intent(in) :: pack_size
848 integer, dimension(3), intent(out) :: gridsizes
849 integer, dimension(3), intent(out) :: blocksizes
850 type(accel_kernel_t), optional, intent(inout) :: kernel
851
852 integer(int64) :: gridsizes64(3), blocksizes64(3)
853
854 call accel_grid_size_extend_dim_i8(int(n, int64), int(pack_size, int64), &
855 gridsizes64, blocksizes64, kernel=kernel)
857 gridsizes = int(gridsizes64, int32)
858 blocksizes = int(blocksizes64, int32)
859 end subroutine accel_grid_size_extend_dim_i4
860
861 ! ------------------------------------------
862
863 integer(int64) function accel_padded_size_i8(nn) result(psize)
864 integer(int64), intent(in) :: nn
865
866 integer(int64) :: modnn, bsize
867
868 psize = nn
869
870 if (accel_is_enabled()) then
871
872 bsize = accel_max_block_size()
873
874 psize = nn
875 modnn = mod(nn, bsize)
876 if (modnn /= 0) psize = psize + bsize - modnn
877
878 end if
879
880 end function accel_padded_size_i8
881
882 ! ------------------------------------------
883
884 integer(int32) function accel_padded_size_i4(nn) result(psize)
885 integer(int32), intent(in) :: nn
887 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
888
889 end function accel_padded_size_i4
890
891 ! ------------------------------------------
892
893 subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
894 type(accel_mem_t), intent(inout) :: this
895 integer, intent(in) :: flags
896 type(type_t), intent(in) :: type
897 integer, intent(in) :: size
898 logical, optional, intent(in) :: set_zero
899 logical, optional, intent(in) :: async
900
901 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero, async)
902 end subroutine accel_create_buffer_4
903
904 ! ------------------------------------------
905
906 subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
907 type(accel_mem_t), intent(inout) :: this
908 integer, intent(in) :: flags
909 type(type_t), intent(in) :: type
910 integer(int64), intent(in) :: size
911 logical, optional, intent(in) :: set_zero
912 logical, optional, intent(in) :: async
913
914 integer(int64) :: fsize
915 logical :: found
916 integer(int64) :: initialize_buffers
917
918 push_sub(accel_create_buffer_8)
919
920 this%type = type
921 this%size = size
922 this%flags = flags
923 fsize = int(size, int64)*types_get_size(type)
924 this%allocated = .true.
926 if (fsize > 0) then
927
928 call alloc_cache_get(memcache, fsize, found, this%mem)
929
930 if (.not. found) then
931#ifdef HAVE_CUDA
932 if(optional_default(async, .false.)) then
933 call cuda_mem_alloc_async(this%mem, fsize)
934 else
935 call cuda_mem_alloc(this%mem, fsize)
936 end if
937#endif
938 end if
939
940 buffer_alloc_count = buffer_alloc_count + 1
941 allocated_mem = allocated_mem + fsize
942
943 end if
944
945 if (present(set_zero)) then
946 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
947 else
948 initialize_buffers = accel%initialize_buffers
949 end if
950 select case (initialize_buffers)
951 case (option__initializegpubuffers__yes)
952 call accel_set_buffer_to(this, type, int(z'00', int8), size)
953 case (option__initializegpubuffers__nan)
954 call accel_set_buffer_to(this, type, int(z'FF', int8), size)
955 end select
956
957 pop_sub(accel_create_buffer_8)
958 end subroutine accel_create_buffer_8
959
960 ! ------------------------------------------
961
962 subroutine accel_free_buffer(this, async)
963 type(accel_mem_t), intent(inout) :: this
964 logical, optional, intent(in) :: async
965
966 logical :: put
967 integer(int64) :: fsize
968
969 push_sub(accel_free_buffer)
970
971 if (this%size > 0) then
972
973 fsize = int(this%size, int64)*types_get_size(this%type)
974
975 call alloc_cache_put(memcache, fsize, this%mem, put)
976
977 if (.not. put) then
978#ifdef HAVE_CUDA
979 if (optional_default(async, .false.)) then
980 call cuda_mem_free_async(this%mem)
981 else
982 call cuda_mem_free(this%mem)
983 end if
984#endif
985 end if
986
987 buffer_alloc_count = buffer_alloc_count - 1
988 allocated_mem = allocated_mem + fsize
989
990 end if
991
992 this%size = 0
993 this%flags = 0
994
995 this%allocated = .false.
996
997 pop_sub(accel_free_buffer)
998 end subroutine accel_free_buffer
999
1000 ! ------------------------------------------------------
1001
1002 ! Check if the temporary buffers are the right size, if not reallocate them
  !> Make sure 'buffer' can hold at least required_size elements: free it if
  !! it is allocated but too small, then (re)allocate it when needed.
  !! A buffer that is already large enough is left untouched.
  ! NOTE(review): the matching pop_sub appears to be on a line elided from
  ! this listing (gap in the embedded numbering before 'end subroutine').
1003 subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
1004 type(accel_mem_t), intent(inout) :: buffer
1005 integer, intent(in) :: flags
1006 type(type_t), intent(in) :: type
1007 integer, intent(in) :: required_size
1008 logical, intent(in) :: set_zero
1009 logical, optional, intent(in) :: async
1010
1011 push_sub(accel_ensure_buffer_size)
1012
1013
  ! discard an existing allocation that is too small
1014 if (accel_buffer_is_allocated(buffer) .and. buffer%size < required_size) then
1015 call accel_free_buffer(buffer, async=optional_default(async, .false.))
1016 end if
1017
1018 if (.not. accel_buffer_is_allocated(buffer)) then
1019 call accel_create_buffer(buffer, flags, type, required_size, set_zero=set_zero, async=optional_default(async, .false.))
1020 end if
1021
1023 end subroutine accel_ensure_buffer_size
1024
1025 ! ------------------------------------------
1026
1027 logical pure function accel_buffer_is_allocated(this) result(allocated)
1028 type(accel_mem_t), intent(in) :: this
1029
1030 allocated = this%allocated
1031 end function accel_buffer_is_allocated
1032
1033 ! -----------------------------------------
1034
  !> Wait for outstanding device work to complete (no-op when acceleration
  !! is disabled).
  ! NOTE(review): the call inside the #ifdef block is on a line elided from
  ! this listing — presumably a CUDA synchronization call; confirm against
  ! the full source.
1035 subroutine accel_finish()
1036 ! no push_sub, called too frequently
1037
1038 if (accel_is_enabled()) then
1039#ifdef HAVE_CUDA
1041#endif
1042 end if
1043 end subroutine accel_finish
1045 ! ------------------------------------------
1046
  !> Bind a device buffer as argument number 'narg' of the given kernel.
  !! The buffer must already be allocated.
1047 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1048 type(accel_kernel_t), intent(inout) :: kernel
1049 integer, intent(in) :: narg
1050 type(accel_mem_t), intent(in) :: buffer
1051
1052 assert(accel_buffer_is_allocated(buffer))
1053
1054 ! no push_sub, called too frequently
1055#ifdef HAVE_CUDA
1056 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1057#endif
1058
1059 end subroutine accel_set_kernel_arg_buffer
1060
1061 ! ------------------------------------------
1062
1069 subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
     !! Launch `kernel` with the given grid and block sizes (64-bit version).
     !! Missing trailing dimensions are padded with 1; the launch is skipped
     !! entirely when any grid dimension is zero. The optional
     !! `shared_memory_size` (bytes) is validated against the device limit.
1070 type(accel_kernel_t), intent(inout) :: kernel
1071 integer(int64), intent(in) :: gridsizes(:)
1072 integer(int64), intent(in) :: blocksizes(:)
1073 integer(int64), optional, intent(in) :: shared_memory_size
1074
1075 integer :: dim
1076 integer(int64) :: gsizes(1:3)
1077 integer(int64) :: bsizes(1:3)
1078
1079 ! no push_sub, called too frequently
1080
1081 ! CUDA needs all dimensions
1082 gsizes = 1
1083 bsizes = 1
1084
1085 dim = ubound(gridsizes, dim=1)
1086
     ! grid and block size arrays must have the same rank (at most 3)
1087 assert(dim == ubound(blocksizes, dim=1))
1089 ! if one size is zero, there is nothing to do
1090 if (any(gridsizes == 0)) return
1091
1092 assert(all(blocksizes > 0))
1093
1094 gsizes(1:dim) = gridsizes(1:dim)
1095 bsizes(1:dim) = blocksizes(1:dim)
1096
1097#ifdef HAVE_CUDA
1098 ! Maximum dimension of a block
1099 if (any(bsizes(1:3) > accel%max_block_dim(1:3))) then
1100 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1101 message(2) = "The following conditions should be fulfilled:"
1102 write(message(3), "(A, I8, A, I8)") "Dim 1: ", bsizes(1), " <= ", accel%max_block_dim(1)
1103 write(message(4), "(A, I8, A, I8)") "Dim 2: ", bsizes(2), " <= ", accel%max_block_dim(2)
1104 write(message(5), "(A, I8, A, I8)") "Dim 3: ", bsizes(3), " <= ", accel%max_block_dim(3)
1105 message(6) = "This is an internal error, please contact the developers."
1106 call messages_fatal(6)
1107 end if
1108
1109
1110 ! Maximum number of threads per block
1111 if (product(bsizes) > accel_max_block_size()) then
1112 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1113 message(2) = "The following condition should be fulfilled:"
1114 write(message(3), "(I8, A, I8)") product(bsizes), " <= ", accel_max_block_size()
1115 message(4) = "This is an internal error, please contact the developers."
1116 call messages_fatal(4)
1117 end if
1118
1119 ! Maximum dimensions of the grid of thread block
1120 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1121 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1122 message(2) = "The following conditions should be fulfilled:"
1123 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1124 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1125 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1126 message(6) = "This is an internal error, please contact the developers."
1127 call messages_fatal(6)
1128 end if
1129
1130 if(present(shared_memory_size)) then
1131
     ! requested dynamic shared memory must be positive and fit the device limit
1132 if (shared_memory_size > accel%shared_memory_size) then
1133 message(1) = "Shared memory too large in kernel "//trim(kernel%kernel_name)
1134 message(2) = "The following condition should be fulfilled:"
1135 message(3) = "Requested shared memory <= Available shared memory"
1136 write(message(4), '(a,f12.6,a)') "Requested shared memory: ", real(shared_memory_size, real64) /1024.0, " Kb"
1137 write(message(5), '(a,f12.6,a)') "Available shared memory: ", real(accel%shared_memory_size, real64) /1024.0, " Kb"
1138 message(6) = "This is an internal error, please contact the developers."
1139 call messages_fatal(6)
1140 else if (shared_memory_size <= 0) then
1141 message(1) = "Invalid shared memory size in kernel "//trim(kernel%kernel_name)
1142 write(message(2), '(a,f12.6,a)') "Shared memory size requested: ", real(shared_memory_size, real64) /1024.0, " Kb"
1143 message(3) = "This is an internal error, please contact the developers."
1144 call messages_fatal(3)
1145 end if
1146 end if
1147
     ! gsizes/bsizes are contiguous, so passing the first element hands the
     ! whole 3-vector to the C launch wrapper
1148 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), bsizes(1), &
1149 optional_default(shared_memory_size, 0_int64), kernel%arguments)
1150#endif
1151
1152 end subroutine accel_kernel_run_8
1153
1154 ! -----------------------------------------------
1155
1161
!> Run a kernel with 4-byte integer sizes.
!!
!! Thin wrapper over accel_kernel_run_8: widens the 32-bit grid and block
!! sizes to int64 and forwards all arguments unchanged.
subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
  type(accel_kernel_t),     intent(inout) :: kernel
  integer,                  intent(in)    :: gridsizes(:)
  integer,                  intent(in)    :: blocksizes(:)
  integer(int64), optional, intent(in)    :: shared_memory_size

  integer(int64) :: grid64(size(gridsizes))
  integer(int64) :: block64(size(blocksizes))

  grid64  = int(gridsizes, int64)
  block64 = int(blocksizes, int64)

  call accel_kernel_run_8(kernel, grid64, block64, shared_memory_size)

end subroutine accel_kernel_run_4
1171
1172 ! -----------------------------------------------
1173
!> Largest number of threads per block supported by the device in use.
integer pure function accel_max_block_size() result(nthreads)
  nthreads = accel%max_block_size
end function accel_max_block_size
1177
1178 ! -----------------------------------------------
!> Suggest a number of threads per block for launching `kernel`.
!!
!! Returns 256 (the CUDA best-practice default), capped by the maximum
!! number of threads this particular kernel can use. Returns 0 when the
!! code is built without GPU support.
integer function accel_kernel_block_size(kernel) result(block_size)
  type(accel_kernel_t), intent(inout) :: kernel

#ifdef HAVE_CUDA
  integer :: max_block_size
#endif

  block_size = 0

#ifdef HAVE_CUDA
  call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_block_size)
  if (debug%info .and. max_block_size /= accel%max_block_size) then
    ! Bug fix: the original format "(A, I5, A)" described only the first three
    ! of the six items in this I/O list; format reversion would then attempt to
    ! start a new record in the scalar internal file message(1), which fails at
    ! run time. The format now matches all six items.
    write(message(1), "(A, I5, A, A, I5, A)") "A kernel can use only less threads per block (", max_block_size, ")", &
      "than available on the device (", accel%max_block_size, ")"
    call messages_info(1)
  end if

  ! recommended number of threads per block is 256 according to the CUDA best practice guide
  ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
  block_size = 256

  ! make sure we do not use more threads per block than available for this kernel
  block_size = min(block_size, max_block_size)
#endif

end function accel_kernel_block_size
1206
1207 ! ----------------------------------------------------
1208
1209 subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
     !! Fill `nval` elements of `buffer` (elements of size given by `type`)
     !! with the byte value `val`, optionally starting at element `offset`.
     !! Blocks until the device operation completes unless async = .true.
1210 type(accel_mem_t), intent(inout) :: buffer
1211 type(type_t), intent(in) :: type
1212 integer(int8), intent(in) :: val
1213 integer(int64), intent(in) :: nval
1214 integer(int64), optional, intent(in) :: offset
1215 logical, optional, intent(in) :: async
1216
1217 integer(int64) :: nval_, offset_, type_size
1218
1219 push_sub(accel_set_buffer_to)
     ! nothing to do for an empty fill
1221 if (nval == 0) then
1222 pop_sub(accel_set_buffer_to)
1223 return
1224 end if
1225 assert(nval > 0)
1226
1227 if (present(offset)) then
1228 assert(offset >= 0)
     ! an offset beyond the end of the buffer is silently ignored
1229 if(offset > buffer%size) then
1230 pop_sub(accel_set_buffer_to)
1231 return
1232 end if
1233 end if
1234
     ! convert element counts to byte counts for the memset call
1235 type_size = types_get_size(type)
1237 nval_ = nval*type_size
1239 offset_ = 0_int64
1240 if (present(offset)) offset_ = offset*type_size
1241
     ! NOTE(review): unlike most routines in this module, this call is not
     ! guarded by #ifdef HAVE_CUDA; presumably a stub exists for non-CUDA
     ! builds -- confirm against cuda_oct_m.
1242 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1243 if(.not. optional_default(async, .false.)) call accel_finish()
1244
1245 pop_sub(accel_set_buffer_to)
1246 end subroutine accel_set_buffer_to
1247
1248 ! ----------------------------------------------------
1249
1250 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
     !! Zero `nval` elements of `buffer` by filling them with the byte 0x00
     !! (64-bit count version); delegates to accel_set_buffer_to.
1251 type(accel_mem_t), intent(inout) :: buffer
1252 type(type_t), intent(in) :: type
1253 integer(int64), intent(in) :: nval
1254 integer(int64), optional, intent(in) :: offset
1255 logical, optional, intent(in) :: async
1258
1259 call accel_set_buffer_to(buffer, type, int(z'00', int8), nval, offset, async)
1260
     ! NOTE(review): original lines 1256-1257 and 1261 are not visible in
     ! this rendering -- confirm against the full accel.F90.
1262 end subroutine accel_set_buffer_to_zero_i8
1263
1264 ! ----------------------------------------------------
1265
1266 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
     !! Zero `nval` elements of `buffer` (32-bit count version); widens the
     !! arguments to int64 and delegates to accel_set_buffer_to_zero_i8.
1267 type(accel_mem_t), intent(inout) :: buffer
1268 type(type_t), intent(in) :: type
1269 integer(int32), intent(in) :: nval
1270 integer(int32), optional, intent(in) :: offset
1271 logical, optional, intent(in) :: async
1272
1274
     ! an absent optional must not be converted, hence the two call forms
1275 if (present(offset)) then
1276 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1277 else
1278 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1279 end if
1280
1282 end subroutine accel_set_buffer_to_zero_i4
1283
1284 ! ----------------------------------------------------
1285
1286 subroutine accel_check_bandwidth()
     !! Benchmark host<->device transfer bandwidth and print a table.
     !! Buffer sizes double from 15000 real64 elements up to ~5e7; each size
     !! is written and read `times` times and the average rate reported.
1287 integer :: itime
1288 integer, parameter :: times = 10
1289 integer :: size
1290 real(real64) :: time, stime
1291 real(real64) :: read_bw, write_bw
1292 type(accel_mem_t) :: buff
1293 real(real64), allocatable :: data(:)
1294
1295 call messages_new_line()
1296 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1297 call messages_new_line()
1298 call messages_info()
1299
1300 call messages_write(' Buffer size Read bw Write bw')
1301 call messages_new_line()
1302 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1303 call messages_info()
1304
1305 size = 15000
1306 do
     ! `data` is deliberately left uninitialized; only transfer speed matters
1307 safe_allocate(data(1:size))
1308 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1309
     ! host -> device: synchronize after every transfer
1310 stime = loct_clock()
1311 do itime = 1, times
1312 call accel_write_buffer(buff, size, data)
1313 call accel_finish()
1314 end do
1315 time = (loct_clock() - stime)/real(times, real64)
1316
     ! 8 bytes per real64 element
1317 write_bw = real(size, real64) *8.0_real64/time
1318
     ! device -> host: note accel_finish is called once AFTER the loop here,
     ! unlike the write loop above -- presumably reads block by default;
     ! confirm if the two timings are meant to be symmetric.
1319 stime = loct_clock()
1320 do itime = 1, times
1321 call accel_read_buffer(buff, size, data)
1322 end do
1323 call accel_finish()
1324
1325 time = (loct_clock() - stime)/real(times, real64)
1326 read_bw = real(size, real64) *8.0_real64/time
1327
1328 call messages_write(size*8.0_real64/1024.0_real64**2)
1329 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1330 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1331 call messages_info()
1332
1333 call accel_free_buffer(buff)
1334
1335 safe_deallocate_a(data)
1336
1337 size = int(size*2.0)
1338
1339 if (size > 50000000) exit
1340 end do
1341 end subroutine accel_check_bandwidth
1342
1343 !------------------------------------------------------------
1344
1345 subroutine accel_kernel_global_init()
     !! Initialize the global kernel registry: empty the linked list of
     !! registered kernels and set up the CUDA module cache.
1346
1347 push_sub(accel_kernel_global_init)
1348
1349 nullify(head)
1350
1351 call cuda_module_map_init(accel%module_map)
1352
     ! NOTE(review): original line 1353 (presumably pop_sub) is not visible
     ! in this rendering -- confirm against the full accel.F90.
1354 end subroutine accel_kernel_global_init
1355
1356 !------------------------------------------------------------
1357
1358 subroutine accel_kernel_global_end()
     !! Tear down the global kernel registry: walk the linked list of
     !! registered kernels and release the CUDA module cache.
1359 type(accel_kernel_t), pointer :: next_head
1360
1361 push_sub(accel_kernel_global_end)
1362
1363 do
1364 if (.not. associated(head)) exit
1365 next_head => head%next
     ! NOTE(review): original line 1366 (presumably the call that finalizes
     ! the current list element, e.g. accel_kernel_end(head)) is not visible
     ! in this rendering -- confirm against the full accel.F90.
1367 head => next_head
1368 end do
1369
1370 if (accel_is_enabled()) then
1371 call cuda_module_map_end(accel%module_map)
1372 end if
1375 end subroutine accel_kernel_global_end
1376
1377 !------------------------------------------------------------
1378
1379 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
     !! Build (or fetch from the module cache) the CUDA module compiled from
     !! `file_name`, create the kernel object named `kernel_name` from it,
     !! and allocate its argument array. Extra compiler `flags` are appended
     !! to the default include path and debug flags. Compilation time is
     !! accounted under the "ACCEL_COMPILE" profiling region.
1380 type(accel_kernel_t), intent(inout) :: this
1381 character(len=*), intent(in) :: file_name
1382 character(len=*), intent(in) :: kernel_name
1383 character(len=*), optional, intent(in) :: flags
1384
1385#ifdef HAVE_CUDA
1386 character(len=1000) :: all_flags
1387#endif
1388
1389 push_sub(accel_kernel_build)
1390
1391 call profiling_in("ACCEL_COMPILE", exclude = .true.)
1392
1393#ifdef HAVE_CUDA
     ! default flags: the installed kernel sources dir plus the debug flag
1394 all_flags = '-I'//trim(conf%share)//'/kernels/'//" "//trim(accel%debug_flag)
1395
1396 if (present(flags)) then
1397 all_flags = trim(all_flags)//' '//trim(flags)
1398 end if
1399
1400 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
1401 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1402
1403 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1404 call cuda_alloc_arg_array(this%arguments)
1405#endif
1406
1407 this%initialized = .true.
1408 this%kernel_name = trim(kernel_name)
1409
1410 call profiling_out("ACCEL_COMPILE")
1411
1412 pop_sub(accel_kernel_build)
1413 end subroutine accel_kernel_build
1414
1415 !------------------------------------------------------------
1416
1417 subroutine accel_kernel_end(this)
     !! Release the device resources of a single kernel (argument array and
     !! kernel object) and mark it uninitialized.
1418 type(accel_kernel_t), intent(inout) :: this
1419
1420 push_sub(accel_kernel_end)
1421
1422#ifdef HAVE_CUDA
1423 call cuda_free_arg_array(this%arguments)
1424 call cuda_release_kernel(this%cuda_kernel)
1425 ! modules are not released here, since they are not associated to a kernel
1426#endif
1427
1428 this%initialized = .false.
1429
1430 pop_sub(accel_kernel_end)
1431 end subroutine accel_kernel_end
1432
1433 !------------------------------------------------------------
1434
1435 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
     !! Lazily build `this` kernel on first use and register it in the
     !! module-global linked list (`head`) so accel_kernel_global_end can
     !! clean it up. Subsequent calls are no-ops.
1436 type(accel_kernel_t), target, intent(inout) :: this
1437 character(len=*), intent(in) :: file_name
1438 character(len=*), intent(in) :: kernel_name
1439 character(len=*), optional, intent(in) :: flags
1440
1441 push_sub(accel_kernel_start_call)
1442
1443 if (.not. this%initialized) then
1444 call accel_kernel_build(this, file_name, kernel_name, flags)
     ! push onto the global cleanup list
1445 this%next => head
1446 head => this
1447 end if
1448
     ! NOTE(review): original line 1449 (presumably pop_sub) is not visible
     ! in this rendering -- confirm against the full accel.F90.
1450 end subroutine accel_kernel_start_call
1451
1452 !--------------------------------------------------------------
1453
!> Total global memory of the accelerator device, in bytes.
integer(int64) pure function accel_global_memory_size() result(nbytes)
  nbytes = accel%global_memory_size
end function accel_global_memory_size
1459
1460 !--------------------------------------------------------------
1461
!> Shared (on-chip) memory available per block on the device, in bytes.
integer(int64) pure function accel_shared_memory_size() result(nbytes)
  nbytes = accel%shared_memory_size
end function accel_shared_memory_size
1467 !--------------------------------------------------------------
1468
!> Maximum supported problem size along grid dimension `dim`.
!! Dimension 1 allows up to 2**30 elements; higher dimensions 32768.
!! Returns 0 when the code is built without GPU support.
integer pure function accel_max_size_per_dim(dim) result(max_size)
  integer, intent(in) :: dim

#ifdef HAVE_CUDA
  if (dim == 1) then
    max_size = 2**30
  else
    max_size = 32768
  end if
#else
  max_size = 0
#endif
end function accel_max_size_per_dim
1478
1479 ! ------------------------------------------------------
1480
1481 subroutine accel_set_stream(stream_number)
     !! Make stream `stream_number` the current CUDA stream and point the
     !! cuBLAS handle at it, so subsequent kernel and BLAS calls use it.
     !! No-op when acceleration is disabled.
1482 integer, intent(in) :: stream_number
1483
1484 push_sub(accel_set_stream)
1485
1486 if (accel_is_enabled()) then
1487#ifdef HAVE_CUDA
1488 call cuda_set_stream(accel%cuda_stream, stream_number)
1489 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
1490#endif
1491 end if
1492
1493 pop_sub(accel_set_stream)
1494 end subroutine accel_set_stream
1495
1496 ! ------------------------------------------------------
1498 subroutine accel_get_stream(stream_number)
     !! Return the number of the currently active CUDA stream.
     !! `stream_number` is intent(inout) because it is left unchanged when
     !! acceleration is disabled or CUDA support is not compiled in.
1499 integer, intent(inout) :: stream_number
1500
1501 push_sub(accel_get_stream)
1502
1503 if (accel_is_enabled()) then
1504#ifdef HAVE_CUDA
1505 call cuda_get_stream(stream_number)
1506#endif
1507 end if
1508
1509 pop_sub(accel_get_stream)
1510 end subroutine accel_get_stream
1511
1512 ! ------------------------------------------------------
1513
1516
     !! Block until all CUDA streams have finished their queued work.
     ! NOTE(review): the subroutine header (original lines 1514-1515,
     ! `subroutine accel_synchronize_all_streams()`) and line 1523
     ! (presumably pop_sub) are not visible in this rendering -- confirm
     ! against the full accel.F90.
1517 if (accel_is_enabled()) then
1518#ifdef HAVE_CUDA
1519 call cuda_synchronize_all_streams()
1520#endif
1521 end if
1522
1524 end subroutine accel_synchronize_all_streams
1526 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
     !! Return a device pointer advanced by `offset` elements from `buffer`.
     ! NOTE(review): assumes cuda_get_pointer_with_offset counts in
     ! real64-sized units (the complex variant below doubles the offset) --
     ! confirm against the CUDA wrapper.
1527 type(c_ptr), intent(in) :: buffer
1528 integer(int64), intent(in) :: offset
1529 type(c_ptr) :: buffer_offset
1530
1532#ifdef HAVE_CUDA
1533 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1534#else
1535 ! this is needed to make the compiler happy for non-GPU compilations
1536 buffer_offset = buffer
1537#endif
     ! NOTE(review): the closing lines (original 1538-1539, presumably
     ! pop_sub and `end function`) are not visible in this rendering.
1540
1541 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
     !! Return a device pointer advanced by `offset` complex elements from
     !! `buffer`. The offset is doubled because one complex(real64) element
     !! occupies two real64-sized slots.
1542 type(c_ptr), intent(in) :: buffer
1543 integer(int64), intent(in) :: offset
1544 type(c_ptr) :: buffer_offset
1545
1547#ifdef HAVE_CUDA
1548 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1549#else
1550 ! this is needed to make the compiler happy for non-GPU compilations
1551 buffer_offset = buffer
1552#endif
     ! NOTE(review): the closing lines (original 1553-1554, presumably
     ! pop_sub and `end function`) are not visible in this rendering.
1555
1556 subroutine accel_clean_pointer(buffer)
     !! Release the bookkeeping associated with a device pointer previously
     !! obtained via the *_get_pointer_with_offset routines.
1557 type(c_ptr), intent(in) :: buffer
1558
1559 push_sub(accel_clean_pointer)
1560#ifdef HAVE_CUDA
1561 call cuda_clean_pointer(buffer)
1562#endif
1563 pop_sub(accel_clean_pointer)
1564 end subroutine accel_clean_pointer
1565
1569 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
     !! Compute grid/block sizes for kernels that unfold an array of `size`
     !! elements across warps: one block of `warp_size` threads per element
     !! on NVIDIA, a single block of `size` threads on AMD (HIP), where the
     !! unfolding is not beneficial.
1570 integer(int64), intent(in) :: size
1571 integer(int64), intent(out) :: grid_size
1572 integer(int64), intent(out) :: thread_block_size
1573
1574 push_sub(accel_get_unfolded_size)
1575#ifdef __HIP_PLATFORM_AMD__
1576 ! not benefitial for AMD chips
1577 grid_size = 1_int64
1578 thread_block_size = size
1579#else
1580 grid_size = size
1581 thread_block_size = accel%warp_size
1582#endif
     ! NOTE(review): original line 1583 (presumably pop_sub) is not visible
     ! in this rendering -- confirm against the full accel.F90.
1584 end subroutine accel_get_unfolded_size
1585
1586#include "undef.F90"
1587#include "real.F90"
1588#include "accel_inc.F90"
1589
1590#include "undef.F90"
1591#include "complex.F90"
1592#include "accel_inc.F90"
1593
1594#include "undef.F90"
1595#include "integer.F90"
1596#include "accel_inc.F90"
1597
1598#include "undef.F90"
1599#include "integer8.F90"
1600#include "accel_inc.F90"
1601
1602end module accel_oct_m
1603
1604!! Local Variables:
1605!! mode: f90
1606!! coding: utf-8
1607!! End:
subroutine device_info()
Definition: accel.F90:649
subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
Definition: accel.F90:840
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5850
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3637
subroutine, public accel_clean_pointer(buffer)
Definition: accel.F90:1513
subroutine accel_kernel_global_end()
Definition: accel.F90:1329
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2809
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
Definition: accel.F90:1526
subroutine laccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5650
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5405
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3873
pure logical function, public accel_allow_cpu_only()
Definition: accel.F90:411
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2451
subroutine zaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3459
subroutine daccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2169
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3546
subroutine laccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5429
type(accel_kernel_t), target, save, public dkernel_batch_dotp
Definition: accel.F90:282
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3682
subroutine daccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:1927
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5000
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1899
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5737
subroutine zaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3292
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3124
integer function, public accel_kernel_block_size(kernel)
Definition: accel.F90:1167
subroutine zaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2706
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2088
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4754
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3183
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5761
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4089
type(accel_kernel_t), target, save, public zkernel_batch_axpy
Definition: accel.F90:279
subroutine zaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3088
subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
Definition: accel.F90:790
subroutine laccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5245
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3214
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3854
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3709
subroutine laccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5078
subroutine, public accel_free_buffer(this, async)
Definition: accel.F90:1004
subroutine daccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:1738
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4012
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1392
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3904
subroutine zaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2859
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4804
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4494
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2955
subroutine, public accel_get_stream(stream_number)
Definition: accel.F90:1455
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
Definition: accel.F90:935
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3341
subroutine iaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3982
integer(int64) pure function, public accel_global_memory_size()
Definition: accel.F90:1411
subroutine daccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2143
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4950
type(accel_kernel_t), target, save, public zkernel_ax_function_py
Definition: accel.F90:281
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2069
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1683
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3530
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2564
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4309
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2542
integer(int64) function accel_padded_size_i8(nn)
Definition: accel.F90:905
subroutine accel_check_bandwidth()
Definition: accel.F90:1257
subroutine iaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4117
subroutine daccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2398
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:5315
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1624
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4777
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3322
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4709
subroutine, public accel_finish()
Definition: accel.F90:1077
subroutine laccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5213
subroutine accel_kernel_global_init()
Definition: accel.F90:1316
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3399
subroutine daccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2197
subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 4-byte integer sizes.
Definition: accel.F90:1149
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2759
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4641
subroutine laccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5684
subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for the kernels that relies on the batch size (pack_size) and the...
Definition: accel.F90:887
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5873
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
Definition: accel.F90:1045
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
Definition: accel.F90:1180
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:5721
subroutine iaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3954
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1664
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2227
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3164
subroutine iaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4149
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2119
subroutine laccel_write_buffer_single(this, data, async)
Definition: accel.F90:4897
subroutine laccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5483
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2475
subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 8-byte integer sizes.
Definition: accel.F90:1111
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4455
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2917
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2936
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
Definition: accel.F90:1221
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3614
logical pure function, public accel_buffer_is_allocated(this)
Definition: accel.F90:1069
integer, parameter, public accel_mem_read_write
Definition: accel.F90:184
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2587
subroutine accel_kernel_end(this)
Definition: accel.F90:1374
type(accel_kernel_t), target, save, public dkernel_ax_function_py
Definition: accel.F90:280
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5185
subroutine zaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3054
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3570
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1714
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5551
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1483
subroutine iaccel_write_buffer_single(this, data, async)
Definition: accel.F90:3801
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4665
integer pure function, public accel_max_size_per_dim(dim)
Definition: accel.F90:1426
subroutine zaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3238
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:4219
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2029
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4436
subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
Definition: accel.F90:810
subroutine iaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4554
subroutine laccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5024
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:5108
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2778
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:5513
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5828
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4687
subroutine zaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2833
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2994
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2435
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4259
subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for the kernels that relies on the batch size (pack_size) and the...
Definition: accel.F90:857
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
Definition: accel.F90:1350
subroutine, public accel_init(base_grp, namespace)
Definition: accel.F90:421
subroutine, public accel_end(namespace)
Definition: accel.F90:706
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:4910
subroutine daccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:1792
subroutine zaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3264
subroutine, public accel_synchronize_all_streams()
Definition: accel.F90:1471
subroutine, public accel_set_stream(stream_number)
Definition: accel.F90:1438
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:5374
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2614
subroutine iaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4522
subroutine iaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4333
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5805
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4417
subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
Definition: accel.F90:825
subroutine iaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4183
integer(int32) function accel_padded_size_i4(nn)
Definition: accel.F90:926
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
Definition: accel.F90:1237
subroutine daccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:1993
type(accel_kernel_t), target, save, public zkernel_batch_dotp
Definition: accel.F90:283
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:5127
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4732
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4050
pure logical function, public accel_is_enabled()
Definition: accel.F90:401
subroutine zaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3493
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1822
subroutine iaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3928
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1860
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4031
integer, parameter, public accel_mem_write_only
Definition: accel.F90:184
subroutine zaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3022
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2304
subroutine laccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5279
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:5532
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2246
subroutine laccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5618
subroutine daccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:1764
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2519
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5900
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4278
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5050
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4969
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:1959
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5146
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3592
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1498
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4588
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5590
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:1611
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1841
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2719
type(accel_t), public accel
Definition: accel.F90:249
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5783
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4387
integer(int64) pure function, public accel_shared_memory_size()
Definition: accel.F90:1419
integer pure function, public accel_max_block_size()
Definition: accel.F90:1161
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4625
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2265
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4359
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3814
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3659
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
Definition: accel.F90:948
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:5355
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2364
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2332
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:1089
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5455
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3360
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2887
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2497
type(accel_kernel_t), pointer head
Definition: accel.F90:395
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3427
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
Definition: global.F90:191
complex(real64), parameter, public m_z0
Definition: global.F90:201
complex(real64), parameter, public m_z1
Definition: global.F90:202
real(real64), parameter, public m_one
Definition: global.F90:192
System information (time, memory, sysname)
Definition: loct.F90:117
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
Definition: loct.F90:258
subroutine, public loct_sysname(name)
Definition: loct.F90:332
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:898
character(len=512), private msg
Definition: messages.F90:167
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:525
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1023
subroutine, public messages_new_line()
Definition: messages.F90:1112
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410
subroutine, public messages_input_error(namespace, var, details, row, column)
Definition: messages.F90:691
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:594
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_cmplx
Definition: types.F90:136
integer pure function, public types_get_size(this)
Definition: types.F90:154
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)