Octopus
accel.F90
Go to the documentation of this file.
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_CUDA)
22#define HAVE_ACCEL 1
23#endif
24
25module accel_oct_m
27 use cuda_oct_m
28 use debug_oct_m
29 use global_oct_m
30 use iso_c_binding, only: c_null_ptr, c_size_t
31 use, intrinsic :: iso_fortran_env
32 use loct_oct_m
33 use math_oct_m
35 use mpi_oct_m
37 use types_oct_m
38 use parser_oct_m
42 use string_oct_m
43
44 implicit none
45
46 private
47
48 public :: &
53 accel_t, &
56 accel_init, &
57 accel_end, &
90
91 integer, public, parameter :: &
92 ACCEL_MEM_READ_ONLY = 0, &
95
97 ! Components are public by default
98#if defined(HAVE_CUDA)
99 type(c_ptr) :: cuda_context
100#else
101 integer :: dummy
102#endif
103 end type accel_context_t
104
105 type accel_device_t
106 ! Components are public by default
107#if defined(HAVE_CUDA)
108 type(c_ptr) :: cuda_device
109#else
110 integer :: dummy
111#endif
112 end type accel_device_t
113
114 type accel_t
115 ! Components are public by default
116 type(accel_context_t) :: context
117 type(accel_device_t) :: device
118 type(c_ptr) :: cublas_handle
119 type(c_ptr) :: cuda_stream
120 type(c_ptr) :: module_map
121 integer :: max_block_size
122 integer(int64) :: shared_memory_size
123 integer(int64) :: global_memory_size
124 logical :: enabled
125 logical :: allow_CPU_only
126 logical :: cuda_mpi
127 integer :: warp_size
128 integer(int64) :: initialize_buffers
129 character(len=32) :: debug_flag
130 integer(int64) :: max_block_dim(3)
131 integer(int64) :: max_grid_dim(3)
132 end type accel_t
133
134 type accel_mem_t
135 ! Components are public by default
136 type(c_ptr) :: mem
137 integer(c_size_t) :: size = 0
138 type(type_t) :: type
139 integer :: flags = 0
140 logical :: allocated = .false.
141 end type accel_mem_t
142
143 type accel_kernel_t
144 ! Components are public by default
145#ifdef HAVE_CUDA
146 type(c_ptr) :: cuda_kernel
147 type(c_ptr) :: cuda_module
148 type(c_ptr) :: arguments
149#endif
150 logical :: initialized = .false.
151 type(accel_kernel_t), pointer :: next
152 integer :: arg_count
153 character(len=128) :: kernel_name
154 end type accel_kernel_t
155
156 type(accel_t), public :: accel
157
158 ! Global variables defined on device
159 type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
160 type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
161
162 ! the kernels
163 type(accel_kernel_t), public, target, save :: kernel_vpsi
164 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
165 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
166 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
167 type(accel_kernel_t), public, target, save :: kernel_daxpy
168 type(accel_kernel_t), public, target, save :: kernel_zaxpy
169 type(accel_kernel_t), public, target, save :: kernel_copy
170 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
171 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
172 type(accel_kernel_t), public, target, save :: dpack
173 type(accel_kernel_t), public, target, save :: zpack
174 type(accel_kernel_t), public, target, save :: dunpack
175 type(accel_kernel_t), public, target, save :: zunpack
176 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
177 type(accel_kernel_t), public, target, save :: kernel_density_real
178 type(accel_kernel_t), public, target, save :: kernel_density_complex
179 type(accel_kernel_t), public, target, save :: kernel_density_spinors
180 type(accel_kernel_t), public, target, save :: kernel_phase
181 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
182 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
183 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
184 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
185 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
186 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
187 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
188 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
189 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
190 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
191 type(accel_kernel_t), public, target, save :: dzmul
192 type(accel_kernel_t), public, target, save :: zzmul
193
194 interface accel_grid_size
196 end interface accel_grid_size
197
201
202 interface accel_padded_size
204 end interface accel_padded_size
206 interface accel_create_buffer
208 end interface accel_create_buffer
210 interface accel_kernel_run
212 end interface accel_kernel_run
241 end interface accel_write_buffer
242
243 interface accel_read_buffer
258 end interface accel_read_buffer
261 module procedure &
285 module procedure &
291
293 module procedure &
299
300 integer :: buffer_alloc_count
301 integer(int64) :: allocated_mem
302 type(accel_kernel_t), pointer :: head
303 type(alloc_cache_t) :: memcache
304
305contains
306
307 pure logical function accel_is_enabled() result(enabled)
308#ifdef HAVE_ACCEL
309 enabled = accel%enabled
310#else
311 enabled = .false.
312#endif
313 end function accel_is_enabled
314
315 ! ------------------------------------------
316
317 pure logical function accel_allow_cpu_only() result(allow)
318#ifdef HAVE_ACCEL
319 allow = accel%allow_CPU_only
320#else
321 allow = .true.
322#endif
323 end function accel_allow_cpu_only
324
325 ! ------------------------------------------
326
327 subroutine accel_init(base_grp, namespace)
328 type(mpi_grp_t), intent(inout) :: base_grp
329 type(namespace_t), intent(in) :: namespace
330
331 logical :: disable, default, run_benchmark
332 integer :: idevice
333#ifdef HAVE_CUDA
334 integer :: dim
335#ifdef HAVE_MPI
336 character(len=256) :: sys_name
337#endif
338#endif
339
340 push_sub(accel_init)
341
342 buffer_alloc_count = 0
343
344 !%Variable DisableAccel
345 !%Type logical
346 !%Default yes
347 !%Section Execution::Accel
348 !%Description
349 !% If Octopus was compiled with CUDA support, it will
350 !% try to initialize and use an accelerator device. By setting this
351 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
352 !%End
353 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
354#ifdef HAVE_ACCEL
355 default = .false.
356#else
357 default = .true.
358#endif
359 call parse_variable(namespace, 'DisableAccel', default, disable)
360 accel%enabled = .not. disable
361
362#ifndef HAVE_ACCEL
363 if (accel%enabled) then
364 message(1) = 'Octopus was compiled without Cuda support.'
365 call messages_fatal(1)
366 end if
367#endif
368
369 if (.not. accel_is_enabled()) then
370 pop_sub(accel_init)
371 return
372 end if
373
374 call messages_obsolete_variable(namespace, 'AccelPlatform')
375 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
376
377 !%Variable AccelDevice
378 !%Type integer
379 !%Default 0
380 !%Section Execution::Accel
381 !%Description
382 !% This variable selects the GPU that Octopus will use. You can specify a
383 !% numerical id to select a specific device.
384 !%
385 !% In case of MPI enabled runs devices are distributed in a round robin fashion,
386 !% starting at this value.
387 !%End
388 call parse_variable(namespace, 'AccelDevice', 0, idevice)
389
390 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
391
392 if (idevice < 0) then
393 call messages_write('Invalid AccelDevice')
394 call messages_fatal()
395 end if
397 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
399#ifdef HAVE_CUDA
400 if (idevice<0) idevice = 0
401 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
402 idevice, base_grp%rank)
403#ifdef HAVE_MPI
404 call loct_sysname(sys_name)
405 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
406 " on ", trim(sys_name)
407 call messages_info(1, all_nodes = .true.)
408#endif
409
410 call cublas_init(accel%cublas_handle, accel%cuda_stream)
411#endif
413
414 ! Get some device information that we will need later
415#ifdef HAVE_CUDA
416 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
417 call cuda_device_shared_memory(accel%device%cuda_device, accel%shared_memory_size)
418 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_block_size)
419 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
420 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
421 accel%max_block_dim(1) = int(dim, int64)
422 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
423 accel%max_block_dim(2) = int(dim, int64)
424 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
425 accel%max_block_dim(3) = int(dim, int64)
426 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
427 accel%max_grid_dim(1) = int(dim, int64)
428 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
429 accel%max_grid_dim(2) = int(dim, int64)
430 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
431 accel%max_grid_dim(3) = int(dim, int64)
432#endif
433
434 if (base_grp%is_root()) call device_info()
435
436 ! initialize the cache used to speed up allocations
437 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
438
439 ! now initialize the kernels
441
442#if defined(HAVE_HIP)
443 accel%debug_flag = "-g"
444#elif defined(HAVE_CUDA)
445 accel%debug_flag = "-lineinfo"
446#endif
447
448 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cu', "vpsi")
449 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cu', "vpsi_complex")
450 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cu', "vpsi_spinors")
451 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cu', "vpsi_spinors_complex")
452 call accel_kernel_start_call(kernel_daxpy, 'axpy.cu', "daxpy", flags = '-DRTYPE_DOUBLE')
453 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cu', "zaxpy", flags = '-DRTYPE_COMPLEX')
454 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cu', "dbatch_axpy_function", &
455 flags = ' -DRTYPE_DOUBLE')
456 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cu', "zbatch_axpy_function", &
457 flags = '-DRTYPE_COMPLEX')
458 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cu', "dbatch_ax_function_py", &
459 flags = '-DRTYPE_DOUBLE')
460 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cu', "zbatch_ax_function_py", &
461 flags = '-DRTYPE_COMPLEX')
462 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cu', "dbatch_mf_dotp")
463 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cu', "zbatch_mf_dotp")
464 call accel_kernel_start_call(dpack, 'pack.cu', "dpack")
465 call accel_kernel_start_call(zpack, 'pack.cu', "zpack")
466 call accel_kernel_start_call(dunpack, 'pack.cu', "dunpack")
467 call accel_kernel_start_call(zunpack, 'pack.cu', "zunpack")
468 call accel_kernel_start_call(kernel_copy, 'copy.cu', "copy")
469 call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cu', "copy_complex_to_real")
470 call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cu', "copy_real_to_complex")
471 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cu', "ghost_reorder")
472 call accel_kernel_start_call(kernel_density_real, 'density.cu', "density_real")
473 call accel_kernel_start_call(kernel_density_complex, 'density.cu', "density_complex")
474 call accel_kernel_start_call(kernel_density_spinors, 'density.cu', "density_spinors")
475 call accel_kernel_start_call(kernel_phase, 'phase.cu', "phase")
476 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cu', "ddot_matrix")
477 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cu', "zdot_matrix")
478 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cu', "zdot_matrix_spinors")
479
480
481 call accel_kernel_start_call(dzmul, 'mul.cu', "dzmul", flags = '-DRTYPE_DOUBLE')
482 call accel_kernel_start_call(zzmul, 'mul.cu', "zzmul", flags = '-DRTYPE_COMPLEX')
483
484 ! Define global buffers
485 if(.not. accel_buffer_is_allocated(zm_0_buffer)) then
486 call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
487 call accel_write_buffer(zm_0_buffer, m_z0)
488 end if
489 if(.not. accel_buffer_is_allocated(zm_1_buffer)) then
490 call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
491 call accel_write_buffer(zm_1_buffer, m_z1)
492 end if
493 if(.not. accel_buffer_is_allocated(dm_0_buffer)) then
494 call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
495 call accel_write_buffer(dm_0_buffer, m_zero)
496 end if
497 if(.not. accel_buffer_is_allocated(dm_1_buffer)) then
498 call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
499 call accel_write_buffer(dm_1_buffer, m_one)
500 end if
501
502
503 !%Variable AccelBenchmark
504 !%Type logical
505 !%Default no
506 !%Section Execution::Accel
507 !%Description
508 !% If this variable is set to yes, Octopus will run some
509 !% routines to benchmark the performance of the accelerator device.
510 !%End
511 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
512
513 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
514
515 if (run_benchmark) then
517 end if
518
519 !%Variable GPUAwareMPI
520 !%Type logical
521 !%Section Execution::Accel
522 !%Description
523 !% If Octopus was compiled with GPU support and MPI support and if the MPI
524 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
525 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
526 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
527 !% going through the host memory.
528 !% The default is false, except when the configure switch --enable-cudampi is set, in which
529 !% case this variable is set to true.
530 !%End
531#ifdef HAVE_CUDA_MPI
532 default = .true.
533#else
534 default = .false.
535#endif
536 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
537 if (accel%cuda_mpi) then
538#ifndef HAVE_CUDA_MPI
539 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
540 call messages_warning()
541#endif
542 call messages_write("Using GPU-aware MPI.")
543 call messages_info()
544 end if
545
546
547 !%Variable AllowCPUonly
548 !%Type logical
549 !%Section Execution::Accel
550 !%Description
551 !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
552 !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
553 !% code execution also in these cases.
554 !%End
555#if defined (HAVE_ACCEL)
556 default = .false.
557#else
558 default = .true.
559#endif
560 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
561
562
563 !%Variable InitializeGPUBuffers
564 !%Type integer
565 !%Default no
566 !%Section Execution::Accel
567 !%Description
568 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
569 !%Option no 0
570 !% Do not initialize GPU buffers.
571 !%Option yes 1
572 !% Initialize GPU buffers to zero.
573 !%Option nan 2
574 !% Initialize GPU buffers to nan.
575 !%End
576 call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
577 if (.not. varinfo_valid_option('InitializeGPUBuffers', accel%initialize_buffers)) then
578 call messages_input_error(namespace, 'InitializeGPUBuffers')
579 end if
580
581
582 call messages_print_with_emphasis(namespace=namespace)
583
584 pop_sub(accel_init)
585
586 contains
587
588 subroutine device_info()
589#ifdef HAVE_CUDA
590 integer :: version
591 character(kind=c_char) :: cval_str(257)
592#endif
593 integer :: major, minor
594 character(len=256) :: val_str
595
596 push_sub(accel_init.device_info)
597
598 call messages_new_line()
599 call messages_write('Selected device:')
600 call messages_new_line()
601
602#ifdef HAVE_CUDA
603#ifdef __HIP_PLATFORM_AMD__
604 call messages_write(' Framework : ROCm')
605#else
606 call messages_write(' Framework : CUDA')
607#endif
608#endif
609 call messages_info()
610
611#ifdef HAVE_CUDA
612 call messages_write(' Device type : GPU', new_line = .true.)
613#ifdef __HIP_PLATFORM_AMD__
614 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
615#else
616 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
617#endif
618#endif
619
620#ifdef HAVE_CUDA
621 cval_str = c_null_char
622 call cuda_device_name(accel%device%cuda_device, cval_str)
623 call string_c_to_f(cval_str, val_str)
624#endif
625 call messages_write(' Device name : '//trim(val_str))
626 call messages_new_line()
627
628#ifdef HAVE_CUDA
629 call cuda_device_capability(accel%device%cuda_device, major, minor)
630#endif
631 call messages_write(' Cuda capabilities :')
632 call messages_write(major, fmt = '(i2)')
633 call messages_write('.')
634 call messages_write(minor, fmt = '(i1)')
635 call messages_new_line()
636
637 ! VERSION
638#ifdef HAVE_CUDA
639 call cuda_driver_version(version)
640 call messages_write(' Driver version : ')
641 call messages_write(version)
642#endif
643 call messages_new_line()
644
645
646 call messages_write(' Device memory :')
647 call messages_write(accel%global_memory_size, units=unit_megabytes)
648 call messages_new_line()
649
650 call messages_write(' Shared memory :')
651 call messages_write(accel%shared_memory_size, units=unit_kilobytes)
652 call messages_new_line()
653
654 call messages_write(' Max. block size :')
655 call messages_write(accel%max_block_size)
656 call messages_new_line()
657
658 call messages_info()
659
660 pop_sub(accel_init.device_info)
661 end subroutine device_info
662
663 end subroutine accel_init
664
665 ! ------------------------------------------
666 subroutine accel_end(namespace)
667 type(namespace_t), intent(in) :: namespace
668
669 integer(int64) :: hits, misses
670 real(real64) :: volume_hits, volume_misses
671 logical :: found
672 type(accel_mem_t) :: tmp
673
674 push_sub(accel_end)
675
676 if (accel_is_enabled()) then
677
678 ! Release global buffers
679 call accel_free_buffer(zm_0_buffer)
680 call accel_free_buffer(zm_1_buffer)
681 call accel_free_buffer(dm_0_buffer)
682 call accel_free_buffer(dm_1_buffer)
683
684 do
685 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
686 if (.not. found) exit
687
688#ifdef HAVE_CUDA
689 call cuda_mem_free(tmp%mem)
690#endif
691 end do
692
693 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
694
695 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
696
697 call messages_new_line()
698 call messages_write(' Number of allocations =')
699 call messages_write(hits + misses, new_line = .true.)
700 call messages_write(' Volume of allocations =')
701 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
702 new_line = .true.)
703 call messages_write(' Hit ratio =')
704 if (hits + misses > 0) then
705 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
706 else
707 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
708 end if
709 call messages_write('%', new_line = .true.)
710 call messages_write(' Volume hit ratio =')
711 if (volume_hits + volume_misses > 0) then
712 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
713 else
714 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
715 end if
716 call messages_write('%')
717 call messages_new_line()
718 call messages_info()
719
720 call messages_print_with_emphasis(namespace=namespace)
721 end if
722
724
725 if (accel_is_enabled()) then
726#ifdef HAVE_CUDA
727 call cublas_end(accel%cublas_handle)
728 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
729 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
730 end if
731#endif
732
733 if (buffer_alloc_count /= 0) then
734 call messages_write('Accel:')
735 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
736 call messages_write(' in ')
737 call messages_write(buffer_alloc_count)
738 call messages_write(' buffers were not deallocated.')
739 call messages_fatal()
740 end if
741
742 end if
743
744 pop_sub(accel_end)
745 end subroutine accel_end
746
747 ! ------------------------------------------
748
750 subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
751 integer(int64), intent(in) :: n(:)
752 integer(int64), intent(in) :: blocksizes(:)
753 integer(int64), intent(out) :: gridsizes(:)
754
755 integer :: dim, i
756
757 dim = ubound(n, dim=1)
758 assert(dim == ubound(blocksizes, dim=1))
759 assert(dim == ubound(gridsizes, dim=1))
760
761 do i = 1, dim
762 gridsizes(i) = (n(i) + blocksizes(i) - 1_int64) / blocksizes(i)
763 gridsizes(i) = min(gridsizes(i), accel%max_grid_dim(i))
764 end do
765 end subroutine accel_grid_size_array_i8
766
767 ! ------------------------------------------
768
770 subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
771 integer, intent(in) :: n(:)
772 integer, intent(in) :: blocksizes(:)
773 integer, intent(out) :: gridsizes(:)
774
775 integer(int64) :: gridsizes64(size(gridsizes))
776
777 call accel_grid_size_array_i8(int(n(:), int64), int(blocksizes(:), int64), gridsizes64)
778
779 gridsizes = int(gridsizes64, int32)
780 end subroutine accel_grid_size_array_i4
781
782 ! ------------------------------------------
783
785 subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
786 integer(int64), intent(in) :: n
787 integer(int64), intent(in) :: blocksizes
788 integer(int64), intent(out) :: gridsizes
789
790 integer(int64) :: temp(1)
792 call accel_grid_size_array_i8( (/n/), (/blocksizes/), temp)
793
794 gridsizes = temp(1)
795 end subroutine accel_grid_size_i8
796
797 ! ------------------------------------------
798
800 subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
801 integer, intent(in) :: n
802 integer, intent(in) :: blocksizes
803 integer, intent(out) :: gridsizes
804
805 integer(int64) :: temp(1)
806
807 call accel_grid_size_array_i8(int((/n/), int64), int((/blocksizes/), int64), temp)
808
809 gridsizes = int(temp(1), int32)
810 end subroutine accel_grid_size_i4
812! ------------------------------------------
813
817 subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
818 integer(int64), intent(in) :: n
819 integer(int64), intent(in) :: pack_size
820 integer(int64), dimension(3), intent(out) :: gridsizes
821 integer(int64), dimension(3), intent(out) :: blocksizes
822 type(accel_kernel_t), optional, intent(inout) :: kernel
823
824 integer(int64) :: bsize, dim2, dim3
825 integer(int64), dimension(3) :: nn
827 if(present(kernel)) then
828 bsize = accel_kernel_block_size(kernel)/pack_size
829 else
830 bsize = accel_max_block_size()/pack_size
831 end if
832
833 dim3 = n/(accel_max_size_per_dim(2)*bsize) + 1
834 dim2 = min(accel_max_size_per_dim(2)*bsize, pad(n, bsize))
835
836 nn = (/pack_size, dim2, dim3/)
837 blocksizes = (/pack_size, bsize, 1_int64/)
838
839 call accel_grid_size(nn, blocksizes, gridsizes)
840 end subroutine accel_grid_size_extend_dim_i8
842 ! ------------------------------------------
843
847 subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
848 integer, intent(in) :: n
849 integer, intent(in) :: pack_size
850 integer, dimension(3), intent(out) :: gridsizes
851 integer, dimension(3), intent(out) :: blocksizes
852 type(accel_kernel_t), optional, intent(inout) :: kernel
853
854 integer(int64) :: gridsizes64(3), blocksizes64(3)
855
856 call accel_grid_size_extend_dim_i8(int(n, int64), int(pack_size, int64), &
857 gridsizes64, blocksizes64, kernel=kernel)
859 gridsizes = int(gridsizes64, int32)
860 blocksizes = int(blocksizes64, int32)
861 end subroutine accel_grid_size_extend_dim_i4
862
863 ! ------------------------------------------
864
865 integer(int64) function accel_padded_size_i8(nn) result(psize)
866 integer(int64), intent(in) :: nn
867
868 integer(int64) :: modnn, bsize
869
870 psize = nn
871
872 if (accel_is_enabled()) then
873
874 bsize = accel_max_block_size()
875
876 psize = nn
877 modnn = mod(nn, bsize)
878 if (modnn /= 0) psize = psize + bsize - modnn
879
880 end if
881
882 end function accel_padded_size_i8
883
884 ! ------------------------------------------
885
886 integer(int32) function accel_padded_size_i4(nn) result(psize)
887 integer(int32), intent(in) :: nn
889 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
890
891 end function accel_padded_size_i4
892
893 ! ------------------------------------------
894
895 subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
896 type(accel_mem_t), intent(inout) :: this
897 integer, intent(in) :: flags
898 type(type_t), intent(in) :: type
899 integer, intent(in) :: size
900 logical, optional, intent(in) :: set_zero
901 logical, optional, intent(in) :: async
902
903 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero, async)
904 end subroutine accel_create_buffer_4
905
906 ! ------------------------------------------
907
908 subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
909 type(accel_mem_t), intent(inout) :: this
910 integer, intent(in) :: flags
911 type(type_t), intent(in) :: type
912 integer(int64), intent(in) :: size
913 logical, optional, intent(in) :: set_zero
914 logical, optional, intent(in) :: async
915
916 integer(int64) :: fsize
917 logical :: found
918 integer(int64) :: initialize_buffers
919
920 push_sub(accel_create_buffer_8)
921
922 this%type = type
923 this%size = size
924 this%flags = flags
925 fsize = int(size, int64)*types_get_size(type)
926 this%allocated = .true.
928 if (fsize > 0) then
929
930 call alloc_cache_get(memcache, fsize, found, this%mem)
931
932 if (.not. found) then
933#ifdef HAVE_CUDA
934 if(optional_default(async, .false.)) then
935 call cuda_mem_alloc_async(this%mem, fsize)
936 else
937 call cuda_mem_alloc(this%mem, fsize)
938 end if
939#endif
940 end if
941
942 buffer_alloc_count = buffer_alloc_count + 1
943 allocated_mem = allocated_mem + fsize
944
945 end if
946
947 if (present(set_zero)) then
948 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
949 else
950 initialize_buffers = accel%initialize_buffers
951 end if
952 select case (initialize_buffers)
953 case (option__initializegpubuffers__yes)
954 call accel_set_buffer_to(this, type, int(z'00', int8), size)
955 case (option__initializegpubuffers__nan)
956 call accel_set_buffer_to(this, type, int(z'FF', int8), size)
957 end select
958
959 pop_sub(accel_create_buffer_8)
960 end subroutine accel_create_buffer_8
961
962 ! ------------------------------------------
963
964 subroutine accel_free_buffer(this, async)
965 type(accel_mem_t), intent(inout) :: this
966 logical, optional, intent(in) :: async
967
968 logical :: put
969 integer(int64) :: fsize
970
971 push_sub(accel_free_buffer)
972
973 if (this%size > 0) then
974
975 fsize = int(this%size, int64)*types_get_size(this%type)
976
977 call alloc_cache_put(memcache, fsize, this%mem, put)
978
979 if (.not. put) then
980#ifdef HAVE_CUDA
981 if (optional_default(async, .false.)) then
982 call cuda_mem_free_async(this%mem)
983 else
984 call cuda_mem_free(this%mem)
985 end if
986#endif
987 end if
988
989 buffer_alloc_count = buffer_alloc_count - 1
990 allocated_mem = allocated_mem + fsize
991
992 end if
993
994 this%size = 0
995 this%flags = 0
996
997 this%allocated = .false.
998
999 pop_sub(accel_free_buffer)
1000 end subroutine accel_free_buffer
1001
1002 ! ------------------------------------------
1003
1007 subroutine accel_move_buffer(buffer_from, buffer_to)
1008 type(accel_mem_t), intent(inout) :: buffer_from
1009 type(accel_mem_t), intent(inout) :: buffer_to
1010
1011 push_sub(accel_move_buffer)
1012
1013 ! Can not move an allocated buffer, the destination buffer should be empty
1014 assert(.not. accel_buffer_is_allocated(buffer_to))
1015
1016 buffer_to%mem = buffer_from%mem
1017 buffer_to%size = buffer_from%size
1018 buffer_to%type = buffer_from%type
1019 buffer_to%flags = buffer_from%flags
1020 buffer_to%allocated = buffer_from%allocated
1021
1022 call accel_detach_buffer(buffer_from)
1023
1024 pop_sub(accel_move_buffer)
1025 end subroutine accel_move_buffer
1026
1027 ! ------------------------------------------
1028
1033 subroutine accel_detach_buffer(this)
1034 type(accel_mem_t), intent(inout) :: this
1035
1036 push_sub(accel_detach_buffer)
1037
1038 this%mem = c_null_ptr
1039 this%size = 0
1040 this%type = type_none
1041 this%flags = 0
1042 this%allocated = .false.
1043
1044 pop_sub(accel_detach_buffer)
1045 end subroutine accel_detach_buffer
1046
1047 ! ------------------------------------------------------
1049 ! Check if the temporary buffers are the right size, if not reallocate them
1050 subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
1051 type(accel_mem_t), intent(inout) :: buffer
1052 integer, intent(in) :: flags
1053 type(type_t), intent(in) :: type
1054 integer, intent(in) :: required_size
1055 logical, intent(in) :: set_zero
1056 logical, optional, intent(in) :: async
1057
1058 push_sub(accel_ensure_buffer_size)
1059
1060
1061 if (accel_buffer_is_allocated(buffer) .and. buffer%size < required_size) then
1062 call accel_free_buffer(buffer, async=optional_default(async, .false.))
1063 end if
1064
1065 if (.not. accel_buffer_is_allocated(buffer)) then
1066 call accel_create_buffer(buffer, flags, type, required_size, set_zero=set_zero, async=optional_default(async, .false.))
1067 end if
1068
1070 end subroutine accel_ensure_buffer_size
1071
1072 ! ------------------------------------------
1073
1074 logical pure function accel_buffer_is_allocated(this) result(allocated)
1075 type(accel_mem_t), intent(in) :: this
1076
1077 allocated = this%allocated
1078 end function accel_buffer_is_allocated
1079
1080 ! -----------------------------------------
1081
1082 subroutine accel_finish()
1083 ! no push_sub, called too frequently
1084
1085 if (accel_is_enabled()) then
1086#ifdef HAVE_CUDA
1088#endif
1089 end if
1090 end subroutine accel_finish
1092 ! ------------------------------------------
1093
1094 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1095 type(accel_kernel_t), intent(inout) :: kernel
1096 integer, intent(in) :: narg
1097 type(accel_mem_t), intent(in) :: buffer
1098
1099 assert(accel_buffer_is_allocated(buffer))
1100
1101 ! no push_sub, called too frequently
1102#ifdef HAVE_CUDA
1103 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1104#endif
1105
1106 end subroutine accel_set_kernel_arg_buffer
1107
1108 ! ------------------------------------------
1109
1116 subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
1117 type(accel_kernel_t), intent(inout) :: kernel
1118 integer(int64), intent(in) :: gridsizes(:)
1119 integer(int64), intent(in) :: blocksizes(:)
1120 integer(int64), optional, intent(in) :: shared_memory_size
1121
1122 integer :: dim
1123 integer(int64) :: gsizes(1:3)
1124 integer(int64) :: bsizes(1:3)
1125
1126 ! no push_sub, called too frequently
1127
1128 ! CUDA needs all dimensions
1129 gsizes = 1
1130 bsizes = 1
1131
1132 dim = ubound(gridsizes, dim=1)
1133
1134 assert(dim == ubound(blocksizes, dim=1))
1136 ! if one size is zero, there is nothing to do
1137 if (any(gridsizes == 0)) return
1138
1139 assert(all(blocksizes > 0))
1140
1141 gsizes(1:dim) = gridsizes(1:dim)
1142 bsizes(1:dim) = blocksizes(1:dim)
1143
1144#ifdef HAVE_CUDA
1145 ! Maximum dimension of a block
1146 if (any(bsizes(1:3) > accel%max_block_dim(1:3))) then
1147 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1148 message(2) = "The following conditions should be fulfilled:"
1149 write(message(3), "(A, I8, A, I8)") "Dim 1: ", bsizes(1), " <= ", accel%max_block_dim(1)
1150 write(message(4), "(A, I8, A, I8)") "Dim 2: ", bsizes(2), " <= ", accel%max_block_dim(2)
1151 write(message(5), "(A, I8, A, I8)") "Dim 3: ", bsizes(3), " <= ", accel%max_block_dim(3)
1152 message(6) = "This is an internal error, please contact the developers."
1153 call messages_fatal(6)
1154 end if
1155
1156
1157 ! Maximum number of threads per block
1158 if (product(bsizes) > accel_max_block_size()) then
1159 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1160 message(2) = "The following condition should be fulfilled:"
1161 write(message(3), "(I8, A, I8)") product(bsizes), " <= ", accel_max_block_size()
1162 message(4) = "This is an internal error, please contact the developers."
1163 call messages_fatal(4)
1164 end if
1165
1166 ! Maximum dimensions of the grid of thread block
1167 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1168 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1169 message(2) = "The following conditions should be fulfilled:"
1170 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1171 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1172 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1173 message(6) = "This is an internal error, please contact the developers."
1174 call messages_fatal(6)
1175 end if
1176
1177 if(present(shared_memory_size)) then
1178
1179 if (shared_memory_size > accel%shared_memory_size) then
1180 message(1) = "Shared memory too large in kernel "//trim(kernel%kernel_name)
1181 message(2) = "The following condition should be fulfilled:"
1182 message(3) = "Requested shared memory <= Available shared memory"
1183 write(message(4), '(a,f12.6,a)') "Requested shared memory: ", real(shared_memory_size, real64) /1024.0, " Kb"
1184 write(message(5), '(a,f12.6,a)') "Available shared memory: ", real(accel%shared_memory_size, real64) /1024.0, " Kb"
1185 message(6) = "This is an internal error, please contact the developers."
1186 call messages_fatal(6)
1187 else if (shared_memory_size <= 0) then
1188 message(1) = "Invalid shared memory size in kernel "//trim(kernel%kernel_name)
1189 write(message(2), '(a,f12.6,a)') "Shared memory size requested: ", real(shared_memory_size, real64) /1024.0, " Kb"
1190 message(3) = "This is an internal error, please contact the developers."
1191 call messages_fatal(3)
1192 end if
1193 end if
1194
1195 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), bsizes(1), &
1196 optional_default(shared_memory_size, 0_int64), kernel%arguments)
1197#endif
1198
1199 end subroutine accel_kernel_run_8
1200
1201 ! -----------------------------------------------
1202
1208
1209 subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
1210 type(accel_kernel_t), intent(inout) :: kernel
1211 integer, intent(in) :: gridsizes(:)
1212 integer, intent(in) :: blocksizes(:)
1213 integer(int64), optional, intent(in) :: shared_memory_size
1214
1215 call accel_kernel_run_8(kernel, int(gridsizes, int64), int(blocksizes, int64), shared_memory_size)
1216
1217 end subroutine accel_kernel_run_4
1218
1219 ! -----------------------------------------------
1220
1221 integer pure function accel_max_block_size() result(max_block_size)
1222 max_block_size = accel%max_block_size
1223 end function accel_max_block_size
1224
1225 ! -----------------------------------------------
1227 integer function accel_kernel_block_size(kernel) result(block_size)
1228 type(accel_kernel_t), intent(inout) :: kernel
1229
1230#ifdef HAVE_CUDA
1231 integer :: max_block_size
1232#endif
1233
1234 block_size = 0
1235
1236#ifdef HAVE_CUDA
1237 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_block_size)
1238 if (debug%info .and. max_block_size /= accel%max_block_size) then
1239 write(message(1), "(A, I5, A)") "A kernel can use only less threads per block (", max_block_size, ")", &
1240 "than available on the device (", accel%max_block_size, ")"
1241 call messages_info(1)
1242 end if
1243
1244 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1245 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1246 block_size = 256
1247
1248 ! make sure we do not use more threads per block than available for this kernel
1249 block_size = min(block_size, max_block_size)
1250#endif
1251
1252 end function accel_kernel_block_size
1253
1254 ! ----------------------------------------------------
1255
1256 subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
1257 type(accel_mem_t), intent(inout) :: buffer
1258 type(type_t), intent(in) :: type
1259 integer(int8), intent(in) :: val
1260 integer(int64), intent(in) :: nval
1261 integer(int64), optional, intent(in) :: offset
1262 logical, optional, intent(in) :: async
1263
1264 integer(int64) :: nval_, offset_, type_size
1265
1266 push_sub(accel_set_buffer_to)
1268 if (nval == 0) then
1269 pop_sub(accel_set_buffer_to)
1270 return
1271 end if
1272 assert(nval > 0)
1273
1274 if (present(offset)) then
1275 assert(offset >= 0)
1276 if(offset > buffer%size) then
1277 pop_sub(accel_set_buffer_to)
1278 return
1279 end if
1280 end if
1281
1282 type_size = types_get_size(type)
1284 nval_ = nval*type_size
1285
1286 offset_ = 0_int64
1287 if (present(offset)) offset_ = offset*type_size
1288
1289 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1290 if(.not. optional_default(async, .false.)) call accel_finish()
1291
1292 pop_sub(accel_set_buffer_to)
1293 end subroutine accel_set_buffer_to
1294
1295 ! ----------------------------------------------------
1296
1297 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1298 type(accel_mem_t), intent(inout) :: buffer
1299 type(type_t), intent(in) :: type
1300 integer(int64), intent(in) :: nval
1301 integer(int64), optional, intent(in) :: offset
1302 logical, optional, intent(in) :: async
1305
1306 call accel_set_buffer_to(buffer, type, int(z'00', int8), nval, offset, async)
1307
1309 end subroutine accel_set_buffer_to_zero_i8
1310
1311 ! ----------------------------------------------------
1312
1313 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1314 type(accel_mem_t), intent(inout) :: buffer
1315 type(type_t), intent(in) :: type
1316 integer(int32), intent(in) :: nval
1317 integer(int32), optional, intent(in) :: offset
1318 logical, optional, intent(in) :: async
1319
1321
1322 if (present(offset)) then
1323 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1324 else
1325 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1326 end if
1327
1329 end subroutine accel_set_buffer_to_zero_i4
1330
1331 ! ----------------------------------------------------
1332
1333 subroutine accel_check_bandwidth()
1334 integer :: itime
1335 integer, parameter :: times = 10
1336 integer :: size
1337 real(real64) :: time, stime
1338 real(real64) :: read_bw, write_bw
1339 type(accel_mem_t) :: buff
1340 real(real64), allocatable :: data(:)
1341
1342 call messages_new_line()
1343 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1344 call messages_new_line()
1345 call messages_info()
1346
1347 call messages_write(' Buffer size Read bw Write bw')
1348 call messages_new_line()
1349 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1350 call messages_info()
1351
1352 size = 15000
1353 do
1354 safe_allocate(data(1:size))
1355 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1356
1357 stime = loct_clock()
1358 do itime = 1, times
1359 call accel_write_buffer(buff, size, data)
1360 call accel_finish()
1361 end do
1362 time = (loct_clock() - stime)/real(times, real64)
1363
1364 write_bw = real(size, real64) *8.0_real64/time
1365
1366 stime = loct_clock()
1367 do itime = 1, times
1368 call accel_read_buffer(buff, size, data)
1369 end do
1370 call accel_finish()
1371
1372 time = (loct_clock() - stime)/real(times, real64)
1373 read_bw = real(size, real64) *8.0_real64/time
1374
1375 call messages_write(size*8.0_real64/1024.0_real64**2)
1376 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1377 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1378 call messages_info()
1379
1380 call accel_free_buffer(buff)
1381
1382 safe_deallocate_a(data)
1383
1384 size = int(size*2.0)
1385
1386 if (size > 50000000) exit
1387 end do
1388 end subroutine accel_check_bandwidth
1389
1390 !------------------------------------------------------------
1391
1392 subroutine accel_kernel_global_init()
1393
1394 push_sub(accel_kernel_global_init)
1395
1396 nullify(head)
1397
1398 call cuda_module_map_init(accel%module_map)
1399
1401 end subroutine accel_kernel_global_init
1402
1403 !------------------------------------------------------------
1404
1405 subroutine accel_kernel_global_end()
1406 type(accel_kernel_t), pointer :: next_head
1407
1408 push_sub(accel_kernel_global_end)
1409
1410 do
1411 if (.not. associated(head)) exit
1412 next_head => head%next
1414 head => next_head
1415 end do
1416
1417 if (accel_is_enabled()) then
1418 call cuda_module_map_end(accel%module_map)
1419 end if
1422 end subroutine accel_kernel_global_end
1423
1424 !------------------------------------------------------------
1425
1426 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1427 type(accel_kernel_t), intent(inout) :: this
1428 character(len=*), intent(in) :: file_name
1429 character(len=*), intent(in) :: kernel_name
1430 character(len=*), optional, intent(in) :: flags
1431
1432#ifdef HAVE_CUDA
1433 character(len=1000) :: all_flags
1434#endif
1435
1436 push_sub(accel_kernel_build)
1437
1438 call profiling_in("ACCEL_COMPILE", exclude = .true.)
1439
1440#ifdef HAVE_CUDA
1441 all_flags = '-I'//trim(conf%share)//'/kernels/'//" "//trim(accel%debug_flag)
1442
1443 if (present(flags)) then
1444 all_flags = trim(all_flags)//' '//trim(flags)
1445 end if
1446
1447 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
1448 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1449
1450 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1451 call cuda_alloc_arg_array(this%arguments)
1452#endif
1453
1454 this%initialized = .true.
1455 this%kernel_name = trim(kernel_name)
1456
1457 call profiling_out("ACCEL_COMPILE")
1458
1459 pop_sub(accel_kernel_build)
1460 end subroutine accel_kernel_build
1461
1462 !------------------------------------------------------------
1463
1464 subroutine accel_kernel_end(this)
1465 type(accel_kernel_t), intent(inout) :: this
1466
1467 push_sub(accel_kernel_end)
1468
1469#ifdef HAVE_CUDA
1470 call cuda_free_arg_array(this%arguments)
1471 call cuda_release_kernel(this%cuda_kernel)
1472 ! modules are not released here, since they are not associated to a kernel
1473#endif
1474
1475 this%initialized = .false.
1476
1477 pop_sub(accel_kernel_end)
1478 end subroutine accel_kernel_end
1479
1480 !------------------------------------------------------------
1481
1482 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
1483 type(accel_kernel_t), target, intent(inout) :: this
1484 character(len=*), intent(in) :: file_name
1485 character(len=*), intent(in) :: kernel_name
1486 character(len=*), optional, intent(in) :: flags
1487
1488 push_sub(accel_kernel_start_call)
1489
1490 if (.not. this%initialized) then
1491 call accel_kernel_build(this, file_name, kernel_name, flags)
1492 this%next => head
1493 head => this
1494 end if
1495
1497 end subroutine accel_kernel_start_call
1498
1499 !--------------------------------------------------------------
1500
1501 integer(int64) pure function accel_global_memory_size() result(size)
1502
1503 size = accel%global_memory_size
1504
1505 end function accel_global_memory_size
1506
1507 !--------------------------------------------------------------
1508
1509 integer(int64) pure function accel_shared_memory_size() result(size)
1510
1511 size = accel%shared_memory_size
1512
1513 end function accel_shared_memory_size
1514 !--------------------------------------------------------------
1515
1516 integer pure function accel_max_size_per_dim(dim) result(size)
1517 integer, intent(in) :: dim
1518
1519 size = 0
1520#ifdef HAVE_CUDA
1521 size = 32768
1522 if (dim == 1) size = 2**30
1523#endif
1524 end function accel_max_size_per_dim
1525
1526 ! ------------------------------------------------------
1527
1528 subroutine accel_set_stream(stream_number)
1529 integer, intent(in) :: stream_number
1530
1531 push_sub(accel_set_stream)
1532
1533 if (accel_is_enabled()) then
1534#ifdef HAVE_CUDA
1535 call cuda_set_stream(accel%cuda_stream, stream_number)
1536 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
1537#endif
1538 end if
1539
1540 pop_sub(accel_set_stream)
1541 end subroutine accel_set_stream
1542
1543 ! ------------------------------------------------------
1545 subroutine accel_get_stream(stream_number)
1546 integer, intent(inout) :: stream_number
1547
1548 push_sub(accel_get_stream)
1549
1550 if (accel_is_enabled()) then
1551#ifdef HAVE_CUDA
1552 call cuda_get_stream(stream_number)
1553#endif
1554 end if
1555
1556 pop_sub(accel_get_stream)
1557 end subroutine accel_get_stream
1558
1559 ! ------------------------------------------------------
1560
1563
1564 if (accel_is_enabled()) then
1565#ifdef HAVE_CUDA
1566 call cuda_synchronize_all_streams()
1567#endif
1568 end if
1569
1571 end subroutine accel_synchronize_all_streams
1573 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1574 type(c_ptr), intent(in) :: buffer
1575 integer(int64), intent(in) :: offset
1576 type(c_ptr) :: buffer_offset
1577
1579#ifdef HAVE_CUDA
1580 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1581#else
1582 ! this is needed to make the compiler happy for non-GPU compilations
1583 buffer_offset = buffer
1584#endif
1587
1588 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1589 type(c_ptr), intent(in) :: buffer
1590 integer(int64), intent(in) :: offset
1591 type(c_ptr) :: buffer_offset
1592
1594#ifdef HAVE_CUDA
1595 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1596#else
1597 ! this is needed to make the compiler happy for non-GPU compilations
1598 buffer_offset = buffer
1599#endif
1602
1603 subroutine accel_clean_pointer(buffer)
1604 type(c_ptr), intent(in) :: buffer
1605
1606 push_sub(accel_clean_pointer)
1607#ifdef HAVE_CUDA
1608 call cuda_clean_pointer(buffer)
1609#endif
1610 pop_sub(accel_clean_pointer)
1611 end subroutine accel_clean_pointer
1612
1616 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
1617 integer(int64), intent(in) :: size
1618 integer(int64), intent(out) :: grid_size
1619 integer(int64), intent(out) :: thread_block_size
1620
1621 push_sub(accel_get_unfolded_size)
1622#ifdef __HIP_PLATFORM_AMD__
1623 ! not benefitial for AMD chips
1624 grid_size = 1_int64
1625 thread_block_size = size
1626#else
1627 grid_size = size
1628 thread_block_size = accel%warp_size
1629#endif
1631 end subroutine accel_get_unfolded_size
1632
1633#include "undef.F90"
1634#include "real.F90"
1635#include "accel_inc.F90"
1636
1637#include "undef.F90"
1638#include "complex.F90"
1639#include "accel_inc.F90"
1640
1641#include "undef.F90"
1642#include "integer.F90"
1643#include "accel_inc.F90"
1644
1645#include "undef.F90"
1646#include "integer8.F90"
1647#include "accel_inc.F90"
1648
1649end module accel_oct_m
1650
1651!! Local Variables:
1652!! mode: f90
1653!! coding: utf-8
1654!! End:
subroutine device_info()
Definition: accel.F90:651
subroutine accel_grid_size_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
Definition: accel.F90:842
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5919
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3692
subroutine, public accel_clean_pointer(buffer)
Definition: accel.F90:1560
subroutine accel_kernel_global_end()
Definition: accel.F90:1376
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2862
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
Definition: accel.F90:1573
subroutine laccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5715
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5470
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3932
pure logical function, public accel_allow_cpu_only()
Definition: accel.F90:413
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2498
subroutine zaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3512
subroutine daccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2216
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3599
subroutine laccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5494
type(accel_kernel_t), target, save, public dkernel_batch_dotp
Definition: accel.F90:284
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3741
subroutine daccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:1974
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5065
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1946
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5802
subroutine zaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3345
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3177
integer function, public accel_kernel_block_size(kernel)
Definition: accel.F90:1214
subroutine zaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2759
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2135
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4817
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3236
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5826
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4148
type(accel_kernel_t), target, save, public zkernel_batch_axpy
Definition: accel.F90:281
subroutine zaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3141
subroutine accel_grid_size_array_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
Definition: accel.F90:792
subroutine laccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5310
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3267
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3913
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3768
subroutine laccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5143
subroutine, public accel_free_buffer(this, async)
Definition: accel.F90:1006
subroutine daccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:1785
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4071
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1439
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3963
subroutine zaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2912
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4869
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4553
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3008
subroutine, public accel_get_stream(stream_number)
Definition: accel.F90:1502
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
Definition: accel.F90:937
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3394
subroutine iaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4041
integer(int64) pure function, public accel_global_memory_size()
Definition: accel.F90:1458
subroutine daccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2190
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:5015
type(accel_kernel_t), target, save, public zkernel_ax_function_py
Definition: accel.F90:283
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2116
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1730
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3583
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2615
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4368
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2591
integer(int64) function accel_padded_size_i8(nn)
Definition: accel.F90:907
subroutine accel_check_bandwidth()
Definition: accel.F90:1304
subroutine iaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4176
subroutine daccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2445
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:5380
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1671
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4842
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3375
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4768
subroutine, public accel_finish()
Definition: accel.F90:1124
subroutine laccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5278
subroutine accel_kernel_global_init()
Definition: accel.F90:1363
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3452
subroutine daccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2244
subroutine accel_kernel_run_4(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 4-byte integer sizes.
Definition: accel.F90:1196
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2812
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4700
subroutine laccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5749
subroutine accel_grid_size_extend_dim_i4(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for the kernels that relies on the batch size (pack_size) and the...
Definition: accel.F90:889
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5944
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
Definition: accel.F90:1092
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
Definition: accel.F90:1227
subroutine, public accel_move_buffer(buffer_from, buffer_to)
Move the buffer memory from the first buffer to the second.
Definition: accel.F90:1049
subroutine, public accel_detach_buffer(this)
Clear a buffer handle without freeing device memory.
Definition: accel.F90:1075
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:5786
subroutine iaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4013
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1711
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2274
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3217
subroutine iaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4208
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2166
subroutine laccel_write_buffer_single(this, data, async)
Definition: accel.F90:4962
subroutine laccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5548
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2522
subroutine accel_kernel_run_8(kernel, gridsizes, blocksizes, shared_memory_size)
Run a kernel with 8-byte integer sizes.
Definition: accel.F90:1158
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4514
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2970
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2989
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
Definition: accel.F90:1268
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3667
logical pure function, public accel_buffer_is_allocated(this)
Definition: accel.F90:1116
integer, parameter, public accel_mem_read_write
Definition: accel.F90:186
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2640
subroutine accel_kernel_end(this)
Definition: accel.F90:1421
type(accel_kernel_t), target, save, public dkernel_ax_function_py
Definition: accel.F90:282
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5250
subroutine zaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3107
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3623
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1761
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5616
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1530
subroutine iaccel_write_buffer_single(this, data, async)
Definition: accel.F90:3860
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4724
integer pure function, public accel_max_size_per_dim(dim)
Definition: accel.F90:1473
subroutine zaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3291
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:4278
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2076
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4495
subroutine accel_grid_size_array_i4(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (32-bit version).
Definition: accel.F90:812
subroutine iaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4613
subroutine laccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5089
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:5173
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2831
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:5578
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5895
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4746
subroutine zaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2886
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3047
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2482
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4318
subroutine accel_grid_size_extend_dim_i8(n, pack_size, gridsizes, blocksizes, kernel)
Helper function to compute the grid for the kernels that relies on the batch size (pack_size) and the...
Definition: accel.F90:859
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
Definition: accel.F90:1397
subroutine, public accel_init(base_grp, namespace)
Definition: accel.F90:423
subroutine, public accel_end(namespace)
Definition: accel.F90:708
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:4975
subroutine daccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:1839
subroutine zaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3317
subroutine, public accel_synchronize_all_streams()
Definition: accel.F90:1518
subroutine, public accel_set_stream(stream_number)
Definition: accel.F90:1485
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:5439
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2667
subroutine iaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4581
subroutine iaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4392
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5870
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4476
subroutine accel_grid_size_i8(n, blocksizes, gridsizes)
Computes the grid size for a given problem size and block size (64-bit version).
Definition: accel.F90:827
subroutine iaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4242
integer(int32) function accel_padded_size_i4(nn)
Definition: accel.F90:928
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
Definition: accel.F90:1284
subroutine daccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2040
type(accel_kernel_t), target, save, public zkernel_batch_dotp
Definition: accel.F90:285
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:5192
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4793
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4109
pure logical function, public accel_is_enabled()
Definition: accel.F90:403
subroutine zaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3546
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1869
subroutine iaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3987
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1907
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4090
integer, parameter, public accel_mem_write_only
Definition: accel.F90:186
subroutine zaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3075
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2351
subroutine laccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5344
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:5597
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2293
subroutine laccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5683
subroutine daccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:1811
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2566
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5971
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4337
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5115
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:5034
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2006
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5211
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3645
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1545
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4647
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5655
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:1658
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1888
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2772
type(accel_t), public accel
Definition: accel.F90:251
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5848
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4446
integer(int64) pure function, public accel_shared_memory_size()
Definition: accel.F90:1466
integer pure function, public accel_max_block_size()
Definition: accel.F90:1208
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4684
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2312
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4418
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3873
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3716
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
Definition: accel.F90:950
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:5420
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2411
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2379
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:1136
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5520
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3413
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2940
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2544
type(accel_kernel_t), pointer head
Definition: accel.F90:397
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3480
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
Definition: global.F90:200
complex(real64), parameter, public m_z0
Definition: global.F90:210
complex(real64), parameter, public m_z1
Definition: global.F90:211
real(real64), parameter, public m_one
Definition: global.F90:201
System information (time, memory, sysname)
Definition: loct.F90:117
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
Definition: loct.F90:258
subroutine, public loct_sysname(name)
Definition: loct.F90:332
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:898
character(len=512), private msg
Definition: messages.F90:167
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:525
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1000
subroutine, public messages_new_line()
Definition: messages.F90:1089
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410
subroutine, public messages_input_error(namespace, var, details, row, column)
Definition: messages.F90:691
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:594
type(type_t), parameter, public type_cmplx
Definition: types.F90:136
integer pure function, public types_get_size(this)
Definition: types.F90:154
type(type_t), parameter, public type_float
Definition: types.F90:135
type(type_t), parameter, public type_none
Definition: types.F90:134
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)