LCOV - code coverage report
Current view: top level - src/elpa2 - pack_unpack_gpu.F90 (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53

                 Hit   Total   Coverage
Lines:             0      37      0.0 %
Functions:         0      20      0.0 %

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : !      Informatik,
      10             : !    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : !      Schwerpunkt Wissenschaftliches Rechnen,
      12             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : !    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
      14             : !      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      15             : !      and
      16             : !    - IBM Deutschland GmbH
      17             : !
      18             : !
      19             : !    More information can be found here:
      20             : !    http://elpa.rzg.mpg.de/
      21             : !
      22             : !    ELPA is free software: you can redistribute it and/or modify
      23             : !    it under the terms of the version 3 of the license of the
      24             : !    GNU Lesser General Public License as published by the Free
      25             : !    Software Foundation.
      26             : !
      27             : !    ELPA is distributed in the hope that it will be useful,
      28             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      29             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      30             : !    GNU Lesser General Public License for more details.
      31             : !
      32             : !    You should have received a copy of the GNU Lesser General Public License
      33             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      34             : !
      35             : !    ELPA reflects a substantial effort on the part of the original
      36             : !    ELPA consortium, and we ask you to respect the spirit of the
      37             : !    license that we chose: i.e., please contribute any changes you
      38             : !    may have back to the original ELPA library distribution, and keep
      39             : !    any derivatives of ELPA under the same license that we chose for
      40             : !    the original distribution, the GNU Lesser General Public License.
      41             : !
      42             : ! This file was written by A. Marek, MPCDF
      43             : #endif
      44             : 
      45             :     ! Pack a filled row group (i.e. an array of consecutive rows)
      46             : 
      47             :     subroutine pack_row_group_&
      48             :     &MATH_DATATYPE&
      49             :     &_gpu_&
      50           0 :     &PRECISION &
      51             :     (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
      52           0 :                                        rows, n_offset, row_count)
      53             :       use cuda_c_kernel
      54             :       use cuda_functions
      55             :       use precision
      56             :       use iso_c_binding
      57             :       implicit none
      58             :       integer(kind=c_intptr_t)     :: row_group_dev, a_dev
      59             : 
      60             :       integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      61             :       integer(kind=ik), intent(in) :: n_offset, row_count
      62             : #if REALCASE == 1
      63             :       real(kind=C_DATATYPE_KIND)   :: rows(:,:)
      64             : #endif
      65             : #if COMPLEXCASE == 1
      66             :       complex(kind=C_DATATYPE_KIND):: rows(:,:)
      67             : #endif
      68             :       integer(kind=ik)             :: max_idx
      69             :       logical                      :: successCUDA
      70             : 
      71             :       ! Use many blocks for higher GPU occupancy
      72           0 :       max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
      73             : 
      74             :       ! Use one kernel call to pack the entire row group
      75             : 
      76             : !      call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)
      77             : 
      78             :       call launch_my_pack_gpu_kernel_&
      79             :       &MATH_DATATYPE&
      80             :       &_&
      81             :       &PRECISION &
      82           0 :       (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev)
      83             : 
      84             :       ! Issue a single transfer call for all rows (device to host)
      85             : !        rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)
      86             : 
      87             :       successCUDA = cuda_memcpy(loc(rows(:, 1:row_count)), row_group_dev, row_count * l_nev * size_of_&
      88             :       &PRECISION&
      89             :       &_&
      90             :       &MATH_DATATYPE&
      91           0 :       & , cudaMemcpyDeviceToHost)
      92           0 :       if (.not.(successCUDA)) then
      93             :         print *,"pack_row_group_&
      94             :         &MATH_DATATYPE&
      95             :         &_gpu_&
      96             :         &PRECISION&
      97           0 :         &: error in cudaMemcpy"
      98           0 :         stop 1
      99             :       endif
     100             : 
     101           0 :     end subroutine
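
    A hedged host-side sketch of the gather that the pack kernel above is assumed
    to perform: rows n_offset+1 .. n_offset+row_count of the striped matrix
    a(stripe_width, a_dim2, stripe_count) are collected into the contiguous buffer
    row_group(l_nev, row_count), which the single cudaMemcpy then moves to the host.
    Array shapes follow the arguments above; the exact stripe-to-column mapping is
    an assumption, not taken from the kernel source.

        ! Hypothetical CPU equivalent of launch_my_pack_gpu_kernel (sketch only)
        subroutine pack_row_group_host_sketch(a, row_group, stripe_width, &
                                              n_offset, row_count, max_idx)
          use iso_fortran_env, only: real64
          implicit none
          real(kind=real64), intent(in)    :: a(:,:,:)       ! (stripe_width, a_dim2, stripe_count)
          real(kind=real64), intent(inout) :: row_group(:,:) ! (l_nev, row_count)
          integer, intent(in) :: stripe_width, n_offset, row_count, max_idx
          integer :: i, idx, stripe, within

          do i = 1, row_count                        ! each consecutive row in the group
            do idx = 1, max_idx                      ! each logical column
              stripe = (idx - 1) / stripe_width + 1  ! stripe that holds this column
              within = mod(idx - 1, stripe_width) + 1
              row_group(idx, i) = a(within, n_offset + i, stripe)
            end do
          end do
        end subroutine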
     102             : 
     103             : 
     104             :     ! Unpack a filled row group (i.e. an array of consecutive rows)
     105             :     subroutine unpack_row_group_&
     106             :     &MATH_DATATYPE&
     107             :     &_gpu_&
     108           0 :     &PRECISION &
     109             :     (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
     110           0 :                                          a_dim2, l_nev, rows, n_offset, row_count)
     111             :       use cuda_c_kernel
     112             :       use precision
     113             :       use iso_c_binding
     114             :       use cuda_functions
     115             :       implicit none
     116             :       integer(kind=c_intptr_t)                     :: row_group_dev, a_dev
     117             :       integer(kind=ik), intent(in)                 :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
     118             :       integer(kind=ik), intent(in)                 :: n_offset, row_count
     119             : #if REALCASE == 1
     120             :       real(kind=C_DATATYPE_KIND), intent(in)       :: rows(:, :)
     121             : #endif
     122             : #if COMPLEXCASE == 1
     123             :       complex(kind=C_DATATYPE_KIND), intent(in)    :: rows(:, :)
     124             : #endif
     125             : 
     126             :       integer(kind=ik)                             :: max_idx
     127             :       logical                                      :: successCUDA
     128             : 
     129             :       ! Use many blocks for higher GPU occupancy
     130           0 :       max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
     131             : 
     132             :       ! Issue a single transfer call for all rows (host to device)
     133             : !      row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
     134             : 
     135             : 
     136             :       successCUDA = cuda_memcpy(row_group_dev, loc(rows(1, 1)), row_count * l_nev * &
     137             :                                  size_of_&
     138             :                                  &PRECISION&
     139             :                                  &_&
     140             :                                  &MATH_DATATYPE&
     141           0 :                                  &, cudaMemcpyHostToDevice)
     142           0 :       if (.not.(successCUDA)) then
     143             :         print *,"unpack_row_group_&
     144             :         &MATH_DATATYPE&
     145             :         &_gpu_&
     146             :         &PRECISION&
     147           0 :         &: error in cudaMemcpy"
     148           0 :         stop 1
     149             :       endif
     150             : 
     151             :       ! Use one kernel call to unpack the entire row group
     152             :       !        call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
     153             : 
     154             :       call launch_my_unpack_gpu_kernel_&
     155             :       &MATH_DATATYPE&
     156             :       &_&
     157             :       &PRECISION &
     158             :       (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, &
     159           0 :                                           row_group_dev, a_dev)
     160             : 
     161           0 :     end subroutine
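
    The unpacking step is the inverse scatter: after the host-to-device copy above,
    the kernel distributes each buffered row back into the stripes. A hedged sketch
    under the same assumptions as the packing sketch earlier:

        ! Hypothetical CPU equivalent of launch_my_unpack_gpu_kernel (sketch only)
        subroutine unpack_row_group_host_sketch(a, row_group, stripe_width, &
                                                n_offset, row_count, max_idx)
          use iso_fortran_env, only: real64
          implicit none
          real(kind=real64), intent(inout) :: a(:,:,:)       ! (stripe_width, a_dim2, stripe_count)
          real(kind=real64), intent(in)    :: row_group(:,:) ! (l_nev, row_count)
          integer, intent(in) :: stripe_width, n_offset, row_count, max_idx
          integer :: i, idx, stripe, within

          do i = 1, row_count
            do idx = 1, max_idx
              stripe = (idx - 1) / stripe_width + 1
              within = mod(idx - 1, stripe_width) + 1
              a(within, n_offset + i, stripe) = row_group(idx, i)  ! scatter back
            end do
          end do
        end subroutine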
     162             : 
     163             :     ! This subroutine must be called before queuing the next row for unpacking; it ensures that the current row group
     164             :     ! is unpacked when the queue is full or when the next row belongs to another group
     165             :     subroutine unpack_and_prepare_row_group_&
     166             :     &MATH_DATATYPE&
     167             :     &_gpu_&
     168           0 :     &PRECISION &
     169           0 :     (row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
     170             :                                                      last_stripe_width, a_dim2, l_nev, row_group_size, nblk,      &
     171             :                                                      unpack_idx, next_unpack_idx, force)
     172             : 
     173             :       use iso_c_binding
     174             :       use precision
     175             :       implicit none
     176             : #if REALCASE == 1
     177             :       real(kind=C_DATATYPE_KIND)      :: row_group(:,:)
     178             : #endif
     179             : #if COMPLEXCASE == 1
     180             :       complex(kind=C_DATATYPE_KIND)   :: row_group(:,:)
     181             : #endif
     182             :       integer(kind=c_intptr_t)        :: row_group_dev, a_dev
     183             :       integer(kind=ik), intent(in)    :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
     184             :       integer(kind=ik), intent(inout) :: row_group_size
     185             :       integer(kind=ik), intent(in)    :: nblk
     186             :       integer(kind=ik), intent(inout) :: unpack_idx
     187             :       integer(kind=ik), intent(in)    :: next_unpack_idx
     188             :       logical, intent(in)             :: force
     189             : 
     190           0 :       if (row_group_size == 0) then
     191             :         ! Nothing to flush, just prepare for the upcoming row
     192           0 :         row_group_size = 1
     193             :       else
     194           0 :         if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
     195             :           ! A flush and a reset must be performed
     196             :           call unpack_row_group_&
     197             :           &MATH_DATATYPE&
     198             :           &_gpu_&
     199             :           &PRECISION&
     200             :           (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
     201           0 :                                          a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size)
     202           0 :           row_group_size = 1
     203             :         else
     204             :           ! Just prepare for the upcoming row
     205           0 :           row_group_size = row_group_size + 1
     206             :         endif
     207             :       endif
     208             :       ! Always update the index for the upcoming row
     209           0 :       unpack_idx = next_unpack_idx
     210           0 :     end subroutine
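
    The control flow above reduces to a single predicate: flush when forced, when
    the group already holds a full block of nblk rows, or when the upcoming row is
    not the direct successor of the last queued one. A minimal distillation
    (hypothetical helper, not part of ELPA):

        pure logical function row_group_needs_flush(force, row_group_size, nblk, &
                                                    unpack_idx, next_unpack_idx)
          implicit none
          logical, intent(in) :: force
          integer, intent(in) :: row_group_size, nblk, unpack_idx, next_unpack_idx
          ! the same three-way test as in the subroutine above
          row_group_needs_flush = force .or. (row_group_size == nblk) &
                                  .or. (unpack_idx + 1 /= next_unpack_idx)
        end function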
     211             : 
     212             :     ! The host wrapper for computing the dot products between consecutive HH reflectors (see the kernel below)
     213             :     subroutine compute_hh_dot_products_&
     214             :     &MATH_DATATYPE&
     215             :     &_gpu_&
     216           0 :     &PRECISION&
     217             :     & (bcast_buffer_dev, hh_dot_dev, nbw, n)
     218             :       use cuda_c_kernel
     219             :       use precision
     220             :       use iso_c_binding
     221             :       implicit none
     222             :       integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_dot_dev
     223             :       integer(kind=ik), value  :: nbw, n
     224             : 
     225           0 :       if (n .le. 1) return
     226             :       call launch_compute_hh_dotp_gpu_kernel_&
     227             :       &MATH_DATATYPE&
     228             :       &_&
     229             :       &PRECISION&
     230           0 :       & ( bcast_buffer_dev, hh_dot_dev, nbw, n)
     231             : 
     232             : 
     233             :     end subroutine
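
    Per the kernel notes further below, the launched kernel computes one dot
    product per pair of consecutive Householder reflectors stored column-wise in
    bcast_buffer(1:nbw, 1:n). A hedged CPU sketch; the one-element shift models
    the overlap of consecutive reflectors and is an assumption, not taken from
    the kernel source:

        ! Hypothetical CPU equivalent of launch_compute_hh_dotp_gpu_kernel (sketch)
        subroutine compute_hh_dot_products_host_sketch(bcast_buffer, hh_dot, nbw, n)
          use iso_fortran_env, only: real64
          implicit none
          integer, intent(in) :: nbw, n
          real(kind=real64), intent(in)  :: bcast_buffer(nbw, n)
          real(kind=real64), intent(out) :: hh_dot(n - 1)
          integer :: i

          if (n <= 1) return               ! same early exit as the wrapper above
          do i = 1, n - 1
            ! assumed overlap: reflector i+1 is shifted by one row against reflector i
            hh_dot(i) = dot_product(bcast_buffer(2:nbw, i), bcast_buffer(1:nbw-1, i+1))
          end do
        end subroutine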
     234             : 
     235             :     ! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
     236             :     subroutine extract_hh_tau_&
     237             :     &MATH_DATATYPE&
     238             :     &_gpu_&
     239           0 :     &PRECISION&
     240             :     & (bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
     241             :       use cuda_c_kernel
     242             :       use precision
     243             :       use iso_c_binding
     244             :       implicit none
     245             :       integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
     246             :       integer(kind=ik), value  :: nbw, n
     247             :       logical, value           :: is_zero
     248             :       integer(kind=ik)         :: val_is_zero
     249           0 :       if (is_zero) then
     250           0 :         val_is_zero = 1
     251             :       else
     252           0 :         val_is_zero = 0
     253             :       endif
     254             : 
     255             :       call launch_extract_hh_tau_gpu_kernel_&
     256             :       &MATH_DATATYPE&
     257             :       &_&
     258             :       &PRECISION&
     259           0 :       & (bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
     260           0 :     end subroutine
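
    Per the notes below, "tau" is stored as the first element of each reflector;
    the kernel moves it out and overwrites that slot with 1.0, or with 0.0 when
    the reflector is to be nulled. A hedged CPU equivalent, real/double case for
    illustration (which constant goes with which flag value is an assumption):

        ! Hypothetical CPU equivalent of launch_extract_hh_tau_gpu_kernel (sketch)
        subroutine extract_hh_tau_host_sketch(bcast_buffer, hh_tau, nbw, n, is_zero)
          use iso_fortran_env, only: real64
          implicit none
          integer, intent(in) :: nbw, n
          real(kind=real64), intent(inout) :: bcast_buffer(nbw, n)
          real(kind=real64), intent(out)   :: hh_tau(n)
          logical, intent(in) :: is_zero
          integer :: i

          do i = 1, n
            hh_tau(i) = bcast_buffer(1, i)     ! tau sits in the first element
            if (is_zero) then
              bcast_buffer(1, i) = 0.0_real64  ! null the reflector
            else
              bcast_buffer(1, i) = 1.0_real64  ! canonical leading 1
            end if
          end do
        end subroutine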
     261             : 
     262             :     ! -------------------------------------------
     263             :     ! Fortran back-transformation support kernels
     264             :     ! -------------------------------------------
     265             : 
     266             :     ! Reset a reduction block
     267             :     ! Limitation: the thread-block size must be a divisor of the reduction block's size
     268             :     ! Reset 2 reduction blocks without an explicit synchronization at the end
     269             :     ! Limitation: the thread-block size must be a divisor of the reduction block's size
     270             :     ! Perform a reduction on an initialized, 128-element shared block
     271             :     ! Compute the dot-product between 2 consecutive HH vectors
     272             :     ! Limitation 1: the size of the thread block must be at most 128 and a power of 2
     273             :     ! Limitation 2: the size of the warp must be equal to 32
     274             :     !
     275             :     ! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on case)
     276             :     ! Having "tau" as the first element in an HH reflector reduces space requirements, but causes undesired branching in the kernels
     277             :     !
     278             :     ! -------------------------------------------
     279             :     ! Fortran back-transformation support kernels
     280             :     ! -------------------------------------------
     281             :     !
     282             :     ! This is the simplest and slowest available backtransformation kernel
     283             :     !
     284             :     ! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply
     285             :     ! 2 Householder reflectors per iteration
     286             :     !
     287             :     ! ---------------------------------
     288             :     ! Row packing and unpacking kernels
     289             :     ! ---------------------------------
     290             :     !
     291             :     ! The row group packing kernel
     292             : 
     293             :         ! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note:
     294             :         ! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits best performance)
     295             :         ! - "compute_hh_trafo_kernel" is the Fortran equivalent of the C kernel
     296             :         ! - "compute_hh_trafo_single_kernel" is the reference Fortran kernel
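
    All of the trafo kernels named above are variants of the same core update:
    applying a Householder reflector v with scalar tau to a vector x, namely
    x <- x - tau * v * (v^T x); the shared-memory reduction mentioned earlier
    computes the v^T x term. A minimal real/double sketch (illustrative, not a
    transcription of any of the kernels listed):

        ! Core Householder update behind the backtransformation kernels (sketch)
        subroutine apply_hh_reflector_sketch(x, v, tau, nbw)
          use iso_fortran_env, only: real64
          implicit none
          integer, intent(in) :: nbw
          real(kind=real64), intent(inout) :: x(nbw)  ! slice of the eigenvector matrix
          real(kind=real64), intent(in)    :: v(nbw)  ! one Householder reflector
          real(kind=real64), intent(in)    :: tau
          real(kind=real64) :: dotp

          dotp = dot_product(v, x)  ! the reduction done per 128-element shared block
          x = x - tau * dotp * v    ! rank-1 update
        end subroutine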
     297             : 

Generated by: LCOV version 1.12