LCOV - code coverage report
Current view: top level - src/elpa1 - elpa1_trans_ev_template.F90 (source / functions)
Test:  coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date:  2018-01-10 09:29:53

                 Hit    Total    Coverage
Lines:           117    167       70.1 %
Functions:         4      4      100.0 %

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
       12             : !      Schwerpunkt Wissenschaftliches Rechnen,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
       14             : !    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
       15             : !      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !    This particular source code file contains additions, changes and
      20             : !    enhancements authored by Intel Corporation which is not part of
      21             : !    the ELPA consortium.
      22             : !
      23             : !    More information can be found here:
      24             : !    http://elpa.mpcdf.mpg.de/
      25             : !
      26             : !    ELPA is free software: you can redistribute it and/or modify
      27             : !    it under the terms of the version 3 of the license of the
      28             : !    GNU Lesser General Public License as published by the Free
      29             : !    Software Foundation.
      30             : !
      31             : !    ELPA is distributed in the hope that it will be useful,
      32             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      33             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      34             : !    GNU Lesser General Public License for more details.
      35             : !
      36             : !    You should have received a copy of the GNU Lesser General Public License
      37             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      38             : !
      39             : !    ELPA reflects a substantial effort on the part of the original
      40             : !    ELPA consortium, and we ask you to respect the spirit of the
      41             : !    license that we chose: i.e., please contribute any changes you
      42             : !    may have back to the original ELPA library distribution, and keep
      43             : !    any derivatives of ELPA under the same license that we chose for
      44             : !    the original distribution, the GNU Lesser General Public License.
      45             : !
      46             : !
      47             : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
      48             : !
      49             : ! Copyright of the original code rests with the authors inside the ELPA
      50             : ! consortium. The copyright of any additional modifications shall rest
      51             : ! with their original authors, but shall adhere to the licensing terms
      52             : ! distributed along with the original code in the file "COPYING".
      53             : #endif
      54             : 
      55             : #include "../general/sanity.F90"
      56             : 
      57             : !> \brief Transforms the eigenvectors of a tridiagonal matrix back
      58             : !>                     to the eigenvectors of the original matrix
      59             : !>                     (like Scalapack Routine PDORMTR)
      60             : !>
      61             : !  Parameters
      62             : !
      63             : !> \param na          Order of matrix a_mat, number of rows of matrix q_mat
      64             : !>
      65             : !> \param nqc         Number of columns of matrix q_mat
      66             : !>
      67             : !> \param a_mat(lda,matrixCols)  Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
      68             : !>                           Distribution is like in Scalapack.
      69             : !>
      70             : !> \param lda         Leading dimension of a_mat
      71             : !>
      72             : !> \param tau(na)     Factors of the Householder vectors
      73             : !>
      74             : !> \param q_mat           On input: Eigenvectors of tridiagonal matrix
      75             : !>                    On output: Transformed eigenvectors
      76             : !>                    Distribution is like in Scalapack.
      77             : !>
      78             : !> \param ldq         Leading dimension of q_mat
      79             : !>
      80             : !> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
      81             : !>
      82             : !> \param matrixCols  local columns of matrix a_mat and q_mat
      83             : !>
      84             : !> \param mpi_comm_rows        MPI-Communicator for rows
      85             : !>
      86             : !> \param mpi_comm_cols        MPI-Communicator for columns
      87             : !>
      88             : !> \param useGPU      If true,  GPU version of the subroutine will be used
      89             : !>
      90             : 
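A minimal caller-side sketch of the interface documented above, assuming the template is instantiated for double-precision real data (generated name trans_ev_real_double); the variable names are placeholders, not taken from this file:

             ! Hypothetical usage sketch: a holds the Householder vectors produced by the
             ! tridiagonalization step, q holds the eigenvectors of the tridiagonal matrix
             ! and is overwritten in place with the eigenvectors of the original matrix.
             call trans_ev_real_double(elpa_obj, na, nqc, a, lda, tau, q, ldq, &
                                       nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, .false.)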
      91             :     subroutine trans_ev_&
      92             :     &MATH_DATATYPE&
      93             :     &_&
      94        8064 :     &PRECISION &
      95        8064 :     (obj, na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
      96             :       use cuda_functions
      97             :       use iso_c_binding
      98             :       use precision
      99             :       use elpa_abstract_impl
     100             :       implicit none
     101             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     102             :       integer(kind=ik), intent(in)                  :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
     103             : #if REALCASE == 1
     104             :       real(kind=REAL_DATATYPE), intent(in)          :: tau(na)
     105             : #endif
     106             : #if COMPLEXCASE == 1
     107             :       complex(kind=COMPLEX_DATATYPE), intent(in)    :: tau(na)
     108             : #endif
     109             : 
     110             : #if REALCASE == 1
     111             : #ifdef USE_ASSUMED_SIZE
     112             :       real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*), q_mat(ldq,*)
     113             : #else
     114             :       real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
     115             : #endif
     116             : #endif
     117             : #if COMPLEXCASE == 1
     118             : #ifdef USE_ASSUMED_SIZE
     119             :       complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
     120             : #else
     121             :       complex(kind=COMPLEX_DATATYPE), intent(inout) ::  a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
     122             : #endif
     123             : #endif
     124             :       logical, intent(in)                           :: useGPU
     125             : 
     126             :       integer(kind=ik)                              :: max_stored_rows
     127             : 
     128             : #if REALCASE == 1
     129             : #ifdef DOUBLE_PRECISION_REAL
     130             :       real(kind=rk8), parameter                     :: ZERO = 0.0_rk8, ONE = 1.0_rk8
     131             : #else
     132             :       real(kind=rk4), parameter                     :: ZERO = 0.0_rk4, ONE = 1.0_rk4
     133             : #endif
     134             : #endif
     135             : #if COMPLEXCASE == 1
     136             : #ifdef DOUBLE_PRECISION_COMPLEX
     137             :       complex(kind=ck8), parameter                  :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
     138             : #else
     139             :       complex(kind=ck4), parameter                  :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
     140             : #endif
     141             : #endif
     142             :       integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols, mpierr
     143             :       integer(kind=ik)                              :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
     144             :       integer(kind=ik)                              :: l_cols, l_rows, l_colh, nstor
     145             :       integer(kind=ik)                              :: istep, n, nc, ic, ics, ice, nb, cur_pcol
     146             :       integer(kind=ik)                              :: hvn_ubnd, hvm_ubnd
     147             : 
     148             : #if REALCASE == 1
     149        8064 :       real(kind=REAL_DATATYPE), allocatable         :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
     150        8064 :       real(kind=REAL_DATATYPE), allocatable         :: tmat(:,:), h1(:), h2(:), hvm1(:)
     151             : #endif
     152             : #if COMPLEXCASE == 1
     153        8064 :       complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
     154        8064 :       complex(kind=COMPLEX_DATATYPE), allocatable   :: tmat(:,:), h1(:), h2(:), hvm1(:)
     155             : #endif
     156             :       integer(kind=ik)                              :: istat
     157             :       character(200)                                :: errorMessage
     158             : 
     159             :       integer(kind=C_intptr_T)                      :: q_dev, tmp_dev, hvm_dev, tmat_dev
     160             :       logical                                       :: successCUDA
     161             :       integer(kind=c_intptr_t), parameter             :: size_of_datatype = size_of_&
     162             :                                                                           &PRECISION&
     163             :                                                                           &_&
     164             :                                                                           &MATH_DATATYPE
     165             : 
     166             :       call obj%timer%start("trans_ev_&
     167             :       &MATH_DATATYPE&
     168             :       &" // &
     169             :       &PRECISION_SUFFIX &
     170        8064 :       )
     171             : 
     172        8064 :       call obj%timer%start("mpi_communication")
     173        8064 :       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
     174        8064 :       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
     175        8064 :       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
     176        8064 :       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
     177        8064 :       call obj%timer%stop("mpi_communication")
     178             : 
     179        8064 :       totalblocks = (na-1)/nblk + 1
     180        8064 :       max_blocks_row = (totalblocks-1)/np_rows + 1
     181        8064 :       max_blocks_col = ((nqc-1)/nblk)/np_cols + 1  ! Columns of q_mat!
     182             : 
     183        8064 :       max_local_rows = max_blocks_row*nblk
     184        8064 :       max_local_cols = max_blocks_col*nblk
     185             : 
     186        8064 :       max_stored_rows = (63/nblk+1)*nblk
     187             : 
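With Fortran integer division, (63/nblk+1)*nblk is the smallest multiple of nblk greater than 63, i.e. at least 64 Householder vectors are accumulated before being applied as one blocked update. Illustrative values (not part of the source):

             ! nblk = 16  ->  (63/16 + 1)*16 = 4*16 = 64
             ! nblk = 32  ->  (63/32 + 1)*32 = 2*32 = 64
             ! nblk = 48  ->  (63/48 + 1)*48 = 2*48 = 96
             ! nblk = 64  ->  (63/64 + 1)*64 = 1*64 = 64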
     188        8064 :       allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
     189             :       call check_alloc("trans_ev_&
     190             :       &MATH_DATATYPE&
     191        8064 :       &", "tmat", istat, errorMessage)
     192             : 
     193        8064 :       allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
     194             :       call check_alloc("trans_ev_&
     195             :       &MATH_DATATYPE&
     196        8064 :       &", "h1", istat, errorMessage)
     197             : 
     198        8064 :       allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
     199             :       call check_alloc("trans_ev_&
     200             :       &MATH_DATATYPE&
     201        8064 :       &", "h2", istat, errorMessage)
     202             : 
     203        8064 :       allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
     204             :       call check_alloc("trans_ev_&
     205             :       &MATH_DATATYPE&
     206        8064 :       &", "tmp1", istat, errorMessage)
     207             : 
     208        8064 :       allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
     209             :       call check_alloc("trans_ev_&
     210             :       &MATH_DATATYPE&
     211        8064 :       &", "tmp2", istat, errorMessage)
     212             : 
     213        8064 :       allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
     214             :       call check_alloc("trans_ev_&
     215             :       &MATH_DATATYPE&
      216        8064 :       &", "hvb", istat, errorMessage)
     217             : 
     218        8064 :       allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
     219             :       call check_alloc("trans_ev_&
     220             :       &MATH_DATATYPE&
     221        8064 :       &", "hvm", istat, errorMessage)
     222             : 
     223        8064 :       hvm = 0   ! Must be set to 0 !!!
     224        8064 :       hvb = 0   ! Safety only
     225             : 
     226        8064 :       l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat
     227             : 
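local_index, prow and pcol (defined elsewhere in ELPA) map global matrix indices onto the 1D block-cyclic row and column distributions. A rough sketch of the arithmetic involved, assuming 1-based global indices, 0-based process coordinates, and that local_index(g, p, np, nblk, -1) returns the number of locally stored elements up to and including global index g (an illustration, not ELPA's actual implementation):

             ! block = (g-1)/nblk                ! 0-based block containing global index g
             ! owner = mod(block, np)            ! process owning that block (cf. prow/pcol)
             ! n_loc = (block/np)*nblk           ! elements from full block cycles before it
             ! if (mod(block, np) >  p) n_loc = n_loc + nblk
             ! if (mod(block, np) == p) n_loc = n_loc + mod(g-1, nblk) + 1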
     228        8064 :       nstor = 0
     229        8064 :       if (useGPU) then
     230           0 :         hvn_ubnd = 0
     231             :       endif
     232             : 
     233             : #if COMPLEXCASE == 1
     234             :       ! In the complex case tau(2) /= 0
     235        4032 :       if (my_prow == prow(1, nblk, np_rows)) then
     236        2688 :         q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2))
     237             :       endif
     238             : #endif
     239             : 
     240        8064 :       if (useGPU) then
      241             :         ! todo: this is used only for copying hvm to the device; it should be possible to go without it
     242           0 :         allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
     243             :         call check_alloc("trans_ev_&
     244             :         &MATH_DATATYPE&
     245           0 :         &", "hvm1", istat, errorMessage)
     246             : 
     247           0 :         successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
     248           0 :         check_alloc_cuda("trans_ev", successCUDA)
     249             : 
     250           0 :         successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype)
     251           0 :         check_alloc_cuda("trans_ev", successCUDA)
     252             : 
     253           0 :         successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype)
     254           0 :         check_alloc_cuda("trans_ev", successCUDA)
     255             : 
     256           0 :         successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_datatype)
     257           0 :         check_alloc_cuda("trans_ev", successCUDA)
     258             : 
     259             : !         q_dev = q_mat
     260           0 :         successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
     261           0 :         check_memcpy_cuda("trans_ev", successCUDA)
     262             :       endif  ! useGPU
     263             : 
     264      165024 :       do istep = 1, na, nblk
     265      156960 :         ics = MAX(istep,3)
     266      156960 :         ice = MIN(istep+nblk-1,na)
     267      156960 :         if (ice<ics) cycle
     268             : 
     269      156960 :         cur_pcol = pcol(istep, nblk, np_cols)
     270             : 
     271      156960 :         nb = 0
     272     2574432 :         do ic = ics, ice
     273             : 
     274     2417472 :           l_colh = local_index(ic  , my_pcol, np_cols, nblk, -1) ! Column of Householder Vector
     275     2417472 :           l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector
     276             : 
     277             : 
     278     2417472 :           if (my_pcol == cur_pcol) then
     279     2417472 :             hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
     280     2417472 :             if (my_prow == prow(ic-1, nblk, np_rows)) then
     281     1611648 :               hvb(nb+l_rows) = 1.
     282             :             endif
     283             :           endif
     284             : 
     285     2417472 :           nb = nb+l_rows
     286             :         enddo
     287             : 
     288             : #ifdef WITH_MPI
     289      104640 :         call obj%timer%start("mpi_communication")
     290      104640 :         if (nb>0) &
     291             :           call MPI_Bcast(hvb, nb, &
     292             : #if REALCASE == 1
     293             :           &MPI_REAL_PRECISION&
     294             : #endif
     295             : 
     296             : #if COMPLEXCASE == 1
     297             :           &MPI_COMPLEX_PRECISION&
     298             : #endif
     299      101952 :           , cur_pcol, mpi_comm_cols, mpierr)
     300      104640 :         call obj%timer%stop("mpi_communication")
     301             : #endif /* WITH_MPI */
     302             : 
     303      156960 :         nb = 0
     304     2574432 :         do ic = ics, ice
     305     2417472 :           l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector
     306     2417472 :           hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
     307     2417472 :           if (useGPU) then
     308           0 :             hvm_ubnd = l_rows
     309             :           endif
     310     2417472 :           nstor = nstor+1
     311     2417472 :           nb = nb+l_rows
     312             :         enddo
     313             : 
      314             :         ! Please note: for smaller matrix sizes (na/np_rows<=256), a value of 32 for nstor is enough!
     315      156960 :         if (nstor+nblk > max_stored_rows .or. istep+nblk > na .or. (na/np_rows <= 256 .and. nstor >= 32)) then
     316             : 
     317             :           ! Calculate scalar products of stored vectors.
     318             :           ! This can be done in different ways, we use dsyrk or zherk
     319             : 
     320       56160 :           tmat = 0
     321       56160 :           call obj%timer%start("blas")
     322       56160 :           if (l_rows>0) &
     323             : #if REALCASE == 1
     324             :             call PRECISION_SYRK('U', 'T',   &
     325             : #endif
     326             : #if COMPLEXCASE == 1
     327             :             call PRECISION_HERK('U', 'C',   &
     328             : #endif
     329       56160 :                                  nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows)
     330       56160 :           call obj%timer%stop("blas")
     331       56160 :           nc = 0
     332     2417472 :           do n = 1, nstor-1
     333     2361312 :             h1(nc+1:nc+n) = tmat(1:n,n+1)
     334     2361312 :             nc = nc+n
     335             :           enddo
     336             : #ifdef WITH_MPI
     337       37440 :           call obj%timer%start("mpi_communication")
     338       37440 :           if (nc>0) call mpi_allreduce( h1, h2, nc, &
     339             : #if REALCASE == 1
     340             :           &MPI_REAL_PRECISION&
     341             : #endif
     342             : #if COMPLEXCASE == 1
     343             :           &MPI_COMPLEX_PRECISION&
     344             : #endif
     345       37440 :           &, MPI_SUM, mpi_comm_rows, mpierr)
     346       37440 :           call obj%timer%stop("mpi_communication")
     347             : #else /* WITH_MPI */
     348             : 
     349       18720 :           if (nc > 0) h2 = h1
     350             : 
     351             : #endif /* WITH_MPI */
     352             :           ! Calculate triangular matrix T
     353             : 
     354       56160 :           nc = 0
     355       56160 :           tmat(1,1) = tau(ice-nstor+1)
     356     2417472 :           do n = 1, nstor-1
     357     2361312 :             call obj%timer%start("blas")
     358             : #if REALCASE == 1
     359             :             call PRECISION_TRMV('L', 'T', 'N',    &
     360             : #endif
     361             : #if COMPLEXCASE == 1
     362             :             call PRECISION_TRMV('L', 'C', 'N',    &
     363             : #endif
     364     2361312 :                                 n, tmat, max_stored_rows, h2(nc+1), 1)
     365     2361312 :             call obj%timer%stop("blas")
     366             : 
     367             :             tmat(n+1,1:n) = &
     368             : #if REALCASE == 1
     369             :             -h2(nc+1:nc+n)  &
     370             : #endif
     371             : #if COMPLEXCASE == 1
     372             :             -conjg(h2(nc+1:nc+n)) &
     373             : #endif
     374     2361312 :             *tau(ice-nstor+n+1)
     375             : 
     376     2361312 :             tmat(n+1,n+1) = tau(ice-nstor+n+1)
     377     2361312 :             nc = nc+n
     378             :           enddo
     379             : 
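The loop above assembles the triangular factor T (tmat) of a compact-WY-style representation of the nstor stored Householder vectors V = hvm(:,1:nstor), so that applying all of them to q_mat reduces to the single blocked update carried out below:

             ! Q <- (I - V * T * V**T) * Q       (V**H instead of V**T in the complex case)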
     380       56160 :           if (useGPU) then
      381             :             ! todo: is this reshape really necessary?
     382           0 :             hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
     383             : 
     384             :             !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
     385             :             successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)),   &
     386           0 :                           hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
     387             : 
     388           0 :             check_memcpy_cuda("trans_ev", successCUDA)
     389             : 
     390             :             !tmat_dev = tmat
     391             :             successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)),   &
     392           0 :                           max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
     393           0 :             check_memcpy_cuda("trans_ev", successCUDA)
     394             :           endif
     395             : 
     396             :           ! Q = Q - V * T * V**T * Q
     397             : 
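A dense-algebra sketch of what the GEMM/TRMM/GEMM sequence below computes on the local data (array sections are for illustration; the real code works on the distributed blocks, sums the first product over mpi_comm_rows, and uses the conjugate transpose in the complex case):

             ! tmp = matmul(transpose(hvm(1:l_rows,1:nstor)), q_mat(1:l_rows,1:l_cols))   ! V**T * Q
             ! tmp = matmul(tmat(1:nstor,1:nstor), tmp)                                   ! T * (V**T * Q)
             ! q_mat(1:l_rows,1:l_cols) = q_mat(1:l_rows,1:l_cols) - matmul(hvm(1:l_rows,1:nstor), tmp)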
     398       56160 :           if (l_rows>0) then
     399       56160 :             if (useGPU) then
     400           0 :               call obj%timer%start("cublas")
     401             : #if REALCASE == 1
     402             :               call cublas_PRECISION_GEMM('T', 'N',     &
     403             : #endif
     404             : #if COMPLEXCASE == 1
     405             :               call cublas_PRECISION_GEMM('C', 'N',     &
     406             : #endif
     407             :                                          nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd,  &
     408           0 :                                          q_dev, ldq,  ZERO, tmp_dev, nstor)
     409           0 :               call obj%timer%stop("cublas")
     410             : 
     411             :             else ! useGPU
     412             : 
     413       56160 :               call obj%timer%start("blas")
     414             : #if REALCASE == 1
     415             :               call PRECISION_GEMM('T', 'N',           &
     416             : #endif
     417             : #if COMPLEXCASE == 1
     418             :               call PRECISION_GEMM('C', 'N',           &
     419             : #endif
     420             :                                   nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
     421       56160 :                                   q_mat, ldq, ZERO, tmp1, nstor)
     422       56160 :               call obj%timer%stop("blas")
     423             :             endif ! useGPU
     424             : 
     425             :           else !l_rows>0
     426             : 
     427           0 :             if (useGPU) then
     428           0 :               successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype)
     429           0 :               check_memcpy_cuda("trans_ev", successCUDA)
     430             :             else
     431           0 :               tmp1(1:l_cols*nstor) = 0
     432             :             endif
     433             :           endif  !l_rows>0
     434             : 
     435             : #ifdef WITH_MPI
      436             :           ! In the legacy GPU version, this allreduce was omitted. It probably has to be done for GPU + MPI as well.
      437             :           ! todo: does it need to be copied whole? Wouldn't a part be sufficient?
     438       37440 :           if (useGPU) then
     439             :             successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev,  &
     440           0 :                           max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
     441           0 :             check_memcpy_cuda("trans_ev", successCUDA)
     442             :           endif
     443       37440 :           call obj%timer%start("mpi_communication")
     444             :           call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
     445             : #if REALCASE == 1
     446             :           &MPI_REAL_PRECISION&
     447             : #endif
     448             : #if COMPLEXCASE == 1
     449             :           &MPI_COMPLEX_PRECISION&
     450             : #endif
     451       37440 :           &, MPI_SUM, mpi_comm_rows, mpierr)
     452       37440 :           call obj%timer%stop("mpi_communication")
     453             :           ! copy back tmp2 - after reduction...
     454       37440 :           if (useGPU) then
     455             :             successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)),  &
     456           0 :                           max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
     457           0 :             check_memcpy_cuda("trans_ev", successCUDA)
     458             :           endif ! useGPU
     459             : 
     460             : 
     461             : #else /* WITH_MPI */
     462             : !          tmp2 = tmp1
     463             : #endif /* WITH_MPI */
     464             : 
     465       56160 :           if (l_rows>0) then
     466       56160 :             if (useGPU) then
     467           0 :               call obj%timer%start("cublas")
     468             :               call cublas_PRECISION_TRMM('L', 'L', 'N', 'N',     &
     469             :                                          nstor, l_cols, ONE, tmat_dev, max_stored_rows,  &
     470           0 :                                          tmp_dev, nstor)
     471             : 
     472             :               call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor,  &
     473             :                                          -ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor,   &
     474           0 :                                          ONE, q_dev, ldq)
     475           0 :               call obj%timer%stop("cublas")
     476             :             else !useGPU
     477             : #ifdef WITH_MPI
     478             :               ! tmp2 = tmat * tmp2
     479       37440 :               call obj%timer%start("blas")
     480             :               call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols,   &
     481       37440 :                                   ONE, tmat, max_stored_rows, tmp2, nstor)
     482             :               !q_mat = q_mat - hvm*tmp2
     483             :               call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor,   &
     484       37440 :                                   -ONE, hvm, ubound(hvm,dim=1), tmp2, nstor, ONE, q_mat, ldq)
     485       37440 :               call obj%timer%stop("blas")
     486             : #else /* WITH_MPI */
     487       18720 :               call obj%timer%start("blas")
     488             : 
     489             :               call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols,   &
     490       18720 :                                   ONE, tmat, max_stored_rows, tmp1, nstor)
     491             :               call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor,   &
     492       18720 :                                   -ONE, hvm, ubound(hvm,dim=1), tmp1, nstor, ONE, q_mat, ldq)
     493       18720 :               call obj%timer%stop("blas")
     494             : #endif /* WITH_MPI */
     495             :             endif ! useGPU
     496             :           endif  ! l_rows>0
     497       56160 :           nstor = 0
     498             :         endif  ! (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32))
     499             : 
     500             :       enddo ! istep=1,na,nblk
     501             : 
     502        8064 :       deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
     503        8064 :       if (istat .ne. 0) then
     504             :         print *,"trans_ev_&
     505             :         &MATH_DATATYPE&
     506           0 :         &: error when deallocating hvm "//errorMessage
     507           0 :         stop 1
     508             :       endif
     509             : 
     510        8064 :       if (useGPU) then
     511             :         !q_mat = q_dev
     512           0 :         successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
     513           0 :         check_memcpy_cuda("trans_ev", successCUDA)
     514             : 
     515           0 :         deallocate(hvm1, stat=istat, errmsg=errorMessage)
     516           0 :         if (istat .ne. 0) then
     517             :           print *,"trans_ev_&
     518             :           &MATH_DATATYPE&
     519           0 :           &: error when deallocating hvm1 "//errorMessage
     520           0 :           stop 1
     521             :         endif
     522             : 
     523             :         !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
     524           0 :         successCUDA = cuda_free(q_dev)
     525           0 :         check_dealloc_cuda("trans_ev", successCUDA)
     526             : 
     527           0 :         successCUDA = cuda_free(tmp_dev)
     528           0 :         check_dealloc_cuda("trans_ev", successCUDA)
     529             : 
     530           0 :         successCUDA = cuda_free(hvm_dev)
     531           0 :         check_dealloc_cuda("trans_ev", successCUDA)
     532             : 
     533           0 :         successCUDA = cuda_free(tmat_dev)
     534           0 :         check_dealloc_cuda("trans_ev", successCUDA)
     535             : 
     536             :       endif
     537             : 
     538             :       call obj%timer%stop("trans_ev_&
     539             :       &MATH_DATATYPE&
     540             :       &" // &
     541             :       &PRECISION_SUFFIX&
     542        8064 :       )
     543             : 
     544             :     end subroutine trans_ev_&
     545             :     &MATH_DATATYPE&
     546             :     &_&
     547        8064 :     &PRECISION

Generated by: LCOV version 1.12