LCOV - code coverage report
Current view: top level - src/elpa1 - elpa1_tridiag_template.F90 (source / functions)
Test:         coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date:         2018-01-10 09:29:53

            Hit    Total    Coverage
Lines:      249    330      75.5 %
Functions:  10     10       100.0 %

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
       12             : !      Schwerpunkt Wissenschaftliches Rechnen,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
       14             : !    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
       15             : !      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !    This particular source code file contains additions, changes and
      20             : !    enhancements authored by Intel Corporation which is not part of
      21             : !    the ELPA consortium.
      22             : !
      23             : !    More information can be found here:
      24             : !    http://elpa.mpcdf.mpg.de/
      25             : !
      26             : !    ELPA is free software: you can redistribute it and/or modify
      27             : !    it under the terms of the version 3 of the license of the
      28             : !    GNU Lesser General Public License as published by the Free
      29             : !    Software Foundation.
      30             : !
      31             : !    ELPA is distributed in the hope that it will be useful,
      32             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      33             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      34             : !    GNU Lesser General Public License for more details.
      35             : !
      36             : !    You should have received a copy of the GNU Lesser General Public License
      37             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      38             : !
      39             : !    ELPA reflects a substantial effort on the part of the original
      40             : !    ELPA consortium, and we ask you to respect the spirit of the
      41             : !    license that we chose: i.e., please contribute any changes you
      42             : !    may have back to the original ELPA library distribution, and keep
      43             : !    any derivatives of ELPA under the same license that we chose for
      44             : !    the original distribution, the GNU Lesser General Public License.
      45             : !
      46             : !
      47             : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
      48             : !
      49             : ! Copyright of the original code rests with the authors inside the ELPA
      50             : ! consortium. The copyright of any additional modifications shall rest
      51             : ! with their original authors, but shall adhere to the licensing terms
      52             : ! distributed along with the original code in the file "COPYING".
      53             : #endif
      54             : 
      55             : #include "../general/sanity.F90"
      56             : 
      57             : #undef SAVE_MATR
      58             : #ifdef DOUBLE_PRECISION_REAL
      59             : #define SAVE_MATR(name, iteration) \
      60             : call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration)
      61             : #else
      62             : #define SAVE_MATR(name, iteration)
      63             : #endif
      64             : 
       65             : !> \brief Reduces a distributed symmetric matrix to tridiagonal form (like the ScaLAPACK routine PDSYTRD)
      66             : !>
      67             : !  Parameters
      68             : !
      69             : !> \param obj              object of elpa_type
      70             : !> \param na          Order of matrix
      71             : !>
      72             : !> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
       73             : !>              Distribution is like in ScaLAPACK.
       74             : !>              As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half).
      75             : !>              a(:,:) is overwritten on exit with the Householder vectors
      76             : !>
      77             : !> \param lda         Leading dimension of a
      78             : !>
      79             : !> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
      80             : !>
      81             : !> \param matrixCols  local columns of matrix
      82             : !>
      83             : !> \param mpi_comm_rows        MPI-Communicator for rows
      84             : !> \param mpi_comm_cols        MPI-Communicator for columns
      85             : !>
      86             : !> \param d_vec(na)       Diagonal elements (returned), identical on all processors
      87             : !>
      88             : !> \param e_vec(na)       Off-Diagonal elements (returned), identical on all processors
      89             : !>
      90             : !> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
      91             : !>
       92             : !> \param useGPU      If true, the GPU version of the subroutine will be used
       93             : !> \param wantDebug   If true, more debug information is gathered
      94             : !>
      95             :     subroutine tridiag_&
      96             :     &MATH_DATATYPE&
      97             :     &_&
      98        9216 :     &PRECISION &
      99        9216 :     (obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug)
     100             :       use cuda_functions
     101             :       use iso_c_binding
     102             :       use precision
     103             :       use elpa_abstract_impl
     104             :       use matrix_plot
     105             :       implicit none
     106             : #include "../general/precision_kinds.F90"
     107             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     108             :       integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
     109             :       logical, intent(in)                           :: useGPU, wantDebug
     110             : 
     111             : #if REALCASE == 1
     112             :       real(kind=REAL_DATATYPE), intent(out)         :: tau(na)
     113             : #endif
     114             : #if COMPLEXCASE == 1
     115             :       complex(kind=COMPLEX_DATATYPE), intent(out)   :: tau(na)
     116             : #endif
     117             : #if REALCASE == 1
     118             : #ifdef USE_ASSUMED_SIZE
     119             :       real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*)
     120             : #else
     121             :       real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols)
     122             : #endif
     123             : #endif
     124             : #if COMPLEXCASE == 1
     125             : #ifdef USE_ASSUMED_SIZE
     126             :       complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
     127             : #else
     128             :       complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
     129             : #endif
     130             : #endif
     131             :       real(kind=REAL_DATATYPE), intent(out)         :: d_vec(na), e_vec(na)
     132             : 
     133             :       integer(kind=ik), parameter                   :: max_stored_uv = 32
     134             :       logical,          parameter                   :: mat_vec_as_one_block = .true.
     135             : 
     136             :       ! id in processor row and column and total numbers of processor rows and columns
     137             :       integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols
     138             :       integer(kind=ik)                              :: mpierr
     139             :       integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
     140             :                                                        max_local_cols
      141             :       ! updated after each istep (in the main cycle) to contain the number of
     142             :       ! local columns and rows of the remaining part of the matrix
     143             :       !integer(kind=ik)                             :: l_cols, l_rows
     144             :       integer(kind=ik)                              :: l_cols, l_rows
     145             :       integer(kind=ik)                              :: n_stored_vecs
     146             : 
     147             :       integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
     148             :                                                        uv_stored_cols_dev
     149             :       logical                                       :: successCUDA
     150             : 
     151             :       integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
     152             :       integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
     153             :       integer(kind=c_intptr_t)                        :: a_offset
     154             : 
     155             : #ifdef WITH_OPENMP
     156             :       integer(kind=ik)                              :: my_thread, n_threads, max_threads, n_iter
     157             :       integer(kind=ik)                              :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
     158             : #endif
     159             : 
     160             :       real(kind=REAL_DATATYPE)                      :: vnorm2
     161             : #if REALCASE == 1
     162             :       real(kind=REAL_DATATYPE)                      :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
     163             : #endif
     164             : #if COMPLEXCASE == 1
     165             :       complex(kind=COMPLEX_DATATYPE)                :: vav, xc, aux(2*max_stored_uv),  aux1(2), aux2(2), aux3(1), vrl, xf
     166             : #endif
     167             : 
     168             : #if REALCASE == 1
     169        4752 :       real(kind=REAL_DATATYPE), allocatable         :: tmp(:), &
     170        4752 :                                                        v_row(:), &   ! used to store calculated Householder Vector
     171        4752 :                                                        v_col(:), &   ! the same Vector, but transposed - differently distributed among MPI tasks
     172        4752 :                                                        u_row(:), &
     173        4752 :                                                        u_col(:)
     174             : #endif
     175             : #if COMPLEXCASE == 1
     176       13392 :       complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
     177             : #endif
     178             :       ! the following two matrices store pairs of vectors v and u calculated in each step
      179             :       ! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
      180             :       ! u and v are each stored both in row form and in column form
     181             :       ! pattern: v1,u1,v2,u2,v3,u3,....
      182             :       ! todo: It is a little bit confusing, I think, that the _row variables actually store columns and vice versa
     183             : #if REALCASE == 1
     184        4752 :       real(kind=REAL_DATATYPE), allocatable         :: vu_stored_rows(:,:)
     185             :       ! pattern: u1,v1,u2,v2,u3,v3,....
     186        4752 :       real(kind=REAL_DATATYPE), allocatable         :: uv_stored_cols(:,:)
     187             : #endif
     188             : #if COMPLEXCASE == 1
     189        8928 :       complex(kind=COMPLEX_DATATYPE), allocatable   :: vu_stored_rows(:,:), uv_stored_cols(:,:)
     190             : #endif
     191             : 
     192             : #ifdef WITH_OPENMP
     193             : #if REALCASE == 1
     194        2376 :       real(kind=REAL_DATATYPE), allocatable         :: ur_p(:,:), uc_p(:,:)
     195             : #endif
     196             : #if COMPLEXCASE == 1
     197        2232 :       complex(kind=COMPLEX_DATATYPE), allocatable   :: ur_p(:,:), uc_p(:,:)
     198             : #endif
     199             : #endif
     200             : 
     201             : #if COMPLEXCASE == 1
     202        4464 :       real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
     203             : #endif
     204             :       integer(kind=ik)                              :: istat
     205             :       character(200)                                :: errorMessage
     206             :       integer(kind=c_intptr_t), parameter             :: size_of_datatype = size_of_&
     207             :                                                                           &PRECISION&
     208             :                                                                           &_&
     209             :                                                                           &MATH_DATATYPE
     210             :       call obj%timer%start("tridiag_&
     211             :       &MATH_DATATYPE&
     212             :       &" // &
     213             :       PRECISION_SUFFIX &
     214        9216 :       )
     215             : 
     216             : 
     217        9216 :       if (wantDebug) call obj%timer%start("mpi_communication")
     218        9216 :       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
     219        9216 :       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
     220        9216 :       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
     221        9216 :       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
     222        9216 :       if (wantDebug) call obj%timer%stop("mpi_communication")
     223             : 
     224             :       ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
      225             :       ! a tile seems to be a square submatrix, consisting of several blocks
      226             :       ! it is the smallest possible square submatrix for which the blocks distributed among
     227             :       ! processors are "aligned" in both rows and columns
     228             :       !  -----------------
     229             :       ! | 1 4 | 1 4 | 1 4 | ...
     230             :       ! | 2 5 | 2 5 | 2 5 | ...
     231             :       ! | 3 6 | 3 6 | 3 6 | ...
     232             :       !  ----------------- ...
     233             :       ! | 1 4 | 1 4 | 1 4 | ...
     234             :       ! | 2 5 | 2 5 | 2 5 | ...
     235             :       ! | 3 6 | 3 6 | 3 6 | ...
     236             :       !  ----------------- .
     237             :       !   : :   : :   : :    .
     238             :       !   : :   : :   : :      .
     239             :       !
      240             :       ! this is a tile, where each number represents a block assigned to the processor with the shown number
      241             :       ! the size of each such block is nblk
      242             :       ! the image is for a situation with 6 processors: 3 processor rows and 2 processor columns
     243             :       ! tile_size is thus nblk * 6
     244             :       !
     245        9216 :       tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
     246        9216 :       tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
     247             : 
     248        9216 :       l_rows_per_tile = tile_size/np_rows ! local rows of a tile
     249        9216 :       l_cols_per_tile = tile_size/np_cols ! local cols of a tile
     250             : 
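
As a hedged illustration of the tile-size arithmetic above, the following standalone sketch (not ELPA code; the 3 x 2 process grid and nblk = 64 are assumed example values matching the picture in the comment, and lcm is a local stand-in for ELPA's least_common_multiple) reproduces the two tile_size statements and the per-tile local sizes:

    program tile_size_sketch
      implicit none
      integer :: nblk, np_rows, np_cols, tile_size
      nblk = 64; np_rows = 3; np_cols = 2
      ! minimum global tile size, as in the first statement above
      tile_size = nblk * lcm(np_rows, np_cols)                               ! 64 * 6 = 384
      ! round up so every process owns at least 128 rows/cols of each tile
      tile_size = ((128*max(np_rows, np_cols) - 1)/tile_size + 1)*tile_size  ! stays 384 here
      print *, tile_size, tile_size/np_rows, tile_size/np_cols               ! 384, 128, 192
    contains
      integer function lcm(a, b)
        integer, intent(in) :: a, b
        integer :: x, y, t
        x = a; y = b
        do while (y /= 0)            ! Euclidean algorithm for the gcd
          t = mod(x, y); x = y; y = t
        end do
        lcm = a / x * b
      end function lcm
    end program tile_size_sketch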
     251        9216 :       totalblocks = (na-1)/nblk + 1
     252        9216 :       max_loc_block_rows = (totalblocks-1)/np_rows + 1
     253        9216 :       max_loc_block_cols = (totalblocks-1)/np_cols + 1
     254             : 
      255             :       ! the locally owned submatrix has size at most max_local_rows x max_local_cols on each processor
     256        9216 :       max_local_rows = max_loc_block_rows*nblk
     257        9216 :       max_local_cols = max_loc_block_cols*nblk
     258             : 
      259             :       ! allocate memory for vectors
      260             :       ! todo: It is a little bit confusing, I think, that the _row variables actually store columns and vice versa
     261             :       ! todo: if something has length max_local_rows, it is actually a column, no?
     262             :       ! todo: probably one should read it as v_row = Vector v distributed among rows
     263             :       !
     264        9216 :       allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
     265             :       call check_alloc("tridiag_&
     266        9216 :       &MATH_DATATYPE ", "tmp", istat, errorMessage)
     267             : 
      268             :       ! allocate v_row 1 element longer so that tau can be stored and broadcast together with it
     269        9216 :       allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
     270             :       call check_alloc("tridiag_&
     271        9216 :       &MATH_DATATYPE ", "v_row", istat, errorMessage)
     272             : 
     273        9216 :       allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
     274             :       call check_alloc("tridiag_&
     275        9216 :       &MATH_DATATYPE ", "u_row", istat, errorMessage)
     276             : 
     277        9216 :       allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
     278             :       call check_alloc("tridiag_&
     279        9216 :       &MATH_DATATYPE ", "v_col", istat, errorMessage)
     280             : 
     281        9216 :       allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
     282             :       call check_alloc("tridiag_&
     283        9216 :       &MATH_DATATYPE ", "u_col", istat, errorMessage)
     284             : 
     285             : #ifdef WITH_OPENMP
     286        4608 :       max_threads = omp_get_max_threads()
     287             : 
     288        4608 :       allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
     289             :       call check_alloc("tridiag_&
     290        4608 :       &MATH_DATATYPE ", "ur_p", istat, errorMessage)
     291             : 
     292        4608 :       allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
     293             :       call check_alloc("tridiag_&
     294        4608 :       &MATH_DATATYPE ", "uc_p", istat, errorMessage)
     295             : #endif /* WITH_OPENMP */
     296             : 
     297        9216 :       tmp = 0
     298        9216 :       v_row = 0
     299        9216 :       u_row = 0
     300        9216 :       v_col = 0
     301        9216 :       u_col = 0
     302             : 
     303        9216 :       allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
     304             :       call check_alloc("tridiag_&
     305        9216 :       &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
     306             : 
     307        9216 :       allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
     308             :       call check_alloc("tridiag_&
     309        9216 :       &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
     310             : 
     311        9216 :       if (useGPU) then
     312           0 :          successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
     313           0 :          check_alloc_cuda("tridiag: v_row_dev", successCUDA)
     314             : 
     315           0 :          successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
     316             : 
     317           0 :          check_alloc_cuda("tridiag: u_row_dev", successCUDA)
     318             : 
     319           0 :          successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
     320           0 :          check_alloc_cuda("tridiag: v_col_dev", successCUDA)
     321             : 
     322           0 :          successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
     323           0 :          check_alloc_cuda("tridiag: u_col_dev", successCUDA)
     324             : 
     325           0 :          successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
     326           0 :          check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
     327             : 
     328           0 :          successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
     329           0 :          check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
     330             :       endif !useGPU
     331             : 
     332             : 
     333        9216 :       d_vec(:) = 0
     334        9216 :       e_vec(:) = 0
     335        9216 :       tau(:) = 0
     336             : 
     337        9216 :       n_stored_vecs = 0
     338             : 
     339        9216 :       l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
     340        9216 :       l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
     341             : 
     342        9216 :       if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
     343             : #if COMPLEXCASE == 1
     344        2976 :         d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
     345             : #endif
     346             : #if REALCASE == 1
     347        3168 :         d_vec(na) = a_mat(l_rows,l_cols)
     348             : #endif
     349             : 
     350        9216 :       if (useGPU) then
      351             :         ! allocate memory for matrix A on the device and then copy the matrix to it
     352             : 
     353           0 :         successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
     354           0 :         check_alloc_cuda("tridiag: a_dev", successCUDA)
     355             : 
     356           0 :         successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
     357           0 :         check_memcpy_cuda("tridiag: a_dev", successCUDA)
     358             :       endif
     359             : 
     360             :       ! main cycle of tridiagonalization
     361             :       ! in each step, 1 Householder Vector is calculated
     362     2597184 :       do istep = na, 3 ,-1
     363             : 
     364             :         ! Calculate number of local rows and columns of the still remaining matrix
     365             :         ! on the local processor
     366     2587968 :         l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
     367     2587968 :         l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
     368             : 
     369             :         ! Calculate Vector for Householder transformation on all procs
     370             :         ! owning column istep
     371             : 
     372     2587968 :         if (my_pcol == pcol(istep, nblk, np_cols)) then
     373             : 
     374             :           ! Get Vector to be transformed; distribute last element and norm of
     375             :           ! remaining elements to all procs in current column
     376             : 
      377             :           ! copy column l_cols + 1 of A to v_row
     378     2587968 :           if (useGPU) then
     379           0 :             a_offset = l_cols * lda * size_of_datatype
     380             :             ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
     381             : 
     382           0 :             successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
     383           0 :             check_memcpy_cuda("tridiag a_dev 1", successCUDA)
     384             :           else
     385     2587968 :             v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
     386             :           endif
     387             : 
     388     2587968 :             if (n_stored_vecs > 0 .and. l_rows > 0) then
     389     2457408 :               if (wantDebug) call obj%timer%start("blas")
     390             : #if COMPLEXCASE == 1
     391     1208832 :               aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
     392             : #endif
     393             :               call PRECISION_GEMV('N',   &
     394             :                                   l_rows, 2*n_stored_vecs,                                  &
     395             :                                   ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),        &
     396             : #if REALCASE == 1
     397             :                                   uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
     398             : #endif
     399             : #if COMPLEXCASE == 1
     400             :                                    aux, 1,  &
     401             : 
     402             : #endif
     403     2457408 :                                   ONE, v_row, 1)
     404     2457408 :                if (wantDebug) call obj%timer%stop("blas")
     405             : 
     406             :             endif
     407             : 
     408     2587968 :             if(my_prow == prow(istep-1, nblk, np_rows)) then
     409     1725312 :                aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
     410     1725312 :                aux1(2) = v_row(l_rows)
     411             :             else
     412      862656 :                aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
     413      862656 :                aux1(2) = 0.
     414             :             endif
     415             : 
     416             : #ifdef WITH_MPI
     417     1725312 :             if (wantDebug) call obj%timer%start("mpi_communication")
     418             :             call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
     419     1725312 :                                MPI_SUM, mpi_comm_rows, mpierr)
     420     1725312 :             if (wantDebug) call obj%timer%stop("mpi_communication")
     421             : #else /* WITH_MPI */
     422      862656 :           aux2 = aux1
     423             : #endif /* WITH_MPI */
     424             : 
     425             : #if REALCASE == 1
     426     1315296 :           vnorm2 = aux2(1)
     427             : #endif
     428             : #if COMPLEXCASE == 1
     429     1272672 :           vnorm2 = real(aux2(1),kind=rk)
     430             : #endif
     431     2587968 :           vrl    = aux2(2)
     432             : 
     433             :           ! Householder transformation
     434             : #if REALCASE == 1
     435             :           call hh_transform_real_&
     436             : #endif
     437             : #if COMPLEXCASE == 1
     438             :           call hh_transform_complex_&
     439             : #endif
     440             :                &PRECISION &
     441     2587968 :                              (obj, vrl, vnorm2, xf, tau(istep), wantDebug)
     442             :           ! Scale v_row and store Householder Vector for back transformation
     443             : 
     444     2587968 :           v_row(1:l_rows) = v_row(1:l_rows) * xf
     445     2587968 :           if (my_prow == prow(istep-1, nblk, np_rows)) then
     446     1725312 :             v_row(l_rows) = 1.
     447             : 
      448             :             ! vrl is the newly computed off-diagonal element of the final tridiagonal matrix
     449             : #if REALCASE == 1
     450      876864 :             e_vec(istep-1) = vrl
     451             : #endif
     452             : #if COMPLEXCASE == 1
     453      848448 :             e_vec(istep-1) = real(vrl,kind=rk)
     454             : #endif
     455             :           endif
     456             : 
     457             :           ! store Householder Vector for back transformation
     458     2587968 :           a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
     459             : 
      460             :           ! add tau after the end of the actual v_row, to be broadcast with it
     461     2587968 :           v_row(l_rows+1) = tau(istep)
     462             :          endif !(my_pcol == pcol(istep, nblk, np_cols))
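
For orientation, here is a minimal standalone sketch of the standard LAPACK-style Householder generation that hh_transform_* is assumed to perform at this point (the routine itself is defined elsewhere in ELPA; the names alpha and beta, and the numeric values, are illustrative and not taken from the ELPA source). On return, vrl carries the new off-diagonal element, xf the factor that scales the pivot entry of v_row to 1, and tau the reflector factor:

    program hh_sketch
      implicit none
      double precision :: alpha, vnorm2, beta, tau, xf
      alpha  = 3.0d0       ! element that will carry the new off-diagonal (aux2(2) above)
      vnorm2 = 16.0d0      ! squared norm of the remaining elements (aux2(1) above)
      beta   = -sign(sqrt(alpha**2 + vnorm2), alpha)   ! new off-diagonal element, here -5
      tau    = (beta - alpha) / beta                   ! Householder factor, here 1.6
      xf     = 1.0d0 / (alpha - beta)                  ! scaling factor, here 0.125
      print *, beta, tau, xf
    end program hh_sketch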
     463             : 
     464             : !          SAVE_MATR("HH vec stored", na - istep + 1)
     465             : 
     466             : #ifdef WITH_MPI
     467     1725312 :          if (wantDebug) call obj%timer%start("mpi_communication")
     468             :          ! Broadcast the Householder Vector (and tau) along columns
     469             :          call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION,    &
     470     1725312 :                         pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
     471     1725312 :          if (wantDebug) call obj%timer%stop("mpi_communication")
     472             : #endif /* WITH_MPI */
     473             : 
      474             :         ! recover tau, which has been broadcast together with v_row
     475     2587968 :         tau(istep) =  v_row(l_rows+1)
     476             : 
     477             :         ! Transpose Householder Vector v_row -> v_col
     478             :         call elpa_transpose_vectors_&
     479             :              &MATH_DATATYPE&
     480             :              &_&
     481             :              &PRECISION &
     482             :                    (obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
     483     2587968 :                     1, istep-1, 1, nblk)
     484             : 
     485             :         ! Calculate u = (A + VU**T + UV**T)*v
     486             : 
     487             :         ! For cache efficiency, we use only the upper half of the matrix tiles for this,
     488             :         ! thus the result is partly in u_col(:) and partly in u_row(:)
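
The identity being exploited here is that the matrix is only implicitly updated: with V and U denoting the stored vector pairs, (A + VU**T + UV**T)*v = A*v + V*(U**T*v) + U*(V**T*v), so the product can be formed from the original A plus two skinny matrix-vector products (done further below with vu_stored_rows/uv_stored_cols). A minimal numeric sketch of that identity, with made-up sizes, random data and invented array names (not ELPA code):

    program deferred_update_sketch
      implicit none
      integer, parameter :: n = 4, k = 2
      double precision :: a(n,n), vmat(n,k), umat(n,k), x(n), lhs(n), rhs(n)
      call random_number(a);    call random_number(vmat)
      call random_number(umat); call random_number(x)
      ! explicit update, then multiply
      lhs = matmul(a + matmul(vmat, transpose(umat)) + matmul(umat, transpose(vmat)), x)
      ! deferred form: original matrix plus two skinny products
      rhs = matmul(a, x) + matmul(vmat, matmul(transpose(umat), x)) &
                         + matmul(umat, matmul(transpose(vmat), x))
      print *, maxval(abs(lhs - rhs))   ! ~1e-15: the two forms agree
    end program deferred_update_sketch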
     489             : 
     490     2587968 :         u_col(1:l_cols) = 0
     491     2587968 :         u_row(1:l_rows) = 0
     492     2587968 :         if (l_rows > 0 .and. l_cols> 0 ) then
     493     2541888 :           if(useGPU) then
     494           0 :             successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
     495           0 :             check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
     496             : 
     497           0 :             successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
     498           0 :             check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
     499             : 
     500           0 :             successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
     501             : 
     502           0 :             check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
     503             : 
     504           0 :             successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
     505           0 :             check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
      506             :           endif ! useGPU
     507             : 
     508             : #ifdef WITH_OPENMP
     509     1270944 :           call obj%timer%start("OpenMP parallel")
     510     2541888 : !$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
     511             : 
     512     1270944 :           my_thread = omp_get_thread_num()
     513     1270944 :           n_threads = omp_get_num_threads()
     514             : 
     515     1270944 :           n_iter = 0
     516             : 
     517             :           ! first calculate A*v part of (A + VU**T + UV**T)*v
     518   404147376 :           uc_p(1:l_cols,my_thread) = 0.
     519   269993472 :           ur_p(1:l_rows,my_thread) = 0.
     520             : #endif /* WITH_OPENMP */
     521     8177088 :           do i= 0, (istep-2)/tile_size
     522     5635200 :             l_col_beg = i*l_cols_per_tile+1
     523     7181856 :             l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
     524     5635200 :             if (l_col_end < l_col_beg) cycle
     525    18463392 :             do j = 0, i
     526    12828192 :               l_row_beg = j*l_rows_per_tile+1
     527    17948304 :               l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
     528    12828192 :               if (l_row_end < l_row_beg) cycle
     529             : #ifdef WITH_OPENMP
     530     6402576 :               if (mod(n_iter,n_threads) == my_thread) then
     531     6402576 :                 if (wantDebug) call obj%timer%start("blas")
     532             :                 call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
     533             :                                     l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
     534             :                                     ONE, a_mat(l_row_beg,l_col_beg), lda,         &
     535     6402576 :                                     v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)
     536             : 
     537     6402576 :                 if (i/=j) then
     538             :                   call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,          &
     539             :                                       ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1,  &
     540             : 
     541     3596496 :                                       ONE, ur_p(l_row_beg,my_thread), 1)
     542             :                 endif
     543     6402576 :                 if (wantDebug) call obj%timer%stop("blas")
     544             :               endif
     545     6402576 :               n_iter = n_iter+1
     546             : #else /* WITH_OPENMP */
     547             : 
     548             :               ! multiplication by blocks is efficient only for CPU
      549             :               ! for GPU we introduced 2 other ways: either by stripes (more similar to the original
      550             :               ! CPU implementation) or by one large matrix-vector multiply
     551     6402576 :               if (.not. useGPU) then
     552     6402576 :                 if (wantDebug) call obj%timer%start("blas")
     553             :                 call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
     554             :                             l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
     555             :                             ONE, a_mat(l_row_beg, l_col_beg), lda,         &
     556             :                             v_row(l_row_beg), 1,                           &
     557     6402576 :                             ONE, u_col(l_col_beg), 1)
     558             : 
     559     6402576 :                 if (i/=j) then
     560             : 
     561             :                   call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,  &
     562             :                                       ONE, a_mat(l_row_beg,l_col_beg), lda,               &
     563             :                                       v_col(l_col_beg), 1,                                &
     564     3596496 :                                       ONE, u_row(l_row_beg), 1)
     565             :                 endif
     566     6402576 :                 if (wantDebug) call obj%timer%stop("blas")
     567             :               endif ! not useGPU
     568             : 
     569             : #endif /* WITH_OPENMP */
     570             :             enddo  ! j=0,i
     571             :           enddo  ! i=0,(istep-2)/tile_size
     572             : 
     573     2541888 :           if (useGPU) then
     574             :             if(mat_vec_as_one_block) then
     575             :               ! Unlike for CPU, we (for each MPI thread) do just one large mat-vec multiplication
      576             :               ! this requires altering the algorithm when later explicitly updating the matrix
      577             :               ! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
     578           0 :               if (wantDebug) call obj%timer%start("cublas")
     579             :               call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols,  &
     580             :                                         ONE, a_dev, lda,                   &
     581             :                                         v_row_dev , 1,                          &
     582           0 :                                         ONE, u_col_dev, 1)
     583             : 
     584             :        ! todo: try with non transposed!!!
     585             : !                 if(i/=j) then
     586             : !                   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
     587             : !                                             ONE, a_dev + a_offset, lda,                        &
     588             : !                                             v_col_dev + (l_col_beg - 1) *                      &
     589             : !                                             size_of_datatype, 1,                          &
     590             : !                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
     591             : !                                             size_of_datatype, 1)
     592             : !                 endif
     593           0 :               if (wantDebug) call obj%timer%stop("cublas")
     594             : 
     595             :             else
      596             :               ! perform multiplication by stripes - it is faster than by blocks, since we call cublas with
      597             :               ! larger matrices. In general, however, this algorithm is very similar to the CPU one
     598             :               do i=0,(istep-2)/tile_size
     599             :                   l_col_beg = i*l_cols_per_tile+1
     600             :                   l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
     601             :                   if(l_col_end<l_col_beg) cycle
     602             : 
     603             :                   l_row_beg = 1
     604             :                   l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
     605             : 
     606             :                   a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
     607             :                           size_of_datatype
     608             : 
     609             :                   call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
     610             :                               l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
     611             :                               ONE, a_dev + a_offset, lda,  &
     612             :                               v_row_dev + (l_row_beg - 1) * size_of_datatype, 1,  &
     613             :                               ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
     614             :               enddo
     615             : 
     616             :               do i=0,(istep-2)/tile_size
     617             :                   l_col_beg = i*l_cols_per_tile+1
     618             :                   l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
     619             :                   if(l_col_end<l_col_beg) cycle
     620             : 
     621             :                   l_row_beg = 1
     622             :                   l_row_end = min(l_rows,i*l_rows_per_tile)
     623             : 
     624             :                   a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
     625             :                           size_of_datatype
     626             : 
     627             :                   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
     628             :                               ONE, a_dev + a_offset, lda, &
     629             :                               v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
     630             :                               ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
     631             :               enddo
     632             :             end if !multiplication as one block / per stripes
     633             : 
     634           0 :             successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
     635           0 :             check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
     636             : 
     637           0 :             successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
     638           0 :             check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
     639             :           endif
     640             : 
     641             : !              call PRECISION_SYMV('U', l_cols,  &
     642             : !                         1.d0, a_mat, ubound(a_mat,1),  &
     643             : !                         v_row, 1,  &
     644             : !                         0.d0, u_col, 1)
     645             : 
     646             : !            endif ! useGPU
     647             : 
     648             : #ifdef WITH_OPENMP
     649             : !$OMP END PARALLEL
     650     1270944 :           call obj%timer%stop("OpenMP parallel")
     651             : 
     652     2541888 :           do i=0,max_threads-1
     653     1270944 :             u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
     654     1270944 :             u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
     655             :           enddo
     656             : #endif /* WITH_OPENMP */
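
The OpenMP pattern above (uc_p/ur_p indexed by the thread number, then summed serially after the parallel region) avoids a race on u_col/u_row without atomics. A minimal standalone sketch of the same pattern, with made-up sizes and names (not ELPA code):

    program per_thread_accumulation_sketch
      use omp_lib
      implicit none
      integer, parameter :: n = 8
      double precision :: u(n)
      double precision, allocatable :: u_p(:,:)
      integer :: t, nt
      nt = omp_get_max_threads()
      allocate(u_p(n, 0:nt-1))
      u_p = 0.0d0
    !$omp parallel private(t)
      t = omp_get_thread_num()
      u_p(:, t) = u_p(:, t) + 1.0d0       ! each thread accumulates into its own column
    !$omp end parallel
      u = 0.0d0
      do t = 0, nt - 1                    ! serial reduction over the thread columns
        u = u + u_p(:, t)
      end do
      print *, sum(u)                     ! n * (number of threads in the team)
    end program per_thread_accumulation_sketch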
     657             : 
      658             :           ! second, calculate the (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
     659     2541888 :           if (n_stored_vecs > 0) then
     660     2457408 :             if (wantDebug) call obj%timer%start("blas")
     661             : #if REALCASE == 1
     662             :             call PRECISION_GEMV('T',     &
     663             : #endif
     664             : #if COMPLEXCASE == 1
     665             :             call PRECISION_GEMV('C',     &
     666             : #endif
     667             :                                 l_rows, 2*n_stored_vecs,   &
     668             :                                 ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),   &
     669     2457408 :                                 v_row,  1, ZERO, aux, 1)
     670             : 
     671             :             call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs,   &
     672             :                                 ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1),   &
     673     2457408 :                                 aux, 1, ONE, u_col,  1)
     674     2457408 :             if (wantDebug) call obj%timer%stop("blas")
     675             :           endif
     676             : 
     677             :         endif  ! (l_rows>0 .and. l_cols>0)
     678             : 
     679             :         ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
     680             :         ! on the processors containing the diagonal
     681             :         ! This is only necessary if u_row has been calculated, i.e. if the
     682             :         ! global tile size is smaller than the global remaining matrix
     683             : 
     684     2587968 :         if (tile_size < istep-1) then
     685             : 
     686             :           call elpa_reduce_add_vectors_&
     687             :           &MATH_DATATYPE&
     688             :           &_&
     689             :           &PRECISION &
     690             :           (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
     691     1185792 :            mpi_comm_cols, istep-1, 1, nblk)
     692             : 
     693             :         endif
     694             : 
     695             :         ! Sum up all the u_col(:) parts, transpose u_col -> u_row
     696             : 
     697     2587968 :         if (l_cols>0) then
     698     2587968 :           tmp(1:l_cols) = u_col(1:l_cols)
     699             : #ifdef WITH_MPI
     700     1725312 :           if (wantDebug) call obj%timer%start("mpi_communication")
     701             :           call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION,    &
     702     1725312 :                              MPI_SUM, mpi_comm_rows, mpierr)
     703     1725312 :           if (wantDebug) call obj%timer%stop("mpi_communication")
     704             : #else /* WITH_MPI */
     705      862656 :           u_col = tmp
     706             : #endif /* WITH_MPI */
     707             :         endif
     708             : 
     709             :         call elpa_transpose_vectors_&
     710             :         &MATH_DATATYPE&
     711             :         &_&
     712             :         &PRECISION &
     713             :         (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
     714     2587968 :          mpi_comm_rows, 1, istep-1, 1, nblk)
     715             : 
     716             :         ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
     717             : #if REALCASE == 1
     718     1315296 :         x = 0
     719     1315296 :         if (l_cols>0)  &
     720     1315296 :            x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
     721             : #endif
     722             : #if COMPLEXCASE == 1
     723     1272672 :         xc = 0
     724     1272672 :         if (l_cols>0)  &
     725     1272672 :            xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
     726             : #endif
     727             : 
     728             : #ifdef WITH_MPI
     729     1725312 :         if (wantDebug) call obj%timer%start("mpi_communication")
     730             : #if REALCASE == 1
     731      876864 :         call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
     732             : #endif
     733             : #if COMPLEXCASE == 1
     734      848448 :         call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
     735             : #endif
     736     1725312 :         if (wantDebug) call obj%timer%stop("mpi_communication")
     737             : #else /* WITH_MPI */
     738             : 
     739             : #if REALCASE == 1
     740      438432 :         vav = x
     741             : #endif
     742             : #if COMPLEXCASE == 1
     743      424224 :         vav = xc
     744             : #endif
     745             : 
     746             : #endif /* WITH_MPI */
     747             : 
     748             :         ! store u and v in the matrices U and V
     749             :         ! these matrices are stored combined in one here
      750             :         ! these two matrices are stored combined into one array here
     751   540033024 :         do j=1,l_rows
     752             : #if REALCASE == 1
     753   269795232 :           vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
     754   269795232 :           vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
     755             : #endif
     756             : #if COMPLEXCASE == 1
     757   267649824 :           vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
     758   267649824 :           vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
     759             : #endif
     760             :         enddo
     761   808755552 :         do j=1,l_cols
     762             : #if REALCASE == 1
     763   404692848 :           uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
     764   404692848 :           uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
     765             : #endif
     766             : #if COMPLEXCASE == 1
     767   401474736 :           uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
     768   401474736 :           uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
     769             : #endif
     770             :         enddo
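
For reference (an editorial note restating the standard symmetric rank-2 update in the real case, not text taken from the ELPA sources): the stored column pairs implement A := A - v*w**T - w*v**T with w = tau*u - 0.5*tau**2*(v**T*u)*v, because expanding (tau*v)*(0.5*tau*vav*v - u)**T + (0.5*tau*vav*v - u)*(tau*v)**T with vav = v**T*u gives -tau*(v*u**T + u*v**T) + tau**2*vav*v*v**T, which is exactly -v*w**T - w*v**T. In the complex case tau is conjugated and the transposes become conjugate transposes, as in the loops above.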
     771             : 
      772             :         ! We have calculated another Householder vector; the number of implicitly stored vectors has increased
     773     2587968 :         n_stored_vecs = n_stored_vecs+1
     774             : 
     775             :         ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
     776     2587968 :         if (n_stored_vecs == max_stored_uv .or. istep == 3) then
     777             : 
     778       84960 :           if (useGPU) then
     779             :             successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
     780             :                                       max_local_rows * 2 * max_stored_uv *          &
     781           0 :                                       size_of_datatype, cudaMemcpyHostToDevice)
     782           0 :             check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
     783             : 
     784             :             successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
     785             :                                       max_local_cols * 2 * max_stored_uv *          &
     786           0 :                                       size_of_datatype, cudaMemcpyHostToDevice)
     787           0 :             check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
     788             :           endif
     789             : 
     790      263520 :           do i = 0, (istep-2)/tile_size
     791             :             ! go over tiles above (or on) the diagonal
     792      178560 :             l_col_beg = i*l_cols_per_tile+1
     793      178560 :             l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
     794      178560 :             l_row_beg = 1
     795      178560 :             l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
     796      178560 :             if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
     797             :               cycle
     798             : 
     799             : 
     800      175008 :             if (useGPU) then
     801             :               if(.not. mat_vec_as_one_block) then
     802             :                 ! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
      803             :                 ! we then use the same calls as for the CPU version
     804             :                 if (wantDebug) call obj%timer%start("cublas")
     805             :                 call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,     &
     806             :                                           l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,                      &
     807             :                                           ONE, vu_stored_rows_dev + (l_row_beg - 1) *                                         &
     808             :                                           size_of_datatype,  &
     809             :                                           max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) *                              &
     810             :                                           size_of_datatype,  &
     811             :                                           max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) *            &
     812             :                                           size_of_datatype , lda)
     813             :                 if (wantDebug) call obj%timer%stop("cublas")
     814             :               endif
     815             :             else !useGPU
     816      175008 :               if (wantDebug) call obj%timer%start("blas")
     817             :               call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,                &
     818             :                                    l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,    &
     819             :                                    ONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1),   &
     820             :                                    uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1),        &
     821      175008 :                                    ONE, a_mat(l_row_beg,l_col_beg), lda)
     822      175008 :               if (wantDebug) call obj%timer%stop("blas")
     823             :             endif !useGPU
     824             :           enddo
     825             : 
     826       84960 :           if (useGPU) then
     827             :             if(mat_vec_as_one_block) then
      828             :               ! update the whole (remaining) part of the matrix, including tiles below the diagonal
      829             :               ! we can do that in one large cublas call
     830           0 :               if (wantDebug) call obj%timer%start("cublas")
     831             :               call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs,   &
     832             :                                         ONE, vu_stored_rows_dev, max_local_rows, &
     833             :                                         uv_stored_cols_dev, max_local_cols,  &
     834           0 :                                         ONE, a_dev, lda)
     835           0 :               if (wantDebug) call obj%timer%stop("cublas")
     836             :             endif
     837             :           endif
     838             : 
     839       84960 :           n_stored_vecs = 0
     840             :         endif
     841             : 
     842     2587968 :         if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
     843     1725312 :           if (useGPU) then
     844             :             !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
     845           0 :              a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
     846             : 
     847             :              successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
     848           0 :                                        1 *  size_of_datatype, cudaMemcpyDeviceToHost)
     849           0 :              check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
     850             : 
     851             :           endif
     852     1725312 :           if (n_stored_vecs > 0) then
     853             :             a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
     854     1668672 :                         + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
     855             :           end if
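                      :           ! NOTE (editorial): if rank-1 updates are still pending (n_stored_vecs > 0),
                      :           ! the dot product above folds those pending updates into this single
                      :           ! diagonal element before it is copied into d_vec(istep-1) just below.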
     856             : #if REALCASE == 1
     857      876864 :           d_vec(istep-1) = a_mat(l_rows,l_cols)
     858             : #endif
     859             : #if COMPLEXCASE == 1
     860      848448 :           d_vec(istep-1) = real(a_mat(l_rows,l_cols),kind=rk)
     861             : #endif
     862             : 
     863     1725312 :           if (useGPU) then
     864             :             !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
     865             :             !successCUDA = cuda_threadsynchronize()
     866             :             !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)
     867             : 
     868             :              successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
     869           0 :                                        int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice)
     870           0 :              check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
     871             :           endif
     872             :         endif
     873             : 
     874             :       enddo ! main cycle over istep=na,3,-1
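                      :       ! NOTE (editorial): the main loop handles istep = na, ..., 3; the first
                      :       ! diagonal and off-diagonal entries (d_vec(1), e_vec(1) and, in the complex
                      :       ! case, the Householder factor tau(2)) are stored by the code below, before
                      :       ! d_vec and e_vec are summed up over all processes.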
     875             : 
     876             : #if COMPLEXCASE == 1
     877             :       ! Store e_vec(1) and d_vec(1)
     878             : 
     879        4464 :       if (my_pcol==pcol(2, nblk, np_cols)) then
     880        4464 :         if (my_prow==prow(1, nblk, np_rows)) then
      881             :           ! We use the last l_cols value from the loop above
     882        2976 :           if(useGPU) then
     883             :             successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
     884           0 :                                     1 * size_of_datatype, cudaMemcpyDeviceToHost)
     885           0 :             check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
     886           0 :             vrl = aux3(1)
     887             :           else !useGPU
     888        2976 :             vrl = a_mat(1,l_cols)
     889             :           endif !useGPU
     890             :           call hh_transform_complex_&
     891             :                                     &PRECISION &
     892        2976 :                                     (obj, vrl, 0.0_rk, xf, tau(2), wantDebug)
     893             : #if REALCASE == 1
     894             :           e_vec(1) = vrl
     895             : #endif
     896             : #if COMPLEXCASE == 1
     897        2976 :           e_vec(1) = real(vrl,kind=rk)
     898             : #endif
     899             : 
     900             : 
     901        2976 :           a_mat(1,l_cols) = 1. ! for consistency only
     902             :         endif
     903             : #ifdef WITH_MPI
     904        2976 :         if (wantDebug) call obj%timer%start("mpi_communication")
     905        2976 :         call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
     906        2976 :         if (wantDebug) call obj%timer%stop("mpi_communication")
     907             : 
     908             : #endif /* WITH_MPI */
     909             :       endif
     910             : 
     911             : #ifdef WITH_MPI
     912        2976 :       if (wantDebug) call obj%timer%start("mpi_communication")
     913        2976 :       call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
     914        2976 :       if (wantDebug) call obj%timer%stop("mpi_communication")
     915             : 
     916             : #endif /* WITH_MPI */
     917        4464 :       if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols))  then
     918        2976 :         if(useGPU) then
     919             :           successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
     920           0 :                                     1 * size_of_datatype, cudaMemcpyDeviceToHost)
     921           0 :           check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
     922           0 :           d_vec(1) = PRECISION_REAL(aux3(1))
     923             :         else !useGPU
     924        2976 :           d_vec(1) = PRECISION_REAL(a_mat(1,1))
     925             :         endif !useGPU
     926             :       endif
     927             : 
     928             : #endif /* COMPLEXCASE == 1 */
     929             : 
     930             : #if REALCASE == 1
     931             :       ! Store e_vec(1)
     932             : 
     933        4752 :       if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
     934        3168 :         if(useGPU) then
     935             :           successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
     936           0 :                                     1 * size_of_datatype, cudaMemcpyDeviceToHost)
     937           0 :           check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
     938             :         else !useGPU
      939        3168 :           e_vec(1) = a_mat(1,l_cols) ! use the last l_cols value from the loop above
     940             :         endif !useGPU
     941             :       endif
     942             : 
     943             :      ! Store d_vec(1)
     944        4752 :       if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
     945        3168 :         if(useGPU) then
     946           0 :           successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
     947           0 :           check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
     948             :         else !useGPU
     949        3168 :           d_vec(1) = a_mat(1,1)
     950             :         endif !useGPU
     951             :       endif
     952             : #endif
     953             : 
     954        9216 :       deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
     955        9216 :       if (istat .ne. 0) then
     956             : #if REALCASE == 1
      957           0 :         print *,"tridiag_real: error when deallocating tmp and the row/column work vectors "//errorMessage
     958             : #endif
     959             : #if COMPLEXCASE == 1
      960           0 :         print *,"tridiag_complex: error when deallocating tmp and the row/column work vectors "//errorMessage
     961             : #endif
     962           0 :         stop 1
     963             :       endif
     964             : 
     965        9216 :       if (useGPU) then
     966             :         ! todo: should we leave a_mat on the device for further use?
     967           0 :         successCUDA = cuda_free(a_dev)
     968           0 :         check_dealloc_cuda("tridiag: a_dev 9", successCUDA)
     969             : 
     970           0 :         successCUDA = cuda_free(v_row_dev)
     971           0 :         check_dealloc_cuda("tridiag: v_row_dev", successCUDA)
     972             : 
     973           0 :         successCUDA = cuda_free(u_row_dev)
      974           0 :         check_dealloc_cuda("tridiag: u_row_dev", successCUDA)
     975             : 
     976           0 :         successCUDA = cuda_free(v_col_dev)
     977           0 :         check_dealloc_cuda("tridiag: v_col_dev", successCUDA)
     978             : 
     979           0 :         successCUDA = cuda_free(u_col_dev)
      980           0 :         check_dealloc_cuda("tridiag: u_col_dev", successCUDA)
     981             : 
     982           0 :         successCUDA = cuda_free(vu_stored_rows_dev)
      983           0 :         check_dealloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
     984             : 
     985           0 :         successCUDA = cuda_free(uv_stored_cols_dev)
      986           0 :         check_dealloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
     987             :       endif
     988             : 
     989             :       ! distribute the arrays d_vec and e_vec to all processors
     990             : 
     991             : #if REALCASE == 1
     992        4752 :       allocate(tmp(na), stat=istat, errmsg=errorMessage)
     993             : 
     994        4752 :       if (istat .ne. 0) then
     995           0 :         print *,"tridiag_real: error when allocating tmp "//errorMessage
     996           0 :         stop 1
     997             :       endif
     998             : #endif
     999             : #if COMPLEXCASE == 1
    1000        4464 :       allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
    1001        4464 :       if (istat .ne. 0) then
    1002           0 :         print *,"tridiag_complex: error when allocating tmp_real "//errorMessage
    1003           0 :         stop 1
    1004             :       endif
    1005             : #endif
    1006             : 
    1007             : 
    1008             : #ifdef WITH_MPI
    1009        6144 :       if (wantDebug) call obj%timer%start("mpi_communication")
    1010             : #if REALCASE == 1
    1011        3168 :       tmp = d_vec
    1012        3168 :       call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
    1013        3168 :       tmp = d_vec
    1014        3168 :       call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
    1015        3168 :       tmp = e_vec
    1016        3168 :       call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
    1017        3168 :       tmp = e_vec
    1018        3168 :       call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
    1019             : #endif
    1020             : #if COMPLEXCASE == 1
    1021        2976 :       tmp_real = d_vec
    1022        2976 :       call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
    1023        2976 :       tmp_real = d_vec
     1024        2976 :       call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
    1025        2976 :       tmp_real = e_vec
    1026        2976 :       call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
    1027        2976 :       tmp_real = e_vec
    1028        2976 :       call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
    1029             : #endif
    1030        6144 :       if (wantDebug) call obj%timer%stop("mpi_communication")
    1031             : #endif /* WITH_MPI */
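                      :       ! NOTE (editorial, hedged): each rank has filled in only the d_vec/e_vec
                      :       ! entries it computed locally (the remaining entries are assumed to be zero),
                      :       ! so the MPI_SUM reductions over mpi_comm_rows and then mpi_comm_cols leave
                      :       ! every rank with the complete vectors. The copy into tmp/tmp_real is needed
                      :       ! because MPI forbids aliasing the send and receive buffers; a hypothetical
                      :       ! in-place variant would look like:
                      :       !   call mpi_allreduce(MPI_IN_PLACE, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, &
                      :       !                      mpi_comm_rows, mpierr)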
    1032             : 
    1033             : #if REALCASE == 1
    1034        4752 :       deallocate(tmp,  stat=istat, errmsg=errorMessage)
    1035        4752 :       if (istat .ne. 0) then
    1036           0 :         print *,"tridiag_real: error when deallocating tmp "//errorMessage
    1037           0 :         stop 1
    1038             :       endif
    1039             : #endif
    1040             : #if COMPLEXCASE == 1
    1041        4464 :       deallocate(tmp_real, stat=istat, errmsg=errorMessage)
    1042        4464 :       if (istat .ne. 0) then
    1043           0 :         print *,"tridiag_complex: error when deallocating tmp_real "//errorMessage
    1044           0 :         stop 1
    1045             :       endif
    1046             : #endif
    1047             : 
    1048             :       call obj%timer%stop("tridiag_&
    1049             :       &MATH_DATATYPE&
    1050             :       &" // &
    1051             :       &PRECISION_SUFFIX &
    1052        9216 :       )
    1053             :     end subroutine tridiag_&
    1054             :     &MATH_DATATYPE&
    1055             :     &_&
    1056        9216 :     &PRECISION

Generated by: LCOV version 1.12