LCOV - code coverage report
Current view: top level - src/elpa2 - elpa2_bandred_template.F90 (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53
Coverage:  Lines: 240 of 540 (44.4 %)   Functions: 19 of 19 (100.0 %)

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
      12             : !      Schwerpunkt Wissenschaftliches Rechnen,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      14             : !    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
      15             : !      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !    This particular source code file contains additions, changes and
      20             : !    enhancements authored by Intel Corporation which is not part of
      21             : !    the ELPA consortium.
      22             : !
      23             : !    More information can be found here:
      24             : !    http://elpa.mpcdf.mpg.de/
      25             : !
      26             : !    ELPA is free software: you can redistribute it and/or modify
      27             : !    it under the terms of the version 3 of the license of the
      28             : !    GNU Lesser General Public License as published by the Free
      29             : !    Software Foundation.
      30             : !
      31             : !    ELPA is distributed in the hope that it will be useful,
      32             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      33             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      34             : !    GNU Lesser General Public License for more details.
      35             : !
      36             : !    You should have received a copy of the GNU Lesser General Public License
      37             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      38             : !
      39             : !    ELPA reflects a substantial effort on the part of the original
      40             : !    ELPA consortium, and we ask you to respect the spirit of the
      41             : !    license that we chose: i.e., please contribute any changes you
      42             : !    may have back to the original ELPA library distribution, and keep
      43             : !    any derivatives of ELPA under the same license that we chose for
      44             : !    the original distribution, the GNU Lesser General Public License.
      45             : !
      46             : !
      47             : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
      48             : !
      49             : ! Copyright of the original code rests with the authors inside the ELPA
      50             : ! consortium. The copyright of any additional modifications shall rest
      51             : ! with their original authors, but shall adhere to the licensing terms
      52             : ! distributed along with the original code in the file "COPYING".
      53             : 
      54             : 
      55             : 
      56             : ! ELPA2 -- 2-stage solver for ELPA
      57             : !
      58             : ! Copyright of the original code rests with the authors inside the ELPA
      59             : ! consortium. The copyright of any additional modifications shall rest
      60             : ! with their original authors, but shall adhere to the licensing terms
      61             : ! distributed along with the original code in the file "COPYING".
      62             : #endif
      63             :     subroutine bandred_&
      64             :     &MATH_DATATYPE&
      65             :     &_&
      66       46584 :     &PRECISION &
      67       46584 :     (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
      68             :      tmat_dev, wantDebug, useGPU, success &
      69             : #if REALCASE == 1
      70             :      , useQR)
      71             : #endif
      72             : #if COMPLEXCASE == 1
      73             :      )
      74             : #endif
      75             : 
      76             :   !-------------------------------------------------------------------------------
       77             :   !  bandred_real/complex: Reduces a distributed symmetric (real) / Hermitian (complex) matrix to band form
      78             :   !
      79             :   !  Parameters
      80             :   !
      81             :   !  na          Order of matrix
      82             :   !
      83             :   !  a(lda,matrixCols)    Distributed matrix which should be reduced.
       84             :   !              Distribution is like in ScaLAPACK.
       85             :   !              As opposed to ScaLAPACK, a(:,:) must be set completely (upper and lower half).
      86             :   !              a(:,:) is overwritten on exit with the band and the Householder vectors
      87             :   !              in the upper half.
      88             :   !
      89             :   !  lda         Leading dimension of a
      90             :   !  matrixCols  local columns of matrix a
      91             :   !
      92             :   !  nblk        blocksize of cyclic distribution, must be the same in both directions!
      93             :   !
       94             :   !  nbw         semi-bandwidth of output matrix
      95             :   !
      96             :   !  mpi_comm_rows
      97             :   !  mpi_comm_cols
      98             :   !              MPI-Communicators for rows/columns
      99             :   !
     100             :   !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
     101             :   !              Factors for the Householder vectors (returned), needed for back transformation
     102             :   !
     103             :   !-------------------------------------------------------------------------------
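  !  A minimal caller sketch (illustration only; the instantiated routine name and
  !  the surrounding variables are assumptions, not taken from this file). The
  !  template expands once per MATH_DATATYPE/PRECISION, so a real, double-precision
  !  build is expected to provide roughly:
  !
  !      numBlocks = (na-1)/nbw + 1        ! e.g. na = 1000, nbw = 64 gives 16
  !      allocate(tmat(nbw,nbw,numBlocks))
  !      call bandred_real_double(obj, na, a, a_dev, lda, nblk, nbw, matrixCols,  &
  !                               numBlocks, mpi_comm_rows, mpi_comm_cols, tmat,  &
  !                               tmat_dev, wantDebug, useGPU, success, useQR)
  !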
     104             : 
     105             :       use cuda_functions
     106             :       use iso_c_binding
     107             :       use elpa1_compute
     108             : #ifdef WITH_OPENMP
     109             :       use omp_lib
     110             : #endif
     111             :       use precision
     112             :       use elpa_abstract_impl
     113             :       implicit none
     114             : #include "../general/precision_kinds.F90"
     115             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     116             :       integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
     117             : 
     118             : #ifdef USE_ASSUMED_SIZE
     119             :       MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
     120             : #else
     121             :       MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
     122             : #endif
     123             : 
     124             : #if REALCASE == 1
     125             :       real(kind=rk)                               :: eps
     126             : #endif
     127             :       logical, intent(in)                         :: useGPU
     128             : 
     129             :       integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
     130             :       integer(kind=ik)                            :: l_cols, l_rows
     131             : #if REALCASE == 1
     132             :       integer(kind=ik)                            :: vmrCols
     133             : #endif
     134             : #ifdef WITH_OPENMP
     135             :       integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
     136             : #endif
     137             :       integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
     138             :       integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
     139             :       integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
     140             : 
     141             :       real(kind=rk)                    :: vnorm2
     142      186336 :       MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
     143             : 
     144             : !      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
     145      139752 :       MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
     146      139752 :       MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
     147       46584 :       MATH_DATATYPE(kind=rck), allocatable :: vr(:)
     148             : 
     149             : #if REALCASE == 1
     150             :       ! needed for blocked QR decomposition
     151             :       integer(kind=ik)                            :: PQRPARAM(11), work_size
     152             :       real(kind=rk)                    :: dwork_size(1)
     153       56304 :       real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
     154             : #endif
     155             :       ! a_dev is passed from bandred_real to trans_ev_band
     156             :       integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
     157             : #ifdef WITH_MPI
     158             :       integer(kind=ik), external                  :: numroc
     159             : #endif
     160             :       integer(kind=ik)                            :: ierr
     161             :       integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
     162             :       integer(kind=c_intptr_t)                      :: lc_start, lc_end
     163             : #if COMPLEXCASE == 1
     164             :       integer(kind=c_intptr_t)                      :: lce_1, lcs_1, lre_1
     165             : #endif
     166             :       integer(kind=ik)                            :: lr_end
     167             :       integer(kind=ik)                            :: na_cols
     168             : #if COMPLEXCASE == 1
     169             :       integer(kind=ik)                            :: na_rows
     170             : #endif
     171             : 
     172             :       logical, intent(in)                         :: wantDebug
     173             :       logical, intent(out)                        :: success
     174             :       logical                                     :: successCUDA
     175             :       integer(kind=ik)                            :: istat
     176             :       character(200)                              :: errorMessage
     177             : 
     178             : #if REALCASE == 1
     179             :       logical, intent(in)                         :: useQR
     180             : #endif
     181             :       integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
     182             :                                                     ii, pp
     183             :       integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
     184             :                                                                         &PRECISION&
     185             :                                                                         &_&
     186             :                                                                         &MATH_DATATYPE
     187             : 
     188             :       call obj%timer%start("bandred_&
     189             :       &MATH_DATATYPE&
     190             :       &" // &
     191             :       &PRECISION_SUFFIX &
     192       46584 :       )
     193       46584 :       if (wantDebug) call obj%timer%start("mpi_communication")
     194             : 
     195       46584 :       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
     196       46584 :       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
     197       46584 :       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
     198       46584 :       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
     199             : 
     200       46584 :       if (wantDebug) call obj%timer%stop("mpi_communication")
     201       46584 :       success = .true.
     202             : 
     203             : 
      204             :       ! Semi-bandwidth nbw must be a multiple of blocksize nblk
     205       46584 :       if (mod(nbw,nblk)/=0) then
     206           0 :         if (my_prow==0 .and. my_pcol==0) then
     207           0 :           if (wantDebug) then
     208             :             write(error_unit,*) 'ELPA2_bandred_&
     209             :       &MATH_DATATYPE&
     210           0 :       &: ERROR: nbw=',nbw,', nblk=',nblk
     211             :             write(error_unit,*) 'ELPA2_bandred_&
     212             :       &MATH_DATATYPE&
     213           0 :       &: ELPA2 works only for nbw==n*nblk'
     214             :           endif
     215           0 :           success = .false.
     216           0 :           return
     217             :         endif
     218             :       endif
     219             : 
      220             : ! na_rows is used nowhere; only na_cols
     221       46584 :       if (useGPU) then
     222             : #ifdef WITH_MPI
     223             : #if COMPLEXCASE == 1
     224           0 :         na_rows = numroc(na, nblk, my_prow, 0, np_rows)
     225             : #endif
     226           0 :         na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
     227             : #else
     228             : #if COMPLEXCASE == 1
     229           0 :          na_rows = na
     230             : #endif
     231           0 :         na_cols = na
     232             : #endif /* WITH_MPI */
     233             : 
      234             :         ! Allocate device memory for a_dev (and, below, for tmat_dev and vav_dev)
     235           0 :         successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
     236           0 :         if (.not.(successCUDA)) then
     237             :           print *,"bandred_&
     238             :                   &MATH_DATATYPE&
     239           0 :                   &: error in cudaMalloc a_dev 1"
     240           0 :           stop 1
     241             :         endif
     242             : 
     243           0 :         successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
     244           0 :         if (.not.(successCUDA)) then
     245             :           print *,"bandred_&
     246             :                   &MATH_DATATYPE&
     247           0 :                   &: error in cudaMalloc tmat_dev 1"
     248           0 :           stop 1
     249             :         endif
     250             : 
     251           0 :         successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
     252           0 :         if (.not.(successCUDA)) then
     253             :           print *,"bandred_&
     254             :                   &MATH_DATATYPE&
     255           0 :                   &: error in cudaMalloc vav_dev 1"
     256           0 :           stop 1
     257             :         endif
     258             :       endif ! useGPU
     259             : 
     260             :       ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
     261             : 
     262       46584 :       tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
     263       46584 :       tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
     264             : 
     265       46584 :       l_rows_tile = tile_size/np_rows ! local rows of a tile
     266       46584 :       l_cols_tile = tile_size/np_cols ! local cols of a tile
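      ! Worked example with hypothetical values: nblk = 32, np_rows = 4, np_cols = 2
      ! gives least_common_multiple(4,2) = 4, so the minimum global tile size is
      ! 32*4 = 128; rounding up so that local tiles are at least 128 wide yields
      ! ((128*4-1)/128+1)*128 = 512, hence l_rows_tile = 512/4 = 128 and
      ! l_cols_tile = 512/2 = 256.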
     267             : 
     268             : #if REALCASE == 1
     269       28152 :       if (useQR) then
     270             : 
     271           0 :         if (useGPU) then
     272           0 :           print *,"qr decomposition at the moment not supported with GPU"
     273           0 :           stop 1
     274             :         endif
     275             : 
     276           0 :         if (which_qr_decomposition == 1) then
     277           0 :           call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
     278           0 :           allocate(tauvector(na), stat=istat, errmsg=errorMessage)
     279           0 :           if (istat .ne. 0) then
     280           0 :             print *,"bandred_real: error when allocating tauvector "//errorMessage
     281           0 :             stop 1
     282             :           endif
     283             : 
     284           0 :           allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
     285           0 :           if (istat .ne. 0) then
     286           0 :             print *,"bandred_real: error when allocating blockheuristic "//errorMessage
     287           0 :             stop 1
     288             :           endif
     289             : 
     290           0 :           l_rows = local_index(na, my_prow, np_rows, nblk, -1)
     291           0 :           allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
     292           0 :           if (istat .ne. 0) then
     293           0 :             print *,"bandred_real: error when allocating vmrCPU "//errorMessage
     294           0 :             stop 1
     295             :           endif
     296             : 
     297           0 :           vmrCols = na
     298             : 
     299             : #ifdef USE_ASSUMED_SIZE_QR
     300             :           call qr_pdgeqrf_2dcomm_&
     301             :                &PRECISION&
     302             :                &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
     303             :                                  nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
     304             :                                  mpi_comm_rows, mpi_comm_cols, blockheuristic)
     305             : 
     306             : #else
     307             :           call qr_pdgeqrf_2dcomm_&
     308             :                &PRECISION&
     309             :                &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
     310             :                                  vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
     311             :                                  nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
     312           0 :                                  mpi_comm_rows, mpi_comm_cols, blockheuristic)
     313             : #endif
     314             : 
     315           0 :           work_size = int(dwork_size(1))
     316           0 :           allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
     317           0 :           if (istat .ne. 0) then
     318           0 :             print *,"bandred_real: error when allocating work_blocked "//errorMessage
     319           0 :             stop 1
     320             :           endif
     321           0 :           work_blocked = 0.0_rk
     322           0 :           deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
     323           0 :           if (istat .ne. 0) then
     324           0 :             print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
     325           0 :             stop 1
     326             :           endif
     327             : 
     328             :         endif ! which_qr_decomposition
     329             : 
     330             :       endif ! useQr
     331             : #endif /* REALCASE */
     332             : 
     333       46584 :       if (useGPU) then
     334             : 
     335           0 :         cur_l_rows = 0
     336           0 :         cur_l_cols = 0
     337             : 
     338           0 :         successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
     339           0 :         if (.not.(successCUDA)) then
     340             :           print *,"bandred_&
     341             :                   &MATH_DATATYPE&
     342           0 :                   &: error in cudaMemcpy a_dev 2"
     343           0 :           stop 1
     344             :         endif
     345             :       endif ! useGPU
     346             : 
     347             : 
     348      222696 :       do istep = (na-1)/nbw, 1, -1
     349             : 
     350      176112 :         n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
     351             : 
     352             :         ! Number of local columns/rows of remaining matrix
     353      176112 :         l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     354      176112 :         l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
     355             : 
     356             :         ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
     357             : 
     358      176112 :         if (useGPU) then
     359           0 :           cur_l_rows = max(l_rows, 1)
     360           0 :           cur_l_cols = max(l_cols, 1)
     361             : 
     362           0 :           vmr_size = cur_l_rows * 2 * n_cols
     363           0 :           umc_size = cur_l_cols * 2 * n_cols
     364             : 
      365             :           ! Allocate vmr and umc only if the new size exceeds their current capacity
     366             :           ! Added for FORTRAN CALLS
     367           0 :           if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
     368           0 :             if (allocated(vr)) then
     369           0 :               deallocate(vr, stat=istat, errmsg=errorMessage)
     370           0 :               if (istat .ne. 0) then
     371             :                 print *,"bandred_&
     372             :                         &MATH_DATATYPE&
     373           0 :                         &: error when deallocating vr "//errorMessage
     374           0 :                 stop 1
     375             :               endif
     376             :             endif
     377           0 :             allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
     378           0 :             if (istat .ne. 0) then
     379             :               print *,"bandred_&
     380             :                       &MATH_DATATYPE&
     381           0 :                       &: error when allocating vr "//errorMessage
     382           0 :               stop 1
     383             :             endif
     384             : 
     385             :           endif
     386             : 
     387           0 :           if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
     388           0 :             if (allocated(vmrCUDA)) then
     389           0 :               deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
     390           0 :               if (istat .ne. 0) then
     391             :                 print *,"bandred_&
     392             :                         &MATH_DATATYPE&
     393           0 :                         &: error when allocating vmrCUDA "//errorMessage
     394           0 :                 stop 1
     395             :               endif
     396             : 
     397           0 :               successCUDA = cuda_free(vmr_dev)
     398           0 :               if (.not.(successCUDA)) then
     399             :                 print *,"bandred_&
     400           0 :                         &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
     401           0 :                 stop 1
     402             :               endif
     403             :             endif
     404             : 
     405           0 :             allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
     406             : 
     407           0 :             if (istat .ne. 0) then
     408             :               print *,"bandred_&
     409             :                       &MATH_DATATYPE&
     410           0 :                       &: error when allocating vmrCUDA "//errorMessage
     411           0 :               stop 1
     412             :             endif
     413           0 :             successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
     414           0 :             if (.not.(successCUDA)) then
     415             :               print *,"bandred_&
     416             :                       &MATH_DATATYPE&
     417           0 :                       &: error in cudaMalloc: vmr_dev2"
     418           0 :               stop 1
     419             :             endif
     420             : 
     421             :           endif
     422             : 
     423           0 :           if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
     424           0 :             if (allocated(umcCUDA)) then
     425           0 :               deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
     426           0 :               if (istat .ne. 0) then
     427             :                 print *,"bandred_&
     428             :                         &MATH_DATATYPE&
     429           0 :                         &: error when deallocating umcCUDA "//errorMessage
     430           0 :                 stop 1
     431             :               endif
     432             : 
     433           0 :               successCUDA = cuda_free(umc_dev)
     434           0 :               if (.not.(successCUDA)) then
     435             :                  print *,"bandred_&
     436             :                          &MATH_DATATYPE&
     437           0 :                          &: error in cudaFree umc_dev 1"
     438           0 :                  stop 1
     439             :               endif
     440             : 
     441             :             endif
     442             : 
     443           0 :             allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
     444             : 
     445           0 :             if (istat .ne. 0) then
     446             :               print *,"bandred_&
     447             :                       &MATH_DATATYPE&
     448           0 :                       &: error when deallocating umcCUDA "//errorMessage
     449           0 :               stop 1
     450             :             endif
     451             : 
     452           0 :             successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
     453           0 :             if (.not.(successCUDA)) then
     454             :               print *,"bandred_&
     455             :                       &MATH_DATATYPE&
     456           0 :                       &: error in cudaMalloc umc_dev 2"
     457           0 :               stop 1
     458             :             endif
     459             : 
     460             :           endif
     461             : 
     462             :         else ! GPU not used
     463             : 
      464             :           ! unify the names vmr and vmrCPU, as well as vmrGPU
     465             :           ! the same for umcCPU and umcGPU
     466             :           ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
     467             : 
     468      176112 :           allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
     469      176112 :           if (istat .ne. 0) then
     470             :             print *,"bandred_&
     471             :                      &MATH_DATATYPE&
     472           0 :                      &: error when allocating vmrCPU "//errorMessage
     473           0 :             stop 1
     474             :           endif
     475             : 
     476      176112 :           allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
     477      176112 :           if (istat .ne. 0) then
     478             :             print *,"bandred_&
     479             :                     &MATH_DATATYPE&
     480           0 :                     &: error when allocating umcCPU "//errorMessage
     481           0 :             stop 1
     482             :           endif
     483             : 
     484      176112 :           allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
     485      176112 :           if (istat .ne. 0) then
     486             :             print *,"bandred_&
     487             :                     &MATH_DATATYPE&
     488           0 :                     &: error when allocating vr "//errorMessage
     489           0 :             stop 1
     490             :           endif
     491             : 
     492             :         endif ! use GPU
     493             : 
     494      176112 :         if (useGPU) then
     495           0 :           vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
     496             :         else
     497      176112 :           vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
     498             :         endif ! useGPU
     499             : 
     500      176112 :         vr(:) = 0.0_rck
     501      176112 :         tmat(:,:,istep) = 0.0_rck
     502      176112 :         if (useGPU) then
     503             : #if REALCASE == 1
     504           0 :           umcCUDA(1 : umc_size) = 0.0_rck
     505             : #endif
     506           0 :           lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
     507           0 :           lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
     508           0 :           lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
     509             : 
     510           0 :           if (lc_start .le. 0) lc_start = 1
     511             : 
     512             :           ! Here we assume that the processor grid and the block grid are aligned
     513           0 :           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
     514             : 
     515           0 :           if (my_pcol == cur_pcol) then
     516             :             successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
     517             :                                       int((lda*size_of_datatype),kind=c_intptr_t), &
     518             :                                             (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
     519             :                                             int(lda*size_of_datatype,kind=c_intptr_t),              &
     520             :                                             int(lr_end*size_of_datatype,kind=c_intptr_t),           &
     521           0 :                                              int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
     522             : 
     523             : 
     524           0 :             if (.not.(successCUDA)) then
     525             :               print *,"bandred_&
     526             :                       &MATH_DATATYPE&
     527           0 :                       &: error in cudaMemcpy2d"
     528           0 :               stop 1
     529             :             endif
     530             :           endif
     531             :         endif ! useGPU
     532             : 
     533             :         ! Reduce current block to lower triangular form
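        ! (In the non-QR path below, one Householder vector is generated per column
        !  of the current block, proceeding from the last column lc = n_cols down to
        !  lc = 1; the columns to the left of lc are updated immediately with each
        !  reflector.)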
     534             : #if REALCASE == 1
     535       71280 :         if (useQR) then
     536           0 :           if (which_qr_decomposition == 1) then
     537           0 :             vmrCols = 2*n_cols
     538             : #ifdef USE_ASSUMED_SIZE_QR
     539             :             call qr_pdgeqrf_2dcomm_&
     540             :                  &PRECISION&
     541             :                  &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
     542             :                                    na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
     543             :                                      work_size, na, n_cols, nblk, nblk,        &
     544             :                                      istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
     545             :                                      0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
     546             :                                      blockheuristic)
     547             : 
     548             : #else
     549             :             call qr_pdgeqrf_2dcomm_&
     550             :                  &PRECISION&
     551             :                  &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
     552             :                                     max(l_rows,1), vmrCols, tauvector(1:na), na, &
     553             :                                      tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
     554             :                                      work_size, na, n_cols, nblk, nblk,        &
     555             :                                      istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
     556             :                                      0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
     557           0 :                                      blockheuristic)
     558             : #endif
     559             :           endif
     560             : 
     561             :        else !useQR
     562             : #endif /* REALCASE == 1 */
     563     6683976 :          do lc = n_cols, 1, -1
     564             : 
     565     6554448 :            ncol = istep*nbw + lc ! absolute column number of householder Vector
     566     6554448 :            nrow = ncol - nbw ! Absolute number of pivot row
     567             : 
     568     6554448 :            lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
     569     6554448 :            lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
     570             : 
     571     6554448 :            tau = 0
     572             : 
     573     6554448 :            if (nrow == 1) exit ! Nothing to do
     574             : 
     575     6507864 :            cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
     576             : 
     577     6507864 :            if (my_pcol==cur_pcol) then
     578             : 
     579             :              ! Get Vector to be transformed; distribute last element and norm of
     580             :              ! remaining elements to all procs in current column
     581             : 
     582     6507864 :              vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
     583             : 
     584     6507864 :              if (my_prow==prow(nrow, nblk, np_rows)) then
     585     4338576 :                aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
     586     4338576 :                aux1(2) = vr(lr)
     587             :              else
     588     2169288 :                aux1(1) = dot_product(vr(1:lr),vr(1:lr))
     589     2169288 :                aux1(2) = 0.0_rck
     590             :              endif
     591             : 
     592             : #ifdef WITH_MPI
     593     4338576 :              if (wantDebug) call obj%timer%start("mpi_communication")
     594             :              call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
     595     4338576 :                                 MPI_SUM, mpi_comm_rows, mpierr)
     596     4338576 :              if (wantDebug) call obj%timer%stop("mpi_communication")
     597             : 
     598             : #else /* WITH_MPI */
     599     2169288 :               aux2 = aux1 ! this should be optimized
     600             : #endif
     601             : 
     602             : #if REALCASE == 1
     603     3372120 :              vnorm2 = aux2(1)
     604             : #endif
     605             : #if COMPLEXCASE == 1
     606     3135744 :              vnorm2 = real(aux2(1),kind=rk)
     607             : #endif
     608     6507864 :              vrl    = aux2(2)
     609             : 
     610             :              ! Householder transformation
     611             :        call hh_transform_&
     612             :              &MATH_DATATYPE&
     613             :              &_&
     614             :              &PRECISION &
     615     6507864 :                          (obj, vrl, vnorm2, xf, tau, wantDebug)
     616             :              ! Scale vr and store Householder Vector for back transformation
     617             : 
     618     6507864 :              vr(1:lr) = vr(1:lr) * xf
     619     6507864 :              if (my_prow==prow(nrow, nblk, np_rows)) then
     620     4338576 :                a(1:lr-1,lch) = vr(1:lr-1)
     621     4338576 :                a(lr,lch) = vrl
     622     4338576 :                vr(lr) = 1.0_rck
     623             :              else
     624     2169288 :                a(1:lr,lch) = vr(1:lr)
     625             :              endif
     626             : 
     627             :            endif
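           ! In outline (a sketch of the standard Householder relation, not a
           ! statement about hh_transform internals): the reflector
           ! H = I - tau * v * v^H, with v normalised so that its entry in pivot
           ! row nrow equals 1, maps the original column to (0,...,0,vrl)^T, i.e.
           ! all entries above row nrow are annihilated; xf is the scaling factor
           ! that produces this normalisation of vr.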
     628             : 
     629             :            ! Broadcast Householder Vector and tau along columns
     630             : 
     631     6507864 :            vr(lr+1) = tau
     632             : #ifdef WITH_MPI
     633     4338576 :            if (wantDebug) call obj%timer%start("mpi_communication")
     634             :            call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
     635     4338576 :                           cur_pcol, mpi_comm_cols, mpierr)
     636     4338576 :            if (wantDebug) call obj%timer%stop("mpi_communication")
     637             : 
     638             : #endif /* WITH_MPI */
     639             : 
     640     6507864 :            if (useGPU) then
     641           0 :              vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
     642             :            else
     643     6507864 :              vmrCPU(1:lr,lc) = vr(1:lr)
     644             :            endif
     645     6507864 :            tau = vr(lr+1)
     646             : 
     647             : #if REALCASE == 1
     648     3372120 :            tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
     649             : #endif
     650             : #if COMPLEXCASE == 1
     651     3135744 :            tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
     652             : #endif
     653             :            ! Transform remaining columns in current block with Householder Vector
     654             :            ! Local dot product
     655             : 
     656     6507864 :            aux1 = 0.0_rck
     657             : 
     658             : #ifdef WITH_OPENMP
     659             : #if 0
     660             :  ! original complex implementation without openmp. check performance
     661             :             nlc = 0 ! number of local columns
     662             :            do j=1,lc-1
     663             :              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
     664             :              if (lcx>0) then
     665             :                nlc = nlc+1
     666             :                aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
     667             :              endif
     668             :            enddo
     669             : 
     670             :            ! Get global dot products
     671             : #ifdef WITH_MPI
     672             :            if (wantDebug) call obj%timer%start("mpi_communication")
     673             :            if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
     674             : 
     675             :            ! Transform
     676             : 
     677             :            nlc = 0
     678             :            do j=1,lc-1
     679             :              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
     680             :              if (lcx>0) then
     681             :                nlc = nlc+1
     682             :                a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
     683             : 
     684             :              endif
     685             :            enddo
     686             : 
     687             : 
     688             :            if (wantDebug) call obj%timer%stop("mpi_communication")
     689             : 
     690             : #else /* WITH_MPI */
     691             : !          if (nlc>0) aux2=aux1
     692             : 
     693             :            ! Transform
     694             : 
     695             :            nlc = 0
     696             :            do j=1,lc-1
     697             :              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
     698             :              if (lcx>0) then
     699             :                nlc = nlc+1
     700             :                a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
     701             :              endif
     702             :            enddo
     703             : 
     704             : #endif /* WITH_MPI */
     705             : !
     706             : !           ! Transform
     707             : !
     708             : !           nlc = 0
     709             : !           do j=1,lc-1
     710             : !             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
     711             : !             if (lcx>0) then
     712             : !               nlc = nlc+1
     713             : !               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
     714             : 
     715             : !             endif
     716             : !           enddo
     717             : #endif /* if 0 */
     718             : 
     719             :            !Open up one omp region to avoid paying openmp overhead.
     720             :            !This does not help performance due to the addition of two openmp barriers around the MPI call,
      721             :            !but in the future this may be beneficial if these barriers are replaced with a faster implementation
     722             : 
     723     6507864 :            !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
     724     3253932 :            mynlc = 0 ! number of local columns
     725             : 
     726             :            !This loop does not have independent iterations,
     727             :            !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
     728             :            !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
     729             :            !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
     730             :            !the iteration number modulo the number of threads
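           !(For example, with 4 threads, thread 0 handles j = 1, 5, 9, ...,
           ! thread 1 handles j = 2, 6, 10, ..., and so on.)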
     731    73733904 :            do j=1,lc-1
     732    70479972 :              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
     733    70479972 :              if (lcx>0 ) then
     734    70479972 :                mynlc = mynlc+1
     735    70479972 :                if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
     736    70479972 :                    if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
     737             :                endif
     738             :              endif
     739             :            enddo
     740             : 
     741             :            ! Get global dot products
     742             : 
     743     3253932 :            !$omp barrier
     744             :            !$omp single
     745             : #ifdef WITH_MPI
     746     2169288 :            if (wantDebug) call obj%timer%start("mpi_communication")
     747     2169288 :            if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
     748     2126112 :                                            MPI_SUM, mpi_comm_rows, mpierr)
     749     2169288 :            if (wantDebug) call obj%timer%stop("mpi_communication")
     750             : #else /* WITH_MPI */
     751     1084644 :            if (mynlc>0) aux2 = aux1
     752             : #endif /* WITH_MPI */
     753             :            !$omp end single
     754     3253932 :            !$omp barrier
     755             : 
     756             :            ! Transform
     757     3253932 :            transformChunkSize=32
     758     3253932 :            mynlc = 0
     759    73733904 :            do j=1,lc-1
     760    70479972 :              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
     761    70479972 :              if (lcx>0) then
     762    70479972 :                mynlc = mynlc+1
     763             :                !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
     764             :                !However, for some reason this is slower than doing it manually, so it is parallelized as below.
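               !(For example, with transformChunkSize = 32 and 4 threads, thread 0
               ! starts at ii = 0, 128, 256, ... and updates rows 1..32, 129..160, ...
               ! of a(:,lcx), thread 1 starts at ii = 32, 160, ... and so on; each
               ! chunk is clipped at lr.)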
     765   479168940 :                do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
     766  9942516564 :                   do pp = 1,transformChunkSize
     767  9674787540 :                       if (pp + ii > lr) exit
     768             : #if REALCASE == 1
     769  6205992816 :                           a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
     770             : #endif
     771             : #if COMPLEXCASE == 1
     772  3398314752 :                           a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
     773             : #endif
     774             :                   enddo
     775             :                enddo
     776             :              endif
     777             :            enddo
     778             :            !$omp end parallel
     779             : 
     780             : #else /* WITH_OPENMP */
     781             : 
     782     3253932 :            nlc = 0 ! number of local columns
     783    73733904 :            do j=1,lc-1
     784    70479972 :              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
     785    70479972 :              if (lcx>0) then
     786    70479972 :                nlc = nlc+1
     787    70479972 :                if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
     788             :              endif
     789             :            enddo
     790             : 
     791             :            ! Get global dot products
     792             : #ifdef WITH_MPI
     793     2169288 :            if (wantDebug) call obj%timer%start("mpi_communication")
     794     2169288 :            if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
     795     2126112 :                                          MPI_SUM, mpi_comm_rows, mpierr)
     796     2169288 :            if (wantDebug) call obj%timer%stop("mpi_communication")
     797             : #else /* WITH_MPI */
     798     1084644 :            if (nlc>0) aux2=aux1
     799             : #endif /* WITH_MPI */
     800             :            ! Transform
     801             : 
     802     3253932 :            nlc = 0
     803    73733904 :            do j=1,lc-1
     804    70479972 :              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
     805    70479972 :              if (lcx>0) then
     806    70479972 :                nlc = nlc+1
     807             : #if REALCASE == 1
     808    47040804 :                a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
     809             : #endif
     810             : #if COMPLEXCASE == 1
     811    23439168 :                a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
     812             : #endif
     813             :              endif
     814             :            enddo
     815             : #endif /* WITH_OPENMP */
     816             :          enddo ! lc
     817             : 
     818      176112 :          if (useGPU) then
     819             :            ! store column tiles back to GPU
     820           0 :            cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
     821           0 :            if (my_pcol == cur_pcol) then
     822             :              successCUDA = cuda_memcpy2d((a_dev+        &
     823             :                                          int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
     824             :                                          int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
     825             :                                          int(lda*size_of_datatype,kind=c_intptr_t),           &
     826             :                                          int(lr_end*size_of_datatype,kind=c_intptr_t),        &
     827             :                                          int((lc_end - lc_start+1),kind=c_intptr_t), &
     828           0 :                                          int(cudaMemcpyHostToDevice,kind=c_int))
     829             : 
     830           0 :              if (.not.(successCUDA)) then
     831             :                print *, "bandred_&
     832             :                         &MATH_DATATYPE&
     833           0 :                         &: cuda memcpy a_dev  failed ", istat
     834           0 :                stop 1
     835             :              endif
     836             :            endif
     837             :          endif
     838             : 
     839             :          ! Calculate scalar products of stored Householder vectors.
      840             :          ! This can be done in different ways; we use syrk (real) / herk (complex)
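         ! (In effect the call below accumulates the local contribution to the Gram
         !  matrix V^H * V of the stored Householder vectors into the upper triangle
         !  of vav; the allreduce over mpi_comm_rows afterwards completes the global
         !  product.)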
     841             : 
     842      176112 :          vav = 0
     843      176112 :          call obj%timer%start("blas")
     844      176112 :          if (useGPU) then
     845           0 :            if (l_rows>0) &
     846             : #if REALCASE == 1
     847             :              call PRECISION_SYRK('U', 'T',            &
     848             : #endif
     849             : #if COMPLEXCASE == 1
     850             :              call PRECISION_HERK('U', 'C',            &
     851             : #endif
     852             :                            n_cols, l_rows, ONE, &
     853             :                            vmrCUDA, cur_l_rows, &
     854           0 :                            ZERO, vav, ubound(vav,dim=1))
     855             : 
     856             :          else ! useGPU
     857      176112 :            if (l_rows>0) &
     858             : #if REALCASE == 1
     859             :              call PRECISION_SYRK('U', 'T',           &
     860             : #endif
     861             : #if COMPLEXCASE == 1
     862             :              call PRECISION_HERK('U', 'C',           &
     863             : #endif
     864      176112 :                            n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
     865             :          endif
     866      176112 :          call obj%timer%stop("blas")
     867             : #if REALCASE == 1
     868             :          call symm_matrix_allreduce_&
     869             : #endif
     870             : #if COMPLEXCASE == 1
     871             :          call herm_matrix_allreduce_&
     872             : #endif
     873             :          &PRECISION &
     874      176112 :                          (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
     875             :          ! Calculate triangular matrix T for block Householder Transformation
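         ! (tmat(:,:,istep) holds the triangular factor of the blocked, compact-WY
         !  representation of the n_cols reflectors of this step, so that the whole
         !  block transformation can later be applied as I - V * T * V^H, up to the
         !  ordering convention fixed by the loop below; vav = V^H * V supplies the
         !  required inner products.)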
     876      176112 :          call obj%timer%start("blas")
     877     6730560 :          do lc=n_cols,1,-1
     878     6554448 :            tau = tmat(lc,lc,istep)
     879     6554448 :            if (lc<n_cols) then
     880             :              call PRECISION_TRMV('U', BLAS_TRANS_OR_CONJ, 'N',&
     881     6378336 :                                  n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
     882             : 
     883             : #if REALCASE == 1
     884     3328992 :              tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
     885             : #endif
     886             : #if COMPLEXCASE == 1
     887     3049344 :              tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
     888             : #endif
     889             :            endif
     890             :          enddo
     891      176112 :          call obj%timer%stop("blas")
     892             : #if REALCASE == 1
     893             :        endif !useQR
     894             : #endif
     895             :        ! Transpose vmr -> vmc (stored in umc, second half)
     896      176112 :        if (useGPU) then
     897             :          call elpa_transpose_vectors_&
     898             :               &MATH_DATATYPE&
     899             :               &_&
     900             :               &PRECISION &
     901             :                            (obj, vmrCUDA, cur_l_rows, mpi_comm_rows, &
     902             :                             umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, &
     903           0 :                             mpi_comm_cols, 1, istep*nbw, n_cols, nblk)
     904             :        else ! useGPU
     905             :          call elpa_transpose_vectors_&
     906             :               &MATH_DATATYPE&
     907             :               &_&
     908             :               &PRECISION &
     909             :                                            (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
     910             :                                             umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
     911      176112 :                                             1, istep*nbw, n_cols, nblk)
     912             :        endif
     913             : 
     914             :        ! Calculate umc = A**T * vmr
     915             :        ! Note that the distributed A has to be transposed
      916             :        ! As opposed to direct tridiagonalization there is no need to use the cache locality
     917             :        ! of the tiles, so we can use strips of the matrix
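       ! (In outline, per tile strip: one gemm accumulates
       !    umc(lcs:lce,1:n_cols)        += A(1:lre,lcs:lce)^H * vmr(1:lre,1:n_cols)
       !  and a second one accumulates
       !    vmr(1:lre,n_cols+1:2*n_cols) += A(1:lre,lcs:lce) * umc(lcs:lce,n_cols+1:2*n_cols),
       !  as spelled out in the reference variant kept under "#if 0" below.)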
     918             : 
     919             : 
     920             : #if 0
      921             :        ! original complex implementation; check for performance
     922             :        umcCPU(1:l_cols,1:n_cols) = 0.0_rck
     923             :        vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
     924             : 
     925             :        if (l_cols>0 .and. l_rows>0) then
     926             :          do i=0,(istep*nbw-1)/tile_size
     927             : 
     928             :            lcs = i*l_cols_tile+1
     929             :            lce = min(l_cols,(i+1)*l_cols_tile)
     930             :            if (lce<lcs) cycle
     931             : 
     932             :            lre = min(l_rows,(i+1)*l_rows_tile)
     933             : 
     934             :              call obj%timer%start("blas")
     935             :              call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
     936             :                         vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
     937             :              call obj%timer%stop("blas")
     938             : 
     939             :            if (i==0) cycle
     940             :            lre = min(l_rows,i*l_rows_tile)
     941             :              call obj%timer%start("blas")
     942             :              call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
     943             :                         umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
     944             :              call obj%timer%stop("blas")
     945             :          enddo
     946             : 
     947             :        endif ! (l_cols>0 .and. l_rows>0)
     948             : #endif /* if 0 */
     949             : 
     950             :        !Code for Algorithm 4
     951             : 
     952             :        ! n_way selects the code path according to the number of OpenMP threads available
     953      176112 :        n_way = 1
     954             : #ifdef WITH_OPENMP
     955             : 
     956             : #if REALCASE == 1
     957       35640 :        n_way = omp_get_max_threads()
     958             : 
     959       71280 :        !$omp parallel private( i,lcs,lce,lrs,lre)
     960             : #endif
     961       88056 :        if (n_way > 1) then
     962             : #if REALCASE == 1
     963           0 :          !$omp do
     964             : #endif
     965           0 :          do i=1,min(l_cols_tile, l_cols)
     966           0 :            umcCPU(i,1:n_cols) = 0.0_rck
     967             :          enddo
     968             : 
     969             : #if REALCASE == 1
     970           0 :          !$omp do
     971             : #endif
     972           0 :          do i=1,l_rows
     973           0 :            vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck
     974             :          enddo
     975             : 
     976           0 :          if (l_cols>0 .and. l_rows>0) then
     977             : 
     978             :            !SYMM variant 4
     979             :            !Partitioned Matrix Expression:
     980             :            ! Ct = Atl Bt + Atr Bb
     981             :            ! Cb = Atr' Bt + Abl Bb
     982             :            !
     983             :            !Loop invariant:
     984             :            ! Ct = Atl Bt + Atr Bb
     985             :            !
     986             :            !Update:
     987             :            ! C1 = A10' B0 + A11 B1 + A21 B2
     988             :            !
     989             :            !This algorithm was chosen because the loop around the dgemm calls
     990             :            !is easily parallelized, and regardless of the choice of algorithm,
     991             :            !the startup cost for parallelizing the dgemms inside the loop is too great
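                     :            ! Here Atl/Atr/Abl are blocks of the symmetric matrix A, Bt/Bb the matching
                     :            ! parts of the multiplied vectors and Ct/Cb of the result; a prime (')
                     :            ! denotes the (conjugate-)transposed block.  Each iteration of the parallel
                     :            ! loop below performs the two GEMMs belonging to one tile, so different
                     :            ! tiles can be handled by different OpenMP threads independently.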
     992             : #if REALCASE == 1
     993           0 :            !$omp do schedule(static,1)
     994             : #endif
     995           0 :            do i=0,(istep*nbw-1)/tile_size
     996           0 :              lcs = i*l_cols_tile+1                   ! local column start
     997           0 :              lce = min(l_cols, (i+1)*l_cols_tile)    ! local column end
     998             : 
     999           0 :              lrs = i*l_rows_tile+1                   ! local row start
    1000           0 :              lre = min(l_rows, (i+1)*l_rows_tile)    ! local row end
    1001             : 
    1002             :              !C1 += [A11 A12] [B1
    1003             :              !                 B2]
    1004           0 :              if ( lre > lrs .and. l_cols > lcs ) then
    1005           0 :                call obj%timer%start("blas")
    1006             :                call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1,          &
    1007             :                                    ONE, a(lrs,lcs), ubound(a,dim=1),                 &
    1008             :                                    umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1),  &
    1009           0 :                                    ZERO, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
    1010           0 :                call obj%timer%stop("blas")
    1011             :              endif
    1012             : 
    1013             :              ! C1 += A10' B0
    1014           0 :              if ( lce > lcs .and. i > 0 ) then
    1015           0 :                call obj%timer%start("blas")
    1016             :                call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',     &
    1017             :                         lce-lcs+1, n_cols, lrs-1,              &
    1018             :                                     ONE, a(1,lcs),   ubound(a,dim=1),      &
    1019             :                                     vmrCPU(1,1),   ubound(vmrCPU,dim=1),   &
    1020           0 :                                     ZERO, umcCPU(lcs,1), ubound(umcCPU,dim=1))
    1021           0 :                call obj%timer%stop("blas")
    1022             :              endif
    1023             :            enddo
    1024             :          endif ! l_cols>0 .and. l_rows>0
    1025             : 
    1026             :       else ! n_way > 1
    1027             : #endif /* WITH_OPENMP */
    1028             : 
    1029      176112 :         if (useGPU) then
    1030           0 :           umcCUDA(1 : l_cols * n_cols) = 0.0_rck
    1031           0 :           vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = 0.0_rck
    1032             :         else ! useGPU
    1033     1876248 :           umcCPU(1:l_cols,1:n_cols) = 0.0_rck
    1034      176112 :           vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
    1035             :         endif ! useGPU
    1036             : 
    1037      176112 :         if (l_cols>0 .and. l_rows>0) then
    1038             : 
    1039      176112 :           if (useGPU) then
    1040             :             successCUDA = cuda_memcpy(vmr_dev,        &
    1041             :                                        loc(vmrCUDA(1)),&
    1042           0 :                                        vmr_size*size_of_datatype,cudaMemcpyHostToDevice)
    1043           0 :             if (.not.(successCUDA)) then
    1044             :               print *,"bandred_&
    1045             :                       &MATH_DATATYPE&
    1046           0 :                       &: error in cudaMemcpy vmr_dev 3"
    1047           0 :               stop 1
    1048             :             endif
    1049             : 
    1050             :             successCUDA = cuda_memcpy(umc_dev,    &
    1051             :                                       loc(umcCUDA(1)), &
    1052           0 :                                       umc_size*size_of_datatype,cudaMemcpyHostToDevice)
    1053           0 :             if (.not.(successCUDA)) then
    1054             :               print *,"bandred_&
    1055             :                       &MATH_DATATYPE&
    1056           0 :                       &: error in cudaMemcpy umc_dev 3"
    1057           0 :               stop 1
    1058             :             endif
    1059             :           endif ! useGPU
    1060             : 
    1061      462048 :           do i=0,(istep*nbw-1)/tile_size
    1062             : 
    1063      285936 :             lcs = i*l_cols_tile+1
    1064      303408 :             lce = min(l_cols,(i+1)*l_cols_tile)
    1065      285936 :             if (lce<lcs) cycle
    1066      303408 :             lre = min(l_rows,(i+1)*l_rows_tile)
    1067             : 
    1068      285936 :             if (useGPU) then
    1069           0 :               call obj%timer%start("cublas")
    1070             :               call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',                   &
    1071             :                                          lce-lcs+1, n_cols, lre,     &
    1072             :                                          ONE, (a_dev + ((lcs-1)*lda* &
    1073             :                                          size_of_datatype)),         &
    1074             :                                          lda, vmr_dev,cur_l_rows,    &
    1075             :                                          ONE, (umc_dev+ (lcs-1)*     &
    1076             :                                              size_of_datatype),      &
    1077           0 :                                          cur_l_cols)
    1078             : 
    1079           0 :               call obj%timer%stop("cublas")
    1080             : 
    1081           0 :               if(i==0) cycle
    1082           0 :               call obj%timer%start("cublas")
    1083             : 
    1084           0 :               lre = min(l_rows,i*l_rows_tile)
    1085             :               call cublas_PRECISION_GEMM('N', 'N', lre,n_cols, lce-lcs+1, ONE, &
    1086             :                                           (a_dev+ ((lcs-1)*lda*                 &
    1087             :                                                 size_of_datatype)),             &
    1088             :                                      lda, (umc_dev+(cur_l_cols * n_cols+lcs-1)* &
    1089             :                                             size_of_datatype),              &
    1090             :                                             cur_l_cols, ONE, (vmr_dev+(cur_l_rows * n_cols)* &
    1091             :                                           size_of_datatype),              &
    1092           0 :                                             cur_l_rows)
    1093           0 :               call obj%timer%stop("cublas")
    1094             :             else ! useGPU
    1095             : 
    1096      285936 :               call obj%timer%start("blas")
    1097             :               call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',          &
    1098             :                              lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
    1099      285936 :                                    vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
    1100      285936 :               call obj%timer%stop("blas")
    1101      285936 :               if (i==0) cycle
    1102      127296 :               lre = min(l_rows,i*l_rows_tile)
    1103      109824 :               call obj%timer%start("blas")
    1104             :               call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
    1105             :                                      umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE,      &
    1106      109824 :                                      vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
    1107      109824 :               call obj%timer%stop("blas")
    1108             :             endif ! useGPU
    1109             :           enddo ! i=0,(istep*nbw-1)/tile_size
    1110             : 
    1111      176112 :           if (useGPU) then
    1112             :             successCUDA = cuda_memcpy(     &
    1113             :                                   loc(vmrCUDA(1)),    &
    1114           0 :                                    vmr_dev,vmr_size*size_of_datatype,cudaMemcpyDeviceToHost)
    1115           0 :             if (.not.(successCUDA)) then
    1116             :               print *,"bandred_&
    1117             :                       &MATH_DATATYPE&
    1118           0 :                       &: error in cudaMemcpy vmr_dev 4"
    1119           0 :               stop 1
    1120             :             endif
    1121             : 
    1122             :             successCUDA = cuda_memcpy(    &
    1123             :                                 loc(umcCUDA(1)),    &
    1124           0 :                                 umc_dev, umc_size*size_of_datatype,cudaMemcpyDeviceToHost)
    1125           0 :             if (.not.(successCUDA)) then
    1126             :               print *,"bandred_&
    1127             :               &MATH_DATATYPE&
    1128           0 :               &: error in cudaMemcpy umc_dev 4"
    1129           0 :               stop 1
    1130             :             endif
    1131             :           endif ! useGPU
    1132             :         endif ! l_cols>0 .and. l_rows>0
    1133             : 
    1134             : #ifdef WITH_OPENMP
    1135             :       endif ! n_way > 1
    1136             : #if REALCASE == 1
    1137             :       !$omp end parallel
    1138             : #endif
    1139             : #endif
    1140             :        ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    1141             :        ! on the processors containing the diagonal
    1142             :        ! This is only necessary if ur has been calculated, i.e. if the
    1143             :        ! global tile size is smaller than the global remaining matrix
    1144             : 
    1145             :        ! or if we used Algorithm 4 (the OpenMP code path above)
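                     :        ! Roughly speaking, elpa_reduce_add_vectors sums the row-distributed ur
                     :        ! contributions over mpi_comm_rows and adds them to the column-distributed
                     :        ! uc parts held in umc; the mpi_allreduce below then completes the sum.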
    1146      176112 :        if (tile_size < istep*nbw .or. n_way > 1) then
    1147             : 
    1148       41472 :          if (useGPU) then
    1149             : 
    1150             :            call elpa_reduce_add_vectors_&
    1151             :                 &MATH_DATATYPE&
    1152             :                 &_&
    1153             :                 &PRECISION &
    1154             :                                 (obj, vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,  &
    1155             :                                  mpi_comm_rows, umcCUDA,                            &
    1156           0 :                                  cur_l_cols, mpi_comm_cols, istep*nbw, n_cols, nblk)
    1157             :          else ! useGPU
    1158             : 
    1159             :            call elpa_reduce_add_vectors_&
    1160             :            &MATH_DATATYPE&
    1161             :            &_&
    1162             :            &PRECISION &
    1163             :                                             (obj, vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
    1164             :                                              umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
    1165       41472 :                                              istep*nbw, n_cols, nblk)
    1166             :          endif ! useGPU
    1167             :        endif ! tile_size < istep*nbw .or. n_way > 1
    1168             : 
    1169      176112 :        if (l_cols>0) then
    1170             : 
    1171      176112 :          if (useGPU) then
    1172             : #ifdef WITH_MPI
    1173           0 :            allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage)
    1174           0 :            if (istat .ne. 0) then
    1175             :              print *,"bandred_&
    1176             :                       &MATH_DATATYPE&
    1177           0 :                       &: error when allocating tmpCUDA "//errorMessage
    1178           0 :              stop 1
    1179             :            endif
    1180             : 
    1181           0 :            if (wantDebug) call obj%timer%start("mpi_communication")
    1182             : 
    1183             :            call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_MATH_DATATYPE_PRECISION, &
    1184           0 :                               MPI_SUM, mpi_comm_rows, mpierr)
    1185             : 
    1186           0 :            umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
    1187           0 :            if (wantDebug) call obj%timer%stop("mpi_communication")
    1188             : #else /* WITH_MPI */
    1189             : 
    1190             :            ! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
    1191             : 
    1192             : #endif /* WITH_MPI */
    1193             : 
    1194           0 :            if (allocated(tmpCUDA)) then
    1195           0 :              deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
    1196           0 :              if (istat .ne. 0) then
    1197             :                print *,"bandred_&
    1198             :                        &MATH_DATATYPE&
    1199           0 :                        &: error when deallocating tmpCUDA "//errorMessage
    1200           0 :                stop 1
    1201             :              endif
    1202             :            endif
    1203             : 
    1204             :          else ! useGPU
    1205             : 
    1206      176112 :            allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage)
    1207      176112 :            if (istat .ne. 0) then
    1208             :              print *,"bandred_&
    1209             :                      &MATH_DATATYPE&
    1210           0 :                      &: error when allocating tmpCPU "//errorMessage
    1211           0 :              stop 1
    1212             :            endif
    1213             : 
    1214             : #ifdef WITH_MPI
    1215      117408 :            if (wantDebug) call obj%timer%start("mpi_communication")
    1216             :            call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_MATH_DATATYPE_PRECISION,    &
    1217      117408 :             MPI_SUM, mpi_comm_rows, mpierr)
    1218      117408 :            umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
    1219      117408 :            if (wantDebug) call obj%timer%stop("mpi_communication")
    1220             : #else /* WITH_MPI */
    1221             : !           tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
    1222             : #endif /* WITH_MPI */
    1223             : 
    1224      176112 :            deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
    1225      176112 :            if (istat .ne. 0) then
    1226             :              print *,"bandred_&
    1227             :                      &MATH_DATATYPE&
    1228           0 :                      &: error when deallocating tmpCPU "//errorMessage
    1229           0 :              stop 1
    1230             :            endif
    1231             :          endif ! useGPU
    1232             :        endif ! l_cols > 0
    1233             : 
    1234             :        ! U = U * Tmat**T
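                     :        ! umc currently holds the column-distributed product A**T * V; the TRMM below
                     :        ! multiplies it in place by Tmat**T (the triangular factor of the compact WY
                     :        ! representation), giving the U used in the two-sided update further down.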
    1235             : 
    1236      176112 :        if (useGPU) then
    1237             :          successCUDA = cuda_memcpy(umc_dev,    &
    1238             :                                    loc(umcCUDA(1)),   &
    1239           0 :                                    umc_size*size_of_datatype, cudaMemcpyHostToDevice)
    1240           0 :          if (.not.(successCUDA)) then
    1241             :            print *,"bandred_&
    1242             :                    &MATH_DATATYPE&
    1243           0 :                    &: error in cudaMemcpy umc_dev 5"
    1244           0 :            stop 1
    1245             :          endif
    1246           0 :          successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
    1247           0 :          if (.not.(successCUDA)) then
    1248             :            print *,"bandred_&
    1249             :                    &MATH_DATATYPE&
    1250           0 :                    &: error in cudaMemcpy tmat_dev 2"
    1251           0 :            stop 1
    1252             :          endif
    1253             : 
    1254           0 :          call obj%timer%start("cublas")
    1255             :          call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit',  &
    1256           0 :                                l_cols, n_cols, ONE, tmat_dev, nbw, umc_dev, cur_l_cols)
    1257           0 :          call obj%timer%stop("cublas")
    1258             : 
    1259             :          ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
    1260           0 :          successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
    1261           0 :          if (.not.(successCUDA)) then
    1262             :            print *,"bandred_&
    1263             :                    &MATH_DATATYPE&
    1264           0 :                    &: error in cudaMemcpy vav_dev 2"
    1265           0 :            stop 1
    1266             :          endif
    1267           0 :          call obj%timer%start("cublas")
    1268             : 
    1269             :          call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',             &
    1270             :                                     n_cols, n_cols, l_cols, ONE, umc_dev, cur_l_cols, &
    1271             :                                     (umc_dev+(cur_l_cols * n_cols )*size_of_datatype),cur_l_cols, &
    1272           0 :                                     ZERO, vav_dev, nbw)
    1273             : 
    1274             :          call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit',    &
    1275           0 :                             n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
    1276           0 :          call obj%timer%stop("cublas")
    1277             : 
    1278           0 :          successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
    1279           0 :          if (.not.(successCUDA)) then
    1280             :            print *,"bandred_&
    1281             :                    &MATH_DATATYPE&
    1282           0 :                   &: error in cudaMemcpy vav_dev 3"
    1283           0 :            stop 1
    1284             :          endif
    1285             :        else ! useGPU
    1286             : 
    1287      176112 :          call obj%timer%start("blas")
    1288             : 
    1289             :          call PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit',     &
    1290             :                         l_cols,n_cols, ONE, tmat(1,1,istep), ubound(tmat,dim=1), &
    1291      176112 :                               umcCPU, ubound(umcCPU,dim=1))
    1292             : 
    1293             :          ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
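                     :         ! The GEMM multiplies the two halves of umc, i.e. (U*Tmat**T)**T with the
                     :         ! column-distributed V, and the following TRMM applies the remaining
                     :         ! Tmat**T factor from the right.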
    1294             : 
    1295             :          call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',              &
    1296             :                        n_cols, n_cols, l_cols, ONE, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), &
    1297      176112 :                              ubound(umcCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
    1298             : 
    1299             :          call PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit',    &
    1300             :                        n_cols, n_cols, ONE, tmat(1,1,istep),    &
    1301      176112 :                              ubound(tmat,dim=1), vav, ubound(vav,dim=1))
    1302      176112 :          call obj%timer%stop("blas")
    1303             : 
    1304             :        endif ! useGPU
    1305             : 
    1306             : #if REALCASE == 1
    1307             :        call symm_matrix_allreduce_&
    1308             : #endif
    1309             : #if COMPLEXCASE == 1
    1310             :        call herm_matrix_allreduce_&
    1311             : #endif
    1312             :             &PRECISION &
    1313      176112 :                               (obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
    1314             : 
    1315      176112 :        if (useGPU) then
    1316           0 :          successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
    1317           0 :          if (.not.(successCUDA)) then
    1318             :            print *,"bandred_&
    1319             :            &MATH_DATATYPE&
    1320           0 :            &: error in cudaMemcpy vav_dev 4"
    1321           0 :            stop 1
    1322             :          endif
    1323             :        endif
    1324             : 
    1325             :        ! U = U - 0.5 * V * VAV
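                     :        ! Sketch of why the factor 0.5 is correct (real case; the complex case is
                     :        ! analogous with conjugate transposes): with U = A*V*Tmat**T - 0.5*V*VAV and
                     :        ! VAV = Tmat * V**T*A*V * Tmat**T one gets
                     :        !
                     :        !   A - V*U**T - U*V**T
                     :        !     = A - V*Tmat*V**T*A - A*V*Tmat**T*V**T + V*Tmat*(V**T*A*V)*Tmat**T*V**T
                     :        !     = (I - V*Tmat*V**T) * A * (I - V*Tmat**T*V**T),
                     :        !
                     :        ! i.e. the two-sided application of the block Householder reflector
                     :        ! (up to the transposition convention used for Tmat).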
    1326             : 
    1327      176112 :        if (useGPU) then
    1328           0 :          call obj%timer%start("cublas")
    1329             : 
    1330             :          call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
    1331             : #if REALCASE == 1
    1332             :                                     -0.5_rk,                      &
    1333             : #endif
    1334             : #if COMPLEXCASE == 1
    1335             :                                     (-0.5_rk, 0.0_rk), &
    1336             : #endif
    1337             :                                     (umc_dev+(cur_l_cols * n_cols )* &
    1338             :                                     size_of_datatype),   &
    1339             :                                     cur_l_cols, vav_dev,nbw,        &
    1340           0 :                                     ONE, umc_dev, cur_l_cols)
    1341           0 :          call obj%timer%stop("cublas")
    1342             : 
    1343             :          successCUDA = cuda_memcpy(      &
    1344             :                                    loc(umcCUDA(1)),    &
    1345           0 :                                    umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
    1346             : 
    1347           0 :          if (.not.(successCUDA)) then
    1348             :            print *,"bandred_&
    1349             :                    &MATH_DATATYPE&
    1350           0 :                    &: error in cudaMemcpy umc_dev 6"
    1351           0 :            stop 1
    1352             :          endif
    1353             : 
    1354             :          ! Transpose umc -> umr (stored in vmr, second half)
    1355             :          call elpa_transpose_vectors_&
    1356             :               &MATH_DATATYPE&
    1357             :               &_&
    1358             :               &PRECISION &
    1359             :                           (obj, umcCUDA, cur_l_cols, mpi_comm_cols, &
    1360             :                            vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
    1361           0 :                            1, istep*nbw, n_cols, nblk)
    1362             : 
    1363             :          successCUDA = cuda_memcpy(vmr_dev,       &
    1364             :                                    loc(vmrCUDA(1)),    &
    1365           0 :                                    vmr_size*size_of_datatype, cudaMemcpyHostToDevice)
    1366           0 :          if (.not.(successCUDA)) then
    1367             :            print *,"bandred_&
    1368             :                    &MATH_DATATYPE&
    1369           0 :                    &: error in cudaMemcpy vmr_dev 5 "
    1370           0 :            stop 1
    1371             :          endif
    1372             : 
    1373             :          successCUDA = cuda_memcpy(umc_dev,     &
    1374             :                                    loc(umcCUDA(1)),    &
    1375           0 :                                    umc_size*size_of_datatype, cudaMemcpyHostToDevice)
    1376           0 :          if (.not.(successCUDA)) then
    1377             :            print *,"bandred_&
    1378             :                    &MATH_DATATYPE&
    1379           0 :                    &: error in cudaMemcpy umc_dev 7"
    1380           0 :            stop 1
    1381             :          endif
    1382             :        else ! useGPU
    1383      176112 :          call obj%timer%start("blas")
    1384             :          call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,     &
    1385             : #if REALCASE == 1
    1386             :                        -0.5_rk,                           &
    1387             : #endif
    1388             : #if COMPLEXCASE == 1
    1389             :                               (-0.5_rk, 0.0_rk),     &
    1390             : #endif
    1391             :             umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, &
    1392      176112 :                               ubound(vav,dim=1), ONE, umcCPU, ubound(umcCPU,dim=1))
    1393             : 
    1394      176112 :          call obj%timer%stop("blas")
    1395             :          ! Transpose umc -> umr (stored in vmr, second half)
    1396             :          call elpa_transpose_vectors_&
    1397             :          &MATH_DATATYPE&
    1398             :          &_&
    1399             :          &PRECISION &
    1400             :                                   (obj, umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
    1401             :                                          vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
    1402      176112 :                                          1, istep*nbw, n_cols, nblk)
    1403             : 
    1404             :        endif  ! useGPU
    1405             : 
    1406             : 
    1407             :        ! A = A - V*U**T - U*V**T
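                     :        ! Both rank-k terms are applied in a single GEMM with 2*n_cols inner columns:
                     :        ! vmr = [ V | ur ] (row-distributed) and umc = [ uc | vc ] (column-distributed),
                     :        ! so vmr * umc**T contributes V*U**T + U*V**T on each local tile.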
    1408             : 
    1409             : #ifdef WITH_OPENMP
    1410      176112 :        !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend  )
    1411       88056 :        n_threads = omp_get_num_threads()
    1412       88056 :        if (mod(n_threads, 2) == 0) then
    1413           0 :          n_way = 2
    1414             :        else
    1415       88056 :          n_way = 1
    1416             :        endif
    1417             : 
    1418       88056 :        m_way = n_threads / n_way
    1419             : 
    1420       88056 :        m_id = mod(omp_get_thread_num(),  m_way)
    1421       88056 :        n_id = omp_get_thread_num() / m_way
    1422             : 
    1423      319080 :        do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
    1424      142968 :          i = ii / tile_size
    1425      142968 :          lcs = i*l_cols_tile+1
    1426      197880 :          lce = min(l_cols,(i+1)*l_cols_tile)
    1427      197880 :          lre = min(l_rows,(i+1)*l_rows_tile)
    1428      285936 :          if (lce<lcs .or. lre<1) cycle
    1429             : 
    1430             :          !Figure out this thread's range
    1431      142968 :          work_per_thread = lre / m_way
    1432      142968 :          if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
    1433      142968 :          mystart = m_id * work_per_thread + 1
    1434      142968 :          myend   = mystart + work_per_thread - 1
    1435      142968 :          if ( myend > lre ) myend = lre
    1436      142968 :          if ( myend-mystart+1 < 1) cycle
    1437      142968 :          call obj%timer%start("blas")
    1438             :          call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, lce-lcs+1, 2*n_cols, -ONE, &
    1439             :                     vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), &
    1440      142968 :                      ONE, a(mystart,lcs), ubound(a,1))
    1441      142968 :           call obj%timer%stop("blas")
    1442             :        enddo
    1443             :        !$omp end parallel
    1444             : !#if COMPLEXCASE == 1
    1445             : !       do i=0,(istep*nbw-1)/tile_size
    1446             : !         lcs = i*l_cols_tile+1
    1447             : !         lce = min(l_cols,(i+1)*l_cols_tile)
    1448             : !         lre = min(l_rows,(i+1)*l_rows_tile)
    1449             : !         if (lce<lcs .or. lre<1) cycle
    1450             : !         call obj%timer%start("blas")
    1451             : !         call PRECISION_GEMM('N', 'C', lre,lce-lcs+1, 2*n_cols, -ONE, &
    1452             : !                       vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
    1453             : !                       ONE, a(1,lcs), lda)
    1454             : !         call obj%timer%stop("blas")
    1455             : !       enddo
    1456             : !#endif
    1457             : 
    1458             : #else /* WITH_OPENMP */
    1459             : 
    1460      231024 :        do i=0,(istep*nbw-1)/tile_size
    1461      142968 :          lcs = i*l_cols_tile+1
    1462      142968 :          lce = min(l_cols,(i+1)*l_cols_tile)
    1463      142968 :          lre = min(l_rows,(i+1)*l_rows_tile)
    1464      142968 :          if (lce<lcs .or. lre<1) cycle
    1465             : 
    1466      142968 :          if (useGPU) then
    1467           0 :            call obj%timer%start("cublas")
    1468             : 
    1469             :            call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,     &
    1470             :                                       lre, lce-lcs+1, 2*n_cols, -ONE, &
    1471             :                                       vmr_dev, cur_l_rows, (umc_dev +(lcs-1)*  &
    1472             :                                       size_of_datatype), &
    1473             :                                       cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
    1474           0 :                                       size_of_datatype), lda)
    1475           0 :            call obj%timer%stop("cublas")
    1476             : 
    1477             :          else ! useGPU
    1478             : 
    1479      142968 :            call obj%timer%start("blas")
    1480             :            call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre,lce-lcs+1, 2*n_cols, -ONE, &
    1481             :                                vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
    1482      142968 :                                ONE, a(1,lcs), lda)
    1483      142968 :            call obj%timer%stop("blas")
    1484             :          endif ! useGPU
    1485             :        enddo ! i=0,(istep*nbw-1)/tile_size
    1486             : #endif /* WITH_OPENMP */
    1487             : 
    1488      176112 :        if (.not.(useGPU)) then
    1489      176112 :          if (allocated(vr)) then
    1490      176112 :            deallocate(vr, stat=istat, errmsg=errorMessage)
    1491      176112 :            if (istat .ne. 0) then
    1492             :              print *,"bandred_&
    1493             :                      &MATH_DATATYPE&
    1494           0 :                      &: error when deallocating vr "//errorMessage
    1495           0 :              stop 1
    1496             :            endif
    1497             :          endif
    1498             : 
    1499      176112 :          if (allocated(umcCPU)) then
    1500      176112 :            deallocate(umcCPU, stat=istat, errmsg=errorMessage)
    1501      176112 :            if (istat .ne. 0) then
    1502             :              print *,"bandred_&
    1503             :                      &MATH_DATATYPE&
    1504           0 :                      &: error when deallocating umcCPU "//errorMessage
    1505           0 :              stop 1
    1506             :            endif
    1507             :          endif
    1508             : 
    1509      176112 :          if (allocated(vmrCPU)) then
    1510      176112 :            deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
    1511      176112 :            if (istat .ne. 0) then
    1512             :              print *,"bandred_&
    1513             :                      &MATH_DATATYPE&
    1514           0 :                      &: error when deallocating vmrCPU "//errorMessage
    1515           0 :              stop 1
    1516             :            endif
    1517             :          endif
    1518             :        endif !useGPU
    1519             : 
    1520             :      enddo ! istep
    1521             : 
    1522       46584 :      if (useGPU) then
    1523           0 :        successCUDA = cuda_free(vav_dev)
    1524           0 :        if (.not.(successCUDA)) then
    1525             :          print *,"bandred_&
    1526             :                  &MATH_DATATYPE&
    1527           0 :                  &: error in cudaFree vav_dev 4"
    1528           0 :          stop 1
    1529             :        endif
    1530             :      endif ! useGPU
    1531             : 
    1532       46584 :      if (allocated(vr)) then
    1533           0 :        deallocate(vr, stat=istat, errmsg=errorMessage)
    1534           0 :        if (istat .ne. 0) then
    1535             :          print *,"bandred_&
    1536             :                  &MATH_DATATYPE&
    1537           0 :                  &: error when deallocating vr "//errorMessage
    1538           0 :          stop 1
    1539             :        endif
    1540             :      endif
    1541             : 
    1542       46584 :      if (allocated(umcCPU)) then
    1543           0 :        deallocate(umcCPU, stat=istat, errmsg=errorMessage)
    1544           0 :        if (istat .ne. 0) then
    1545             :          print *,"bandred_&
    1546             :                  &MATH_DATATYPE&
    1547           0 :                  &: error when deallocating umcCPU "//errorMessage
    1548           0 :          stop 1
    1549             :        endif
    1550             :      endif
    1551             : 
    1552       46584 :      if (allocated(vmrCPU)) then
    1553           0 :        deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
    1554           0 :        if (istat .ne. 0) then
    1555             :          print *,"bandred_&
    1556             :                  &MATH_DATATYPE&
    1557           0 :                  &: error when deallocating vmrCPU "//errorMessage
    1558           0 :          stop 1
    1559             :        endif
    1560             :      endif
    1561             : 
    1562             : !#if COMPLEXCASE == 1
    1563             : !       ! check this
    1564             : !       if (useGPU) then
    1565             : !         successCUDA = cuda_free(umc_dev)
    1566             : !         if (.not.(successCUDA)) then
    1567             : !           print *,"bandred_complex: error in cudaFree umc_dev 7a"
    1568             : !           stop
    1569             : !         endif
    1570             : !       endif
    1571             : !#endif
    1572             : 
    1573       46584 :      if (useGPU) then
    1574           0 :        successCUDA = cuda_free(vmr_dev)
    1575           0 :        if (.not.(successCUDA)) then
    1576             :          print *,"bandred_&
    1577             :                  &MATH_DATATYPE&
    1578           0 :                  &: error in cudaFree vmr_dev 6"
    1579           0 :          stop 1
    1580             :        endif
    1581             : 
    1582           0 :        successCUDA = cuda_free(umc_dev)
    1583           0 :        if (.not.(successCUDA)) then
    1584             :          print *,"bandred_&
    1585             :                  &MATH_DATATYPE&
    1586           0 :                  &: error in cudaFree umc_dev 8"
    1587           0 :          stop 1
    1588             :        endif
    1589             : 
    1590           0 :        if (allocated(umcCUDA)) then
    1591           0 :          deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
    1592           0 :          if (istat .ne. 0) then
    1593             :            print *,"bandred_&
    1594             :                    &MATH_DATATYPE&
    1595           0 :                    &: error when deallocating umcCUDA "//errorMessage
    1596           0 :            stop 1
    1597             :          endif
    1598             :        endif
    1599           0 :        if (allocated(vmrCUDA)) then
    1600           0 :          deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
    1601           0 :          if (istat .ne. 0) then
    1602             :            print *,"bandred_&
    1603             :                    &MATH_DATATYPE&
    1604           0 :                    &: error when deallocating vmrCUDA "//errorMessage
    1605           0 :            stop 1
    1606             :          endif
    1607             :        endif
    1608             :      endif ! useGPU
    1609             : 
    1610             : #if REALCASE == 1
    1611       28152 :      if (useQR) then
    1612           0 :        if (which_qr_decomposition == 1) then
    1613           0 :          deallocate(work_blocked, stat=istat, errmsg=errorMessage)
    1614           0 :          if (istat .ne. 0) then
    1615           0 :            print *,"bandred_real: error when deallocating work_blocked "//errorMessage
    1616           0 :            stop 1
    1617             :          endif
    1618             : 
    1619           0 :          deallocate(tauvector, stat=istat, errmsg=errorMessage)
    1620           0 :          if (istat .ne. 0) then
    1621           0 :            print *,"bandred_real: error when deallocating tauvector "//errorMessage
    1622           0 :            stop 1
    1623             :          endif
    1624             :        endif
    1625             :      endif
    1626             : #endif
    1627             : 
    1628             :      call obj%timer%stop("bandred_&
    1629             :      &MATH_DATATYPE&
    1630             :      &" // &
    1631             :      &PRECISION_SUFFIX &
    1632       46584 :      )
    1633             : 
    1634             :    end subroutine bandred_&
    1635             :    &MATH_DATATYPE&
    1636             :    &_&
    1637      139752 :    &PRECISION
    1638             : #if REALCASE == 1
    1639             :    ! appears slower for GPU on 10000 x 10000 - needs investigation
    1640             : #endif
    1641             : 

Generated by: LCOV version 1.12