LCOV - code coverage report
Current view: top level - src/elpa2/qr - elpa_pdgeqrf_template.F90 (source / functions) Hit Total Coverage
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info Lines: 0 839 0.0 %
Date: 2018-01-10 09:29:53 Functions: 0 41 0.0 %

          Line data    Source code
       1             : !    This file is part of ELPA.
       2             : !
       3             : !    The ELPA library was originally created by the ELPA consortium,
       4             : !    consisting of the following organizations:
       5             : !
       6             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : !      Informatik,
      10             : !    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : !      Schwerpunkt Wissenschaftliches Rechnen ,
      12             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : !    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
      14             : !      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
      15             : !      and
      16             : !    - IBM Deutschland GmbH
      17             : !
      18             : !
      19             : !    More information can be found here:
      20             : !    http://elpa.mpcdf.mpg.de/
      21             : !
      22             : !    ELPA is free software: you can redistribute it and/or modify
      23             : !    it under the terms of the version 3 of the license of the
      24             : !    GNU Lesser General Public License as published by the Free
      25             : !    Software Foundation.
      26             : !
      27             : !    ELPA is distributed in the hope that it will be useful,
      28             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      29             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      30             : !    GNU Lesser General Public License for more details.
      31             : !
      32             : !    You should have received a copy of the GNU Lesser General Public License
      33             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      34             : !
      35             : !    ELPA reflects a substantial effort on the part of the original
      36             : !    ELPA consortium, and we ask you to respect the spirit of the
      37             : !    license that we chose: i.e., please contribute any changes you
      38             : !    may have back to the original ELPA library distribution, and keep
      39             : !    any derivatives of ELPA under the same license that we chose for
      40             : !    the original distribution, the GNU Lesser General Public License.
      41             : !
      42             :      subroutine qr_pdgeqrf_2dcomm_&
      43           0 :            &PRECISION &
      44           0 :            (obj, a, lda, matrixCols, v, ldv, vmrCols, tau, lengthTau, t, ldt, colsT, &
      45           0 :                                   work, workLength, lwork, m, n, mb, nb, rowidx, colidx, &
      46             :                                   rev, trans, PQRPARAM, mpicomm_rows, mpicomm_cols, blockheuristic)
      47             :       use precision
      48             :       use elpa1_impl
      49             :       use qr_utils_mod
      50             :       use elpa_abstract_impl
      51             :       implicit none
      52             : 
      53             :       class(elpa_abstract_impl_t), intent(inout) :: obj
      54             :       ! parameter setup
      55             :       INTEGER(kind=ik), parameter   :: gmode_ = 1, rank_ = 2, eps_ = 3
      56             : 
      57             :       ! input variables (local)
      58             :       integer(kind=ik), intent(in)  :: lda, lwork, ldv, ldt, matrixCols, m, vmrCols, lengthTau, &
      59             :                                        colsT, workLength
      60             : 
      61             :       ! input variables (global)
      62             :       integer(kind=ik)              :: n, mb, nb, rowidx, colidx, rev, trans, mpicomm_cols, mpicomm_rows
      63             : #ifdef USE_ASSUMED_SIZE_QR
      64             :       integer(kind=ik)              :: PQRPARAM(*)
      65             :       real(kind=C_DATATYPE_KIND)                 :: a(lda,*), v(ldv,*), tau(*), t(ldt,*), work(*)
      66             : #else
      67             :       integer(kind=ik)              :: PQRPARAM(1:11)
      68             :       real(kind=C_DATATYPE_KIND)                 :: a(1:lda,1:matrixCols), v(1:ldv,1:vmrCols), tau(1:lengthTau), &
      69             :                                        t(1:ldt,1:colsT), work(1:workLength)
      70             : #endif
      71             :       ! output variables (global)
      72             :       real(kind=C_DATATYPE_KIND)                 :: blockheuristic(*)
      73             : 
      74             :       ! input variables derived from PQRPARAM
      75             :       integer(kind=ik)              :: updatemode,tmerge,size2d
      76             : 
      77             :       ! local scalars
      78             :       integer(kind=ik)              :: mpierr,mpirank_cols,broadcast_size,mpirank_rows
      79             :       integer(kind=ik)              :: mpirank_cols_qr,mpiprocs_cols
      80             :       integer(kind=ik)              :: lcols_temp,lcols,icol,lastcol
      81             :       integer(kind=ik)              :: baseoffset,offset,idx,voffset
      82             :       integer(kind=ik)              :: update_voffset,update_tauoffset
      83             :       integer(kind=ik)              :: update_lcols
      84             :       integer(kind=ik)              :: work_offset
      85             : 
      86             :       real(kind=C_DATATYPE_KIND)                 :: dbroadcast_size(1),dtmat_bcast_size(1)
      87             :       real(kind=C_DATATYPE_KIND)                 :: pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1)
      88             :       integer(kind=ik)              :: temptau_offset,temptau_size,broadcast_offset,tmat_bcast_size
      89             :       integer(kind=ik)              :: remaining_cols
      90             :       integer(kind=ik)              :: total_cols
      91             :       integer(kind=ik)              :: incremental_update_size ! needed for incremental update mode
      92             : 
      93             :       call obj%timer%start("qr_pdgeqrf_2dcomm_&
      94             :           &PRECISION&
      95           0 :           &")
      96           0 :       size2d     = PQRPARAM(1)
      97           0 :       updatemode = PQRPARAM(2)
      98           0 :       tmerge     = PQRPARAM(3)
      99             : 
     100             :       ! copy value before we are going to filter it
     101           0 :       total_cols = n
     102           0 :       call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr)
     103           0 :       call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr)
     104           0 :       call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr)
     105             : 
     106             : #ifdef USE_ASSUMED_SIZE_QR
     107             :       call qr_pdgeqrf_1dcomm_&
     108             :           &PRECISION &
     109             :           (obj,a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, &
     110             :                              PQRPARAM(4),mpicomm_rows,blockheuristic)
     111             : #else
     112             :       call qr_pdgeqrf_1dcomm_&
     113             :           &PRECISION &
     114             :           (obj,a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, &
     115           0 :                              PQRPARAM(4:11),mpicomm_rows,blockheuristic)
     116             : #endif
     117             :       call qr_pdgeqrf_pack_unpack_&
     118             :           &PRECISION &
     119           0 :           (obj,v,ldv,dbroadcast_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,0,mpicomm_rows)
     120             :       call qr_pdgeqrf_pack_unpack_tmatrix_&
     121             :           &PRECISION &
     122           0 :           (obj,tau,t,ldt,dtmat_bcast_size(1),-1,total_cols,0)
     123             : 
     124             : #ifdef DOUBLE_PRECISION_REAL
     125           0 :       pdlarft_size(1) = 0.0_rk8
     126             : #else
     127           0 :       pdlarft_size(1) = 0.0_rk4
     128             : #endif
     129             : 
     130             :       call qr_pdlarfb_1dcomm_&
     131             :           &PRECISION &
     132             :           (m,mb,total_cols,total_cols,a,lda,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows, &
     133           0 :                              pdlarfb_size(1),-1)
     134             :       call qr_tmerge_pdlarfb_1dcomm_&
     135             :           &PRECISION &
     136             :           (m,mb,total_cols,total_cols,total_cols,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode, &
     137           0 :                                     mpicomm_rows,tmerge_pdlarfb_size(1),-1)
     138             : 
     139             : 
     140           0 :       temptau_offset = 1
     141           0 :       temptau_size = total_cols
     142           0 :       broadcast_offset = temptau_offset + temptau_size
     143           0 :       broadcast_size = int(dbroadcast_size(1) + dtmat_bcast_size(1))
     144           0 :       work_offset = broadcast_offset + broadcast_size
     145             : 
     146           0 :       if (lwork .eq. -1) then
     147             : #ifdef DOUBLE_PRECISION_REAL
     148             :         work(1) = (real(temptau_size,kind=C_DATATYPE_KIND) + real(broadcast_size,kind=C_DATATYPE_KIND) + max(pdgeqrf_size(1), &
     149             :             pdlarft_size(1),pdlarfb_size(1), &
     150           0 :                    tmerge_pdlarfb_size(1)))
     151             : #else
     152             :         work(1) = (real(temptau_size,kind=rk4) + real(broadcast_size,kind=rk4) + max(pdgeqrf_size(1), &
     153             :             pdlarft_size(1),pdlarfb_size(1), &
     154           0 :                    tmerge_pdlarfb_size(1)))
     155             : #endif
     156             :         call obj%timer%stop("qr_pdgeqrf_2dcomm_&
     157             :           &PRECISION&
     158           0 :           &")
     159           0 :         return
     160             :       end if
     161             : 
     162           0 :       lastcol = colidx-total_cols+1
     163           0 :       voffset = total_cols
     164             : 
     165           0 :       incremental_update_size = 0
     166             : 
     167             :       ! clear v buffer: just ensure that there is no junk in the upper triangle
     168             :       ! part, otherwise pdlarfb gets some problems
     169             :       ! pdlarfl(2) do not have these problems as they are working more on a Vector
     170             :       ! basis
     171             : #ifdef DOUBLE_PRECISION_REAL
     172           0 :       v(1:ldv,1:total_cols) = 0.0_rk8
     173             : #else
     174           0 :       v(1:ldv,1:total_cols) = 0.0_rk4
     175             : #endif
     176           0 :       icol = colidx
     177             : 
     178           0 :       remaining_cols = total_cols
     179             : 
     180             :       !print *,'start decomposition',m,rowidx,colidx
     181             : 
     182           0 :       do while (remaining_cols .gt. 0)
     183             : 
     184             :         ! determine rank of process column with next qr block
     185           0 :         mpirank_cols_qr = MOD((icol-1)/nb,mpiprocs_cols)
     186             : 
     187             :         ! lcols can't be larger than than nb
     188             :         ! exception: there is only one process column
     189             : 
     190             :         ! however, we might not start at the first local column.
     191             :         ! therefore assume a matrix of size (1xlcols) starting at (1,icol)
     192             :         ! determine the real amount of local columns
     193           0 :         lcols_temp = min(nb,(icol-lastcol+1))
     194             : 
     195             :         ! blocking parameter
     196           0 :         lcols_temp = max(min(lcols_temp,size2d),1)
     197             : 
     198             :         ! determine size from last decomposition column
     199             :         !  to first decomposition column
     200             :         call local_size_offset_1d(icol,nb,icol-lcols_temp+1,icol-lcols_temp+1,0, &
     201             :                                       mpirank_cols_qr,mpiprocs_cols, &
     202           0 :                                       lcols,baseoffset,offset)
     203             : 
     204           0 :         voffset = remaining_cols - lcols + 1
     205             : 
     206           0 :         idx = rowidx - colidx + icol
     207             : 
     208           0 :         if (mpirank_cols .eq. mpirank_cols_qr) then
     209             :           ! qr decomposition part
     210             : #ifdef DOUBLE_PRECISION_REAL
     211           0 :           tau(offset:offset+lcols-1) = 0.0_rk8
     212             : #else
     213           0 :           tau(offset:offset+lcols-1) = 0.0_rk4
     214             : #endif
     215             : 
     216             : #ifdef USE_ASSUMED_SIZE_QR
     217             :           call qr_pdgeqrf_1dcomm_&
     218             :               &PRECISION &
     219             :               (obj,a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, &
     220             :                                  work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4), &
     221             :                                  mpicomm_rows,blockheuristic)
     222             : 
     223             : #else
     224             :           call qr_pdgeqrf_1dcomm_&
     225             :               &PRECISION &
     226             :               (obj,a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, &
     227             :                                  work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4:11), &
     228           0 :                                  mpicomm_rows,blockheuristic)
     229             : #endif
     230             : 
     231             :           ! pack broadcast buffer (v + tau)
     232             :           call qr_pdgeqrf_pack_unpack_&
     233             :               &PRECISION &
     234             :               (obj,v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,&
     235           0 :                                       idx,rev,0,mpicomm_rows)
     236             : 
     237             :           ! determine broadcast size
     238             :           call qr_pdgeqrf_pack_unpack_&
     239             :               &PRECISION &
     240             :               (obj,v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,&
     241           0 :                                       0,mpicomm_rows)
     242           0 :           broadcast_size = int(dbroadcast_size(1))
     243             : 
     244             :           !if (mpirank_rows .eq. 0) then
     245             :           ! pack tmatrix into broadcast buffer and calculate new size
     246             :           call qr_pdgeqrf_pack_unpack_tmatrix_&
     247             :               &PRECISION &
     248             :               (obj,tau(offset),t(voffset,voffset),ldt, &
     249           0 :                                               work(broadcast_offset+broadcast_size),lwork,lcols,0)
     250             :           call qr_pdgeqrf_pack_unpack_tmatrix_&
     251             :               &PRECISION &
     252           0 :               (obj,tau(offset),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0)
     253           0 :           broadcast_size = broadcast_size + int(dtmat_bcast_size(1))
     254             :           !end if
     255             : 
     256             :           ! initiate broadcast (send part)
     257             : #ifdef WITH_MPI
     258             : 
     259             : #ifdef DOUBLE_PRECISION_REAL
     260             :           call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
     261           0 :                          mpirank_cols_qr,mpicomm_cols,mpierr)
     262             : #else
     263             :           call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real4, &
     264           0 :                          mpirank_cols_qr,mpicomm_cols,mpierr)
     265             : #endif
     266             : 
     267             : #endif
     268             :           ! copy tau parts into temporary tau buffer
     269           0 :           work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1)
     270             : 
     271             :           !print *,'generated tau:', tau(offset)
     272             :         else
     273             :           ! Vector exchange part
     274             : 
     275             :           ! determine broadcast size
     276             :           call qr_pdgeqrf_pack_unpack_&
     277             :               &PRECISION &
     278           0 :               (obj,v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows)
     279           0 :           broadcast_size = int(dbroadcast_size(1))
     280             : 
     281             :           call qr_pdgeqrf_pack_unpack_tmatrix_&
     282             :               &PRECISION &
     283             :               (obj,work(temptau_offset+voffset-1),t(voffset,voffset),ldt, &
     284           0 :                                               dtmat_bcast_size(1),-1,lcols,0)
     285           0 :           tmat_bcast_size = dtmat_bcast_size(1)
     286             : 
     287             :           !print *,'broadcast_size (nonqr)',broadcast_size
     288           0 :           broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)
     289             : 
     290             :           ! initiate broadcast (recv part)
     291             : #ifdef WITH_MPI
     292             : 
     293             : #ifdef DOUBLE_PRECISION_REAL
     294             :           call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
     295           0 :                          mpirank_cols_qr,mpicomm_cols,mpierr)
     296             : #else
     297             :           call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real4, &
     298           0 :                          mpirank_cols_qr,mpicomm_cols,mpierr)
     299             : #endif
     300             : 
     301             : #endif
     302             :           ! last n*n elements in buffer are (still empty) T matrix elements
     303             :           ! fetch from first process in each column
     304             : 
     305             :           ! unpack broadcast buffer (v + tau)
     306             :           call qr_pdgeqrf_pack_unpack_&
     307             :               &PRECISION &
     308             :               (obj,v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols, &
     309           0 :               mb,rowidx,idx,rev,1,mpicomm_rows)
     310             : 
     311             :           ! now send t matrix to other processes in our process column
     312           0 :           broadcast_size = int(dbroadcast_size(1))
     313           0 :           tmat_bcast_size = int(dtmat_bcast_size(1))
     314             : 
     315             :           ! t matrix should now be available on all processes => unpack
     316             :           call qr_pdgeqrf_pack_unpack_tmatrix_&
     317             :               &PRECISION &
     318             :               (obj,work(temptau_offset+voffset-1),t(voffset,voffset),ldt, &
     319           0 :                                               work(broadcast_offset+broadcast_size),lwork,lcols,1)
     320             :         end if
     321             : 
     322           0 :         remaining_cols = remaining_cols - lcols
     323             : 
     324             :         ! apply householder vectors to whole trailing matrix parts (if any)
     325             : 
     326           0 :         update_voffset = voffset
     327           0 :         update_tauoffset = icol
     328           0 :         update_lcols = lcols
     329           0 :         incremental_update_size = incremental_update_size + lcols
     330             : 
     331           0 :         icol = icol - lcols
     332             :         ! count colums from first column of global block to current index
     333             :         call local_size_offset_1d(icol,nb,colidx-n+1,colidx-n+1,0, &
     334             :                                       mpirank_cols,mpiprocs_cols, &
     335           0 :                                       lcols,baseoffset,offset)
     336             : 
     337           0 :         if (lcols .gt. 0) then
     338             : 
     339             :           !print *,'updating trailing matrix'
     340             : 
     341           0 :            if (updatemode .eq. ichar('I')) then
     342           0 :              print *,'pdgeqrf_2dcomm: incremental update not yet implemented! rev=1'
     343           0 :            else if (updatemode .eq. ichar('F')) then
     344             :              ! full update no merging
     345             :              call qr_pdlarfb_1dcomm_&
     346             :                 &PRECISION &
     347             :                 (m,mb,lcols,update_lcols,a(1,offset),lda,v(1,update_voffset),ldv, &
     348             :                      work(temptau_offset+update_voffset-1),                          &
     349             :                                                         t(update_voffset,update_voffset),ldt, &
     350           0 :                     rowidx,idx,1,mpicomm_rows,work(work_offset),lwork)
     351             :            else
     352             :             ! full update + merging default
     353             :              call qr_tmerge_pdlarfb_1dcomm_&
     354             :                 &PRECISION &
     355             :                 (m,mb,lcols,n-(update_voffset+update_lcols-1),update_lcols, &
     356             :                                                               v(1,update_voffset),ldv, &
     357             :                           t(update_voffset,update_voffset),ldt, &
     358             :                           a(1,offset),lda,rowidx,1,updatemode,mpicomm_rows, &
     359           0 :                                                               work(work_offset),lwork)
     360             :            end if
     361             :         else
     362           0 :            if (updatemode .eq. ichar('I')) then
     363             :              !print *,'sole merging of (incremental) T matrix', mpirank_cols,  &
     364             :             !                            n-(update_voffset+incremental_update_size-1)
     365             :              call qr_tmerge_pdlarfb_1dcomm_&
     366             :                 &PRECISION &
     367             :                 (m,mb,0,n-(update_voffset+incremental_update_size-1),   &
     368             :                                                               incremental_update_size,v(1,update_voffset),ldv, &
     369             :                           t(update_voffset,update_voffset),ldt, &
     370           0 :                           a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork)
     371             : 
     372             :              ! reset for upcoming incremental updates
     373           0 :              incremental_update_size = 0
     374           0 :           else if (updatemode .eq. ichar('M')) then
     375             :              ! final merge
     376             :             call qr_tmerge_pdlarfb_1dcomm_&
     377             :                 &PRECISION &
     378             :                 (m,mb,0,n-(update_voffset+update_lcols-1),update_lcols, &
     379             :                                                               v(1,update_voffset),ldv, &
     380             :                           t(update_voffset,update_voffset),ldt, &
     381           0 :                           a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork)
     382             :           else
     383             :             ! full updatemode - nothing to update
     384             :           end if
     385             : 
     386             :           ! reset for upcoming incremental updates
     387           0 :           incremental_update_size = 0
     388             :         end if
     389             :       end do
     390             : 
     391           0 :       if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then
     392             :         ! finally merge all small T parts
     393             :         call qr_pdlarft_tree_merge_1dcomm_&
     394             : &PRECISION &
     395           0 : (m,mb,n,size2d,tmerge,v,ldv,t,ldt,rowidx,rev,mpicomm_rows,work,lwork)
     396             :       end if
     397             : 
     398             :       !print *,'stop decomposition',rowidx,colidx
     399             :       call obj%timer%stop("qr_pdgeqrf_2dcomm_&
     400             :           &PRECISION&
     401           0 :           &")
     402             :     end subroutine
     403             : 
     404             :     subroutine qr_pdgeqrf_1dcomm_&
     405           0 : &PRECISION &
     406           0 : (obj,a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans, &
     407           0 :           PQRPARAM,mpicomm,blockheuristic)
     408             :       use precision
     409             :       use elpa1_impl
     410             :       use elpa_abstract_impl
     411             :       implicit none
     412             : 
     413             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     414             :       ! parameter setup
     415             :       INTEGER(kind=ik), parameter  :: gmode_ = 1,rank_ = 2,eps_ = 3
     416             : 
     417             :       ! input variables (local)
     418             :       integer(kind=ik)             :: lda,lwork,ldv,ldt
     419             :       real(kind=C_DATATYPE_KIND)                :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)
     420             : 
     421             :       ! input variables (global)
     422             :       integer(kind=ik)             :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm
     423             : #ifdef USE_ASSUMED_SIZE_QR
     424             :       integer(kind=ik)             :: PQRPARAM(*)
     425             : 
     426             : #else
     427             :       integer(kind=ik)             :: PQRPARAM(:)
     428             : #endif
     429             :       ! derived input variables
     430             : 
     431             :       ! derived further input variables from QR_PQRPARAM
     432             :       integer(kind=ik)             :: size1d,updatemode,tmerge
     433             : 
     434             :       ! output variables (global)
     435             :       real(kind=C_DATATYPE_KIND)                :: blockheuristic(*)
     436             : 
     437             :       ! local scalars
     438             :       integer(kind=ik)             :: nr_blocks,remainder,current_block,aoffset,idx,updatesize
     439             :       real(kind=C_DATATYPE_KIND)                :: pdgeqr2_size(1),pdlarfb_size(1),tmerge_tree_size(1)
     440             :       call obj%timer%start("qr_pdgeqrf_1dcomm_&
     441             :           &PRECISION&
     442           0 :           &")
     443           0 :       size1d     = max(min(PQRPARAM(1),n),1)
     444           0 :       updatemode = PQRPARAM(2)
     445           0 :       tmerge     = PQRPARAM(3)
     446             : 
     447           0 :       if (lwork .eq. -1) then
     448             : #ifdef USE_ASSUMED_SIZE_QR
     449             :         call qr_pdgeqr2_1dcomm_&
     450             : &PRECISION &
     451             : (obj,a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, &
     452             :                                   m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4),mpicomm,blockheuristic)
     453             : #else
     454             :         call qr_pdgeqr2_1dcomm_&
     455             : &PRECISION &
     456             : (obj,a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, &
     457           0 :                                   m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4:),mpicomm,blockheuristic)
     458             : #endif
     459             :         ! reserve more space for incremental mode
     460             :         call qr_tmerge_pdlarfb_1dcomm_&
     461             : &PRECISION &
     462             : (m,mb,n,n,n,v,ldv,t,ldt, &
     463           0 :                                          a,lda,baseidx,rev,updatemode,mpicomm,pdlarfb_size,-1)
     464             : 
     465             :         call qr_pdlarft_tree_merge_1dcomm_&
     466             : &PRECISION &
     467           0 : (m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,tmerge_tree_size,-1)
     468             : 
     469           0 :         work(1) = max(pdlarfb_size(1),pdgeqr2_size(1),tmerge_tree_size(1))
     470             :       call obj%timer%stop("qr_pdgeqrf_1dcomm_&
     471             :           &PRECISION&
     472           0 :           &")
     473           0 :         return
     474             :       end if
     475             : 
     476           0 :       nr_blocks = n / size1d
     477           0 :       remainder = n - nr_blocks*size1d
     478             : 
     479           0 :       current_block = 0
     480           0 :       do while (current_block .lt. nr_blocks)
     481           0 :         idx = rowidx-current_block*size1d
     482           0 :         updatesize = n-(current_block+1)*size1d
     483           0 :         aoffset = 1+updatesize
     484             : #ifdef USE_ASSUMED_SIZE_QR
     485             :         call qr_pdgeqr2_1dcomm_&
     486             : &PRECISION &
     487             : (obj,a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, &
     488             :                                 m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic)
     489             : 
     490             : #else
     491             :         call qr_pdgeqr2_1dcomm_&
     492             : &PRECISION &
     493             : (obj,a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, &
     494           0 :                                 m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic)
     495             : #endif
     496           0 :         if (updatemode .eq. ichar('M')) then
     497             :           ! full update + merging
     498             :           call qr_tmerge_pdlarfb_1dcomm_&
     499             : &PRECISION &
     500             : (m,mb,updatesize,current_block*size1d,size1d, &
     501             :                                            v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
     502           0 :                                            a,lda,baseidx,1,ichar('F'),mpicomm,work,lwork)
     503           0 :         else if (updatemode .eq. ichar('I')) then
     504           0 :           if (updatesize .ge. size1d) then
     505             :             ! incremental update + merging
     506             :             call qr_tmerge_pdlarfb_1dcomm_&
     507             : &PRECISION &
     508             : (m,mb,size1d,current_block*size1d,size1d, &
     509             :                                                v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
     510           0 :                                                a(1,aoffset-size1d),lda,baseidx,1,updatemode,mpicomm,work,lwork)
     511             : 
     512             :           else ! only remainder left
     513             :              ! incremental update + merging
     514             :              call qr_tmerge_pdlarfb_1dcomm_&
     515             : &PRECISION &
     516             : (m,mb,remainder,current_block*size1d,size1d, &
     517             :                                                v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
     518           0 :                                                a(1,1),lda,baseidx,1,updatemode,mpicomm,work,lwork)
     519             :           end if
     520             :         else ! full update no merging is default
     521             :           ! full update no merging
     522             :           call qr_pdlarfb_1dcomm_&
     523             : &PRECISION &
     524             : (m,mb,updatesize,size1d,a,lda,v(1,aoffset),ldv, &
     525           0 :                                     tau(aoffset),t(aoffset,aoffset),ldt,baseidx,idx,1,mpicomm,work,lwork)
     526             :         end if
     527             : 
     528             :         ! move on to next block
     529           0 :         current_block = current_block+1
     530             :       end do
     531             : 
     532           0 :       if (remainder .gt. 0) then
     533           0 :         aoffset = 1
     534           0 :         idx = rowidx-size1d*nr_blocks
     535             : #ifdef USE_ASSUMED_SIZE_QR
     536             :         call qr_pdgeqr2_1dcomm_&
     537             : &PRECISION &
     538             : (obj,a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, &
     539             :                                   m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic)
     540             : 
     541             : #else
     542             :         call qr_pdgeqr2_1dcomm_&
     543             : &PRECISION &
     544             : (obj,a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, &
     545           0 :                                   m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic)
     546             : #endif
     547           0 :         if ((updatemode .eq. ichar('I')) .or. (updatemode .eq. ichar('M'))) then
     548             :           ! final merging
     549             :           call qr_tmerge_pdlarfb_1dcomm_&
     550             : &PRECISION &
     551             : (m,mb,0,size1d*nr_blocks,remainder, &
     552             :                                              v,ldv,t,ldt, &
     553           0 :                                              a,lda,baseidx,1,updatemode,mpicomm,work,lwork) ! updatemode argument does not matter
     554             :         end if
     555             :       end if
     556             : 
     557           0 :       if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then
     558             :         ! finally merge all small T parts
     559             :         call qr_pdlarft_tree_merge_1dcomm_&
     560             : &PRECISION &
     561           0 : (m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork)
     562             :       end if
     563             :       call obj%timer%stop("qr_pdgeqrf_1dcomm_&
     564             :           &PRECISION&
     565           0 :           &")
     566             : 
     567             :     end subroutine
     568             : 
     569             :     ! local a and tau are assumed to be positioned at the right column from a local
     570             :     ! perspective
     571             :     ! TODO: if local amount of data turns to zero the algorithm might produce wrong
     572             :     ! results (probably due to old buffer contents)
     573             :     subroutine qr_pdgeqr2_1dcomm_&
     574           0 : &PRECISION &
     575           0 : (obj,a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev, &
     576           0 :           trans,PQRPARAM,mpicomm,blockheuristic)
     577             :       use precision
     578             :       !use elpa1_impl ! check this
     579             :       use elpa_abstract_impl
     580             :       implicit none
     581             : 
     582             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     583             :       ! parameter setup
     584             :       INTEGER(kind=ik), parameter   :: gmode_ = 1,rank_ = 2 ,eps_ = 3, upmode1_ = 4
     585             : 
     586             :       ! input variables (local)
     587             :       integer(kind=ik)              :: lda,lwork,ldv,ldt
     588             :       real(kind=C_DATATYPE_KIND)                 :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)
     589             : 
     590             :       ! input variables (global)
     591             :       integer(kind=ik)              :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm
     592             : #ifdef USE_ASSUMED_SIZE_QR
     593             :       integer(kind=ik)              :: PQRPARAM(*)
     594             : #else
     595             :       integer(kind=ik)              :: PQRPARAM(:)
     596             : #endif
     597             :       ! output variables (global)
     598             :       real(kind=C_DATATYPE_KIND)                 :: blockheuristic(*)
     599             : 
     600             :       ! derived further input variables from QR_PQRPARAM
     601             :       integer(kind=ik)              ::  maxrank,hgmode,updatemode
     602             : 
     603             :       ! local scalars
     604             :       integer(kind=ik)              :: icol,incx,idx
     605             :       real(kind=C_DATATYPE_KIND)                 :: pdlarfg_size(1),pdlarf_size(1),total_size
     606             :       real(kind=C_DATATYPE_KIND)                 :: pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1)
     607             :       real(kind=C_DATATYPE_KIND)                 :: pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1)
     608             :       integer(kind=ik)              :: mpirank,mpiprocs,mpierr
     609             :       integer(kind=ik)              :: rank,lastcol,actualrank,nextrank
     610             :       integer(kind=ik)              :: update_cols,decomposition_cols
     611             :       integer(kind=ik)              :: current_column
     612             :       call obj%timer%start("qr_pdgeqr2_1dcomm_&
     613             :           &PRECISION&
     614           0 :           &")
     615             : 
     616           0 :       maxrank    = min(PQRPARAM(1),n)
     617           0 :       updatemode = PQRPARAM(2)
     618           0 :       hgmode     = PQRPARAM(4)
     619           0 :       call MPI_Comm_rank(mpicomm, mpirank, mpierr)
     620           0 :       call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
     621           0 :       if (trans .eq. 1) then
     622           0 :         incx = lda
     623             :       else
     624           0 :         incx = 1
     625             :       end if
     626             : 
     627           0 :       if (lwork .eq. -1) then
     628             : 
     629             :         call qr_pdlarfg_1dcomm_&
     630             : &PRECISION &
     631           0 : (obj,a,incx,tau(1),pdlarfg_size(1),-1,n,rowidx,mb,hgmode,rev,mpicomm)
     632             :         call qr_pdlarfl_1dcomm_&
     633             : &PRECISION &
     634           0 : (v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,n,rowidx,mb,rev,mpicomm)
     635             : #ifdef USE_ASSUMED_SIZE_QR
     636             :         call qr_pdlarfg2_1dcomm_ref_&
     637             : &PRECISION &
     638             : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM, &
     639             :                                     rev,mpicomm,actualrank)
     640             : 
     641             :         call qr_pdlarfgk_1dcomm_&
     642             : &PRECISION &
     643             : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank)
     644             : 
     645             : #else
     646             :         call qr_pdlarfg2_1dcomm_ref_&
     647             : &PRECISION &
     648             : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM(:), &
     649           0 :                                     rev,mpicomm,actualrank)
     650             : 
     651             :         call qr_pdlarfgk_1dcomm_&
     652             : &PRECISION &
     653             : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n, &
     654           0 :             rowidx,mb,PQRPARAM(:),rev,mpicomm,actualrank)
     655             : #endif
     656             :         call qr_pdlarfl2_tmatrix_1dcomm_&
     657             : &PRECISION &
     658           0 : (v,ldv,baseidx,a,lda,t,ldt,pdlarfl2_size(1),-1,m,n,rowidx,mb,rev,mpicomm)
     659             : #ifdef DOUBLE_PRECISION_REAL
     660           0 :         pdlarft_size(1) = 0.0_rk8
     661             : #else
     662           0 :         pdlarft_size(1) = 0.0_rk4
     663             : #endif
     664             :         call qr_pdlarfb_1dcomm_&
     665             : &PRECISION &
     666           0 : (m,mb,n,n,a,lda,v,ldv,tau,t,ldt,baseidx,rowidx,1,mpicomm,pdlarfb_size(1),-1)
     667             : #ifdef DOUBLE_PRECISION_REAL
     668           0 :         pdlarft_pdlarfb_size(1) = 0.0_rk8
     669             : #else
     670           0 :         pdlarft_pdlarfb_size(1) = 0.0_rk4
     671             : #endif
     672             :         call qr_tmerge_pdlarfb_1dcomm_&
     673             : &PRECISION &
     674             : (m,mb,n,n,n,v,ldv,t,ldt,a,lda,rowidx,rev,&
     675           0 :             updatemode,mpicomm,tmerge_pdlarfb_size(1),-1)
     676             : 
     677             :         total_size = max(pdlarfg_size(1),pdlarf_size(1),pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1),pdlarft_size(1), &
     678           0 :                          pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1))
     679             : 
     680           0 :         work(1) = total_size
     681             :       call obj%timer%stop("qr_pdgeqr2_1dcomm_&
     682             :           &PRECISION&
     683           0 :           &")
     684           0 :         return
     685             :       end if
     686             : 
     687           0 :       icol = 1
     688           0 :       lastcol = min(rowidx,n)
     689           0 :       decomposition_cols = lastcol
     690           0 :       update_cols = n
     691           0 :       do while (decomposition_cols .gt. 0) ! local qr block
     692           0 :         icol = lastcol-decomposition_cols+1
     693           0 :         idx = rowidx-icol+1
     694             : 
     695             :         ! get possible rank size
     696             :         ! limited by number of columns and remaining rows
     697           0 :         rank = min(n-icol+1,maxrank,idx)
     698             : 
     699           0 :         current_column = n-icol+1-rank+1
     700             : 
     701           0 :         if (rank .eq. 1) then
     702             : 
     703             :           call qr_pdlarfg_1dcomm_&
     704             : &PRECISION &
     705             : (obj,a(1,current_column),incx, &
     706             :                                   tau(current_column),work,lwork, &
     707           0 :                                   m,idx,mb,hgmode,1,mpicomm)
     708             : #ifdef DOUBLE_PRECISION_REAL
     709           0 :           v(1:ldv,current_column) = 0.0_rk8
     710             : #else
     711           0 :           v(1:ldv,current_column) = 0.0_rk4
     712             : #endif
     713             :           call qr_pdlarfg_copy_1dcomm_&
     714             : &PRECISION &
     715             : (obj,a(1,current_column),incx, &
     716             :                                        v(1,current_column),1, &
     717           0 :                                        m,baseidx,idx,mb,1,mpicomm)
     718             : 
     719             :           ! initialize t matrix part
     720           0 :           t(current_column,current_column) = tau(current_column)
     721             : 
     722           0 :           actualrank = 1
     723             : 
     724           0 :         else if (rank .eq. 2) then
     725             : #ifdef USE_ASSUMED_SIZE_QR
     726             :           call qr_pdlarfg2_1dcomm_ref_&
     727             : &PRECISION &
     728             : (obj,a(1,current_column),lda,tau(current_column), &
     729             :                                          t(current_column,current_column),ldt,v(1,current_column),ldv, &
     730             :                                         baseidx,work,lwork,m,idx,mb,PQRPARAM,1,mpicomm,actualrank)
     731             : 
     732             : #else
     733             :           call qr_pdlarfg2_1dcomm_ref_&
     734             : &PRECISION &
     735             : (obj,a(1,current_column),lda,tau(current_column), &
     736             :                                          t(current_column,current_column),ldt,v(1,current_column),ldv, &
     737           0 :                                         baseidx,work,lwork,m,idx,mb,PQRPARAM(:),1,mpicomm,actualrank)
     738             : #endif
     739             :         else
     740             : #ifdef USE_ASSUMED_SIZE_QR
     741             :           call qr_pdlarfgk_1dcomm_&
     742             : &PRECISION &
     743             : (obj,a(1,current_column),lda,tau(current_column), &
     744             :                                      t(current_column,current_column),ldt,v(1,current_column),ldv, &
     745             :                                      baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank)
     746             : 
     747             : #else
     748             :           call qr_pdlarfgk_1dcomm_&
     749             : &PRECISION &
     750             : (obj,a(1,current_column),lda,tau(current_column), &
     751             :                                      t(current_column,current_column),ldt,v(1,current_column),ldv, &
     752           0 :                                      baseidx,work,lwork,m,rank,idx,mb,PQRPARAM(:),1,mpicomm,actualrank)
     753             : #endif
     754             :         end if
     755             : 
     756           0 :         blockheuristic(actualrank) = blockheuristic(actualrank) + 1
     757             : 
     758             :         ! the blocked decomposition versions already updated their non
     759             :         ! decomposed parts using their information after communication
     760           0 :         update_cols = decomposition_cols - rank
     761           0 :         decomposition_cols = decomposition_cols - actualrank
     762             : 
     763             :         ! needed for incremental update
     764           0 :         nextrank = min(n-(lastcol-decomposition_cols+1)+1,maxrank,rowidx-(lastcol-decomposition_cols+1)+1)
     765             : 
     766           0 :         if (current_column .gt. 1) then
     767           0 :           idx = rowidx-icol+1
     768             : 
     769           0 :           if (updatemode .eq. ichar('I')) then
     770             :             ! incremental update + merging
     771             :             call qr_tmerge_pdlarfb_1dcomm_&
     772             : &PRECISION &
     773             : (m,mb,nextrank-(rank-actualrank),n-(current_column+rank-1),actualrank, &
     774             :                                           v(1,current_column+(rank-actualrank)),ldv, &
     775             :                                           t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
     776             :                                           a(1,current_column-nextrank+(rank-actualrank)),lda,baseidx,rev,updatemode,&
     777           0 :                                           mpicomm,work,lwork)
     778             :           else
     779             :             ! full update + merging
     780             :             call qr_tmerge_pdlarfb_1dcomm_&
     781             : &PRECISION &
     782             : (m,mb,update_cols,n-(current_column+rank-1),actualrank, &
     783             :                                           v(1,current_column+(rank-actualrank)),ldv, &
     784             :                                           t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
     785           0 :                                           a(1,1),lda,baseidx,rev,updatemode,mpicomm,work,lwork)
     786             :           end if
     787             :         else
     788             :           call qr_tmerge_pdlarfb_1dcomm_&
     789             : &PRECISION &
     790             : (m,mb,0,n-(current_column+rank-1),actualrank, &
     791             :               v(1,current_column+(rank-actualrank)), &
     792             :                                           ldv, &
     793             :                                           t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
     794           0 :                                            a,lda,baseidx,rev,updatemode,mpicomm,work,lwork)
     795             :         end if
     796             : 
     797             :       end do
     798             :       call obj%timer%stop("qr_pdgeqr2_1dcomm_&
     799             :           &PRECISION&
     800           0 :           &")
     801             :     end subroutine
     802             : 
     803             :     ! incx == 1: column major
     804             :     ! incx != 1: row major
     805             :     subroutine qr_pdlarfg_1dcomm_&
     806           0 : &PRECISION &
     807             : (obj,x,incx,tau,work,lwork,n,idx,nb,hgmode,rev,communicator)
     808             : 
     809             :       use precision
     810             :       !use elpa1_impl !check this
     811             :       use qr_utils_mod
     812             :       use elpa_abstract_impl
     813             :       implicit none
     814             : 
     815             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     816             :       ! parameter setup
     817             :       INTEGER(kind=ik), parameter    :: gmode_ = 1,rank_ = 2, eps_ = 3
     818             : 
     819             :       ! input variables (local)
     820             :       integer(kind=ik)               :: incx,lwork,hgmode
     821             :       real(kind=C_DATATYPE_KIND)                  :: x(*),work(*)
     822             : 
     823             :       ! input variables (global)
     824             :       integer(kind=ik)               :: communicator,nb,idx,n,rev
     825             : 
     826             :       ! output variables (global)
     827             :       real(kind=C_DATATYPE_KIND)                  :: tau
     828             : 
     829             :       ! local scalars
     830             :       integer(kind=ik)               :: mpierr,mpirank,mpiprocs,mpirank_top
     831             :       integer(kind=ik)               :: sendsize,recvsize
     832             :       integer(kind=ik)               :: local_size,local_offset,baseoffset
     833             :       integer(kind=ik)               :: topidx,top,iproc
     834             :       real(kind=C_DATATYPE_KIND)                  :: alpha,xnorm,dot,xf
     835             : 
     836             :       ! external functions
     837             : #ifdef DOUBLE_PRECISION_REAL
     838             :       real(kind=C_DATATYPE_KIND), external        :: ddot,dlapy2,dnrm2
     839             : #else
     840             :       real(kind=C_DATATYPE_KIND), external        :: sdot,slapy2,snrm2
     841             : #endif
     842             :       external                       :: dscal
     843             : 
     844             :       ! intrinsic
     845             : !      intrinsic sign
     846             :       call obj%timer%start("qr_pdlarfg_1dcomm_&
     847             :           &PRECISION&
     848           0 :           &")
     849           0 :       if (idx .le. 1) then
     850           0 :         tau = 0.0d0
     851             :       call obj%timer%stop("qr_pdlarfg_1dcomm_&
     852             :           &PRECISION&
     853           0 :           &")
     854           0 :         return
     855             :        end if
     856           0 :       call MPI_Comm_rank(communicator, mpirank, mpierr)
     857           0 :       call MPI_Comm_size(communicator, mpiprocs, mpierr)
     858             :       ! calculate expected work size and store in work(1)
     859           0 :       if (hgmode .eq. ichar('s')) then
     860             :         ! allreduce (MPI_SUM)
     861           0 :         sendsize = 2
     862           0 :         recvsize = sendsize
     863           0 :       else if (hgmode .eq. ichar('x')) then
     864             :         ! alltoall
     865           0 :         sendsize = mpiprocs*2
     866           0 :         recvsize = sendsize
     867           0 :       else if (hgmode .eq. ichar('g')) then
     868             :         ! allgather
     869           0 :         sendsize = 2
     870           0 :         recvsize = mpiprocs*sendsize
     871             :       else
     872             :         ! no exchange at all (benchmarking)
     873           0 :         sendsize = 2
     874           0 :         recvsize = sendsize
     875             :       end if
     876             : 
     877           0 :       if (lwork .eq. -1) then
     878             : #ifdef DOUBLE_PRECISION_REAL
     879           0 :         work(1) = real(sendsize + recvsize,kind=C_DATATYPE_KIND)
     880             : #else
     881           0 :         work(1) = real(sendsize + recvsize,kind=rk4)
     882             : #endif
     883             : 
     884             :       call obj%timer%stop("qr_pdlarfg_1dcomm_&
     885             :           &PRECISION&
     886           0 :           &")
     887           0 :         return
     888             :       end if
     889             : 
     890             :       ! Processor id for global index of top element
     891           0 :       mpirank_top = MOD((idx-1)/nb,mpiprocs)
     892           0 :       if (mpirank .eq. mpirank_top) then
     893           0 :         topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
     894           0 :         top = 1+(topidx-1)*incx
     895             :       end if
     896             : 
     897             :       call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
     898           0 :                         local_size,baseoffset,local_offset)
     899             : 
     900           0 :       local_offset = local_offset * incx
     901             : 
     902             :       ! calculate and exchange information
     903           0 :       if (hgmode .eq. ichar('s')) then
     904           0 :         if (mpirank .eq. mpirank_top) then
     905           0 :           alpha = x(top)
     906             :         else
     907             : #ifdef DOUBLE_PRECISION_REAL
     908           0 :           alpha = 0.0_rk8
     909             : #else
     910           0 :           alpha = 0.0_rk4
     911             : #endif
     912             :         end if
     913             : #ifdef DOUBLE_PRECISION_REAL
     914             :         dot = ddot(local_size, &
     915             :                      x(local_offset), incx, &
     916           0 :                      x(local_offset), incx)
     917             : #else
     918             :         dot = sdot(local_size, &
     919             :                      x(local_offset), incx, &
     920           0 :                      x(local_offset), incx)
     921             : #endif
     922           0 :         work(1) = alpha
     923           0 :         work(2) = dot
     924             : #ifdef WITH_MPI
     925             : 
     926             : #ifdef DOUBLE_PRECISION_REAL
     927             :         call mpi_allreduce(work(1),work(sendsize+1), &
     928             :                              sendsize,mpi_real8,mpi_sum, &
     929           0 :                              communicator,mpierr)
     930             : #else
     931             :         call mpi_allreduce(work(1),work(sendsize+1), &
     932             :                              sendsize,mpi_real4,mpi_sum, &
     933           0 :                              communicator,mpierr)
     934             : #endif
     935             : 
     936             : #else
     937           0 :         work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
     938             : #endif
     939           0 :         alpha = work(sendsize+1)
     940           0 :         xnorm = sqrt(work(sendsize+2))
     941           0 :       else if (hgmode .eq. ichar('x')) then
     942           0 :         if (mpirank .eq. mpirank_top) then
     943           0 :           alpha = x(top)
     944             :         else
     945             : #ifdef DOUBLE_PRECISION_REAL
     946           0 :           alpha = 0.0_rk8
     947             : #else
     948           0 :           alpha = 0.0_rk4
     949             : #endif
     950             :         end if
     951             : #ifdef DOUBLE_PRECISION_REAL
     952           0 :         xnorm = dnrm2(local_size, x(local_offset), incx)
     953             : #else
     954           0 :         xnorm = snrm2(local_size, x(local_offset), incx)
     955             : #endif
     956           0 :         do iproc=0,mpiprocs-1
     957           0 :           work(2*iproc+1) = alpha
     958           0 :           work(2*iproc+2) = xnorm
     959             :         end do
     960             : #ifdef WITH_MPI
     961             : 
     962             : #ifdef DOUBLE_PRECISION_REAL
     963             :         call mpi_alltoall(work(1),2,mpi_real8, &
     964             :                             work(sendsize+1),2,mpi_real8, &
     965           0 :                             communicator,mpierr)
     966             : #else
     967             :         call mpi_alltoall(work(1),2,mpi_real4, &
     968             :                             work(sendsize+1),2,mpi_real4, &
     969           0 :                             communicator,mpierr)
     970             : #endif
     971             : 
     972             : #else
     973           0 :         work(sendsize+1:sendsize+1+2-1) = work(1:2)
     974             : #endif
     975             :         ! extract alpha value
     976           0 :         alpha = work(sendsize+1+mpirank_top*2)
     977             : 
     978             :         ! copy norm parts of buffer to beginning
     979           0 :         do iproc=0,mpiprocs-1
     980           0 :           work(iproc+1) = work(sendsize+1+2*iproc+1)
     981             :         end do
     982             : 
     983             : #ifdef DOUBLE_PRECISION_REAL
     984           0 :         xnorm = dnrm2(mpiprocs, work(1), 1)
     985             : #else
     986           0 :         xnorm = snrm2(mpiprocs, work(1), 1)
     987             : #endif
     988           0 :       else if (hgmode .eq. ichar('g')) then
     989           0 :         if (mpirank .eq. mpirank_top) then
     990           0 :           alpha = x(top)
     991             :         else
     992             : #ifdef DOUBLE_PRECISION_REAL
     993           0 :           alpha = 0.0_rk8
     994             : #else
     995           0 :           alpha = 0.0_rk4
     996             : #endif
     997             :         end if
     998             : #ifdef DOUBLE_PRECISION_REAL
     999           0 :         xnorm = dnrm2(local_size, x(local_offset), incx)
    1000             : #else
    1001           0 :         xnorm = snrm2(local_size, x(local_offset), incx)
    1002             : #endif
    1003           0 :         work(1) = alpha
    1004           0 :         work(2) = xnorm
    1005             : 
    1006             :         ! allgather
    1007             : #ifdef WITH_MPI
    1008             : 
    1009             : #ifdef DOUBLE_PRECISION_REAL
    1010             :         call mpi_allgather(work(1),sendsize,mpi_real8, &
    1011             :                             work(sendsize+1),sendsize,mpi_real8, &
    1012           0 :                             communicator,mpierr)
    1013             : #else
    1014             :         call mpi_allgather(work(1),sendsize,mpi_real4, &
    1015             :                             work(sendsize+1),sendsize,mpi_real4, &
    1016           0 :                             communicator,mpierr)
    1017             : #endif
    1018             : 
    1019             : #else
    1020           0 :        work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
    1021             : #endif
    1022             :         ! extract alpha value
    1023           0 :         alpha = work(sendsize+1+mpirank_top*2)
    1024             : 
    1025             :         ! copy norm parts of buffer to beginning
    1026           0 :         do iproc=0,mpiprocs-1
    1027           0 :           work(iproc+1) = work(sendsize+1+2*iproc+1)
    1028             :         end do
    1029             : #ifdef DOUBLE_PRECISION_REAL
    1030           0 :         xnorm = dnrm2(mpiprocs, work(1), 1)
    1031             : #else
    1032           0 :         xnorm = snrm2(mpiprocs, work(1), 1)
    1033             : #endif
    1034             :       else
    1035             :         ! dnrm2
    1036             : #ifdef DOUBLE_PRECISION_REAL
    1037           0 :         xnorm = dnrm2(local_size, x(local_offset), incx)
    1038             : #else
    1039           0 :         xnorm = snrm2(local_size, x(local_offset), incx)
    1040             : #endif
    1041           0 :         if (mpirank .eq. mpirank_top) then
    1042           0 :           alpha = x(top)
    1043             :         else
    1044             : #ifdef DOUBLE_PRECISION_REAL
    1045           0 :           alpha = 0.0_rk8
    1046             : #else
    1047           0 :           alpha = 0.0_rk4
    1048             : #endif
    1049             :         end if
    1050             : 
    1051             :         ! no exchange at all (benchmarking)
    1052             : #ifdef DOUBLE_PRECISION_REAL
    1053           0 :         xnorm = 0.0_rk8
    1054             : #else
    1055           0 :         xnorm = 0.0_rk4
    1056             : #endif
    1057             :       end if
    1058             : 
    1059             :       !print *,'ref hg:', idx,xnorm,alpha
    1060             :       !print *,x(1:n)
    1061             : 
    1062             :       ! calculate householder information
    1063             : #ifdef DOUBLE_PRECISION_REAL
    1064           0 :       if (xnorm .eq. 0.0_rk8) then
    1065             :         ! H = I
    1066             : 
    1067           0 :         tau = 0.0_rk8
    1068             : #else
    1069           0 :       if (xnorm .eq. 0.0_rk4) then
    1070             :         ! H = I
    1071             : 
    1072           0 :         tau = 0.0_rk4
    1073             : #endif
    1074             :       else
    1075             :         ! General case
    1076             :         call hh_transform_real_&
    1077             : &PRECISION &
    1078           0 : (obj,alpha,xnorm**2,xf,tau, .false.)
    1079           0 :         if (mpirank .eq. mpirank_top) then
    1080           0 :           x(top) = alpha
    1081             :         end if
    1082             : #ifdef DOUBLE_PRECISION_REAL
    1083             :         call dscal(local_size, xf, &
    1084           0 :                      x(local_offset), incx)
    1085             : #else
    1086             :         call sscal(local_size, xf, &
    1087           0 :                      x(local_offset), incx)
    1088             : #endif
    1089             : 
    1090             :         ! TODO: reimplement norm rescale method of
    1091             :         ! original PDLARFG using mpi?
    1092             : 
    1093             :       end if
    1094             : 
    1095             :       ! useful for debugging
    1096             :       !print *,'hg:mpirank,idx,beta,alpha:',mpirank,idx,beta,alpha,1.0d0/(beta+alpha),tau
    1097             :       !print *,x(1:n)
    1098             :       call obj%timer%stop("qr_pdlarfg_1dcomm_&
    1099             :           &PRECISION&
    1100           0 :           &")
    1101             :     end subroutine
    1102             : 
    1103             :     subroutine qr_pdlarfg2_1dcomm_ref_&
    1104           0 : &PRECISION &
    1105           0 : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,idx,mb,PQRPARAM,rev,mpicomm,actualk)
    1106             :       use precision
    1107             :       use elpa_abstract_impl
    1108             :       implicit none
    1109             : 
    1110             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    1111             :       ! parameter setup
    1112             :       INTEGER(kind=ik), parameter    :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4
    1113             :       ! input variables (local)
    1114             :       integer(kind=ik)               :: lda,lwork,ldv,ldt
    1115             :       real(kind=C_DATATYPE_KIND)                  :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)
    1116             : 
    1117             :       ! input variables (global)
    1118             :       integer(kind=ik)               :: m,idx,baseidx,mb,rev,mpicomm
    1119             : #ifdef USE_ASSUMED_SIZE_QR
    1120             :       integer(kind=ik)               :: PQRPARAM(*)
    1121             : #else
    1122             :       integer(kind=ik)               :: PQRPARAM(:)
    1123             : #endif
    1124             :       ! output variables (global)
    1125             :       integer(kind=ik)               :: actualk
    1126             : 
    1127             :       ! derived input variables from QR_PQRPARAM
    1128             :       integer(kind=ik)               :: eps
    1129             : 
    1130             :       ! local scalars
    1131             :       real(kind=C_DATATYPE_KIND)                  :: dseedwork_size(1)
    1132             :       integer(kind=ik)               :: seedwork_size,seed_size
    1133             :       integer(kind=ik)               :: seedwork_offset,seed_offset
    1134             :       logical                        :: accurate
    1135             :       call obj%timer%start("qr_pdlarfg2_1dcomm_&
    1136             :           &PRECISION&
    1137           0 :           &")
    1138             : 
    1139             :       call qr_pdlarfg2_1dcomm_seed_&
    1140             : &PRECISION &
    1141           0 : (obj,a,lda,dseedwork_size(1),-1,work,m,mb,idx,rev,mpicomm)
    1142           0 :       seedwork_size = int(dseedwork_size(1))
    1143           0 :       seed_size = seedwork_size
    1144             : 
    1145           0 :       if (lwork .eq. -1) then
    1146           0 :         work(1) = seedwork_size + seed_size
    1147             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_&
    1148             :           &PRECISION&
    1149           0 :           &")
    1150             : 
    1151           0 :         return
    1152             :       end if
    1153             : 
    1154           0 :       seedwork_offset = 1
    1155           0 :       seed_offset = seedwork_offset + seedwork_size
    1156             : 
    1157           0 :       eps = PQRPARAM(3)
    1158             : 
    1159             :       ! check for border cases (only a 2x2 matrix left)
    1160           0 :       if (idx .le. 1) then
    1161             : #ifdef DOUBLE_PRECISION_REAL
    1162           0 :         tau(1:2) = 0.0_rk8
    1163           0 :          t(1:2,1:2) = 0.0_rk8
    1164             : #else
    1165           0 :         tau(1:2) = 0.0_rk4
    1166           0 :          t(1:2,1:2) = 0.0_rk4
    1167             : #endif
    1168             : 
    1169             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_&
    1170             :           &PRECISION&
    1171           0 :           &")
    1172             : 
    1173           0 :          return
    1174             :       end if
    1175             : 
    1176             :       call qr_pdlarfg2_1dcomm_seed_&
    1177             : &PRECISION &
    1178           0 : (obj,a,lda,work(seedwork_offset),lwork,work(seed_offset),m,mb,idx,rev,mpicomm)
    1179             : 
    1180           0 :       if (eps .gt. 0) then
    1181             :         accurate = qr_pdlarfg2_1dcomm_check_&
    1182             : &PRECISION &
    1183           0 : (obj,work(seed_offset),eps)
    1184             :       else
    1185           0 :         accurate = .true.
    1186             :       end if
    1187             : 
    1188             :       call qr_pdlarfg2_1dcomm_vector_&
    1189             : &PRECISION &
    1190             : (obj,a(1,2),1,tau(2),work(seed_offset), &
    1191           0 :                                           m,mb,idx,0,1,mpicomm)
    1192             : 
    1193             :       call qr_pdlarfg_copy_1dcomm_&
    1194             : &PRECISION &
    1195             : (obj,a(1,2),1, &
    1196             :                                        v(1,2),1, &
    1197           0 :                                        m,baseidx,idx,mb,1,mpicomm)
    1198             : 
    1199             :       call qr_pdlarfg2_1dcomm_update_&
    1200             : &PRECISION &
    1201           0 : (obj,v(1,2),1,baseidx,a(1,1),lda,work(seed_offset),m,idx,mb,rev,mpicomm)
    1202             : 
    1203             :       ! check for 2x2 matrix case => only one householder Vector will be
    1204             :       ! generated
    1205           0 :       if (idx .gt. 2) then
    1206           0 :         if (accurate .eqv. .true.) then
    1207             :           call qr_pdlarfg2_1dcomm_vector_&
    1208             : &PRECISION &
    1209             : (obj,a(1,1),1,tau(1),work(seed_offset), &
    1210           0 :                                                   m,mb,idx-1,1,1,mpicomm)
    1211             : 
    1212             :           call qr_pdlarfg_copy_1dcomm_&
    1213             : &PRECISION &
    1214             : (obj,a(1,1),1, &
    1215             :                                                v(1,1),1, &
    1216           0 :                                                m,baseidx,idx-1,mb,1,mpicomm)
    1217             : 
    1218             :           ! generate fuse element
    1219             :           call qr_pdlarfg2_1dcomm_finalize_tmatrix_&
    1220             : &PRECISION &
    1221           0 : (obj,work(seed_offset),tau,t,ldt)
    1222             : 
    1223           0 :           actualk = 2
    1224             :         else
    1225             : #ifdef DOUBLE_PRECISION_REAL
    1226           0 :           t(1,1) = 0.0_rk8
    1227           0 :           t(1,2) = 0.0_rk8
    1228             : #else
    1229           0 :           t(1,1) = 0.0_rk4
    1230           0 :           t(1,2) = 0.0_rk4
    1231             : #endif
    1232           0 :           t(2,2) = tau(2)
    1233             : 
    1234           0 :           actualk = 1
    1235             :         end if
    1236             :       else
    1237             : #ifdef DOUBLE_PRECISION_REAL
    1238           0 :         t(1,1) = 0.0_rk8
    1239           0 :         t(1,2) = 0.0_rk8
    1240             : #else
    1241           0 :         t(1,1) = 0.0_rk4
    1242           0 :         t(1,2) = 0.0_rk4
    1243             : #endif
    1244           0 :         t(2,2) = tau(2)
    1245             : 
    1246             :         ! no more vectors to create
    1247             : #ifdef DOUBLE_PRECISION_REAL
    1248           0 :         tau(1) = 0.0_rk8
    1249             : #else
    1250           0 :         tau(1) = 0.0_rk4
    1251             : #endif
    1252             : 
    1253           0 :         actualk = 2
    1254             : 
    1255             :         !print *,'rank2: no more data'
    1256             :       end if
    1257             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_&
    1258             :           &PRECISION&
    1259           0 :           &")
    1260             : 
    1261             :     end subroutine
    1262             : 
    1263             :     subroutine qr_pdlarfg2_1dcomm_seed_&
    1264           0 : &PRECISION &
    1265           0 : (obj,a,lda,work,lwork,seed,n,nb,idx,rev,mpicomm)
    1266             :       use precision
    1267             :       !use elpa1_impl ! check this
    1268             :       use qr_utils_mod
    1269             :       use elpa_abstract_impl
    1270             :       implicit none
    1271             : 
    1272             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    1273             :       ! input variables (local)
    1274             :       integer(kind=ik)        :: lda,lwork
    1275             :       real(kind=C_DATATYPE_KIND)           :: a(lda,*),work(*),seed(*)
    1276             : 
    1277             :       ! input variables (global)
    1278             :       integer(kind=ik)        :: n,nb,idx,rev,mpicomm
    1279             : 
    1280             :       ! output variables (global)
    1281             : 
    1282             :       ! external functions
    1283             : #ifdef DOUBLE_PRECISION_REAL
    1284             :       real(kind=C_DATATYPE_KIND), external :: ddot
    1285             : #else
    1286             :       real(kind=C_DATATYPE_KIND), external :: sdot
    1287             : #endif
    1288             :       ! local scalars
    1289             :       real(kind=C_DATATYPE_KIND)           :: top11,top21,top12,top22
    1290             :       real(kind=C_DATATYPE_KIND)           :: dot11,dot12,dot22
    1291             :       integer(kind=ik)        :: mpirank,mpiprocs,mpierr
    1292             :       integer(kind=ik)        :: mpirank_top11,mpirank_top21
    1293             :       integer(kind=ik)        :: top11_offset,top21_offset
    1294             :       integer(kind=ik)        :: baseoffset
    1295             :       integer(kind=ik)        :: local_offset1,local_size1
    1296             :       integer(kind=ik)        :: local_offset2,local_size2
    1297             : 
    1298             :       call obj%timer%start("qr_pdlarfg2_1dcomm_seed_&
    1299             :           &PRECISION&
    1300           0 :           &")
    1301             : 
    1302           0 :       if (lwork .eq. -1) then
    1303             : #ifdef DOUBLE_PRECISION_REAL
    1304           0 :         work(1) = 8.0_rk8
    1305             : #else
    1306           0 :         work(1) = 8.0_rk4
    1307             : #endif
    1308             : 
    1309             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_seed_&
    1310             :           &PRECISION&
    1311           0 :           &")
    1312           0 :         return
    1313             :       end if
    1314           0 :       call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    1315           0 :       call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
    1316             :       call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
    1317           0 :                                 local_size1,baseoffset,local_offset1)
    1318             : 
    1319             :       call local_size_offset_1d(n,nb,idx,idx-2,rev,mpirank,mpiprocs, &
    1320           0 :                                 local_size2,baseoffset,local_offset2)
    1321             : 
    1322           0 :       mpirank_top11 = MOD((idx-1)/nb,mpiprocs)
    1323           0 :       mpirank_top21 = MOD((idx-2)/nb,mpiprocs)
    1324             : 
    1325           0 :       top11_offset = local_index(idx,mpirank_top11,mpiprocs,nb,0)
    1326           0 :       top21_offset = local_index(idx-1,mpirank_top21,mpiprocs,nb,0)
    1327             : 
    1328           0 :       if (mpirank_top11 .eq. mpirank) then
    1329           0 :         top11 = a(top11_offset,2)
    1330           0 :         top12 = a(top11_offset,1)
    1331             :       else
    1332             : #ifdef DOUBLE_PRECISION_REAL
    1333           0 :         top11 = 0.0_rk8
    1334           0 :         top12 = 0.0_rk8
    1335             : #else
    1336           0 :         top11 = 0.0_rk4
    1337           0 :         top12 = 0.0_rk4
    1338             : #endif
    1339             :       end if
    1340             : 
    1341           0 :       if (mpirank_top21 .eq. mpirank) then
    1342           0 :         top21 = a(top21_offset,2)
    1343           0 :         top22 = a(top21_offset,1)
    1344             :       else
    1345             : #ifdef DOUBLE_PRECISION_REAL
    1346           0 :         top21 = 0.0_rk8
    1347           0 :         top22 = 0.0_rk8
    1348             : #else
    1349           0 :         top21 = 0.0_rk4
    1350           0 :         top22 = 0.0_rk4
    1351             : #endif
    1352             :       end if
    1353             : 
    1354             :       ! calculate 3 dot products
    1355             : #ifdef DOUBLE_PRECISION_REAL
    1356           0 :       dot11 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1)
    1357           0 :       dot12 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1)
    1358           0 :       dot22 = ddot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1)
    1359             : #else
    1360           0 :       dot11 = sdot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1)
    1361           0 :       dot12 = sdot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1)
    1362           0 :       dot22 = sdot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1)
    1363             : #endif
    1364             :       ! store results in work buffer
    1365           0 :       work(1) = top11
    1366           0 :       work(2) = dot11
    1367           0 :       work(3) = top12
    1368           0 :       work(4) = dot12
    1369           0 :       work(5) = top21
    1370           0 :       work(6) = top22
    1371           0 :       work(7) = dot22
    1372             : #ifdef DOUBLE_PRECISION_REAL
    1373           0 :       work(8) = 0.0_rk8! fill up buffer
    1374             : #else
    1375           0 :       work(8) = 0.0_rk4! fill up buffer
    1376             : #endif
    1377             :       ! exchange partial results
    1378             : #ifdef WITH_MPI
    1379             : 
    1380             : #ifdef DOUBLE_PRECISION_REAL
    1381             :       call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, &
    1382           0 :                          mpicomm, mpierr)
    1383             : #else
    1384             :       call mpi_allreduce(work, seed, 8, mpi_real4, mpi_sum, &
    1385           0 :                          mpicomm, mpierr)
    1386             : #endif
    1387             : 
    1388             : #else
    1389           0 :       seed(1:8) = work(1:8)
    1390             : #endif
    1391             : 
    1392             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_seed_&
    1393             :           &PRECISION&
    1394           0 :           &")
    1395             :     end subroutine
    1396             : 
    1397             :     logical function qr_pdlarfg2_1dcomm_check_&
    1398           0 : &PRECISION &
    1399             : (obj,seed,eps)
    1400             :       use precision
    1401             :       use elpa_abstract_impl
    1402             :       implicit none
    1403             : 
    1404             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    1405             : 
    1406             :       ! input variables
    1407             :       real(kind=C_DATATYPE_KIND)    ::  seed(*)
    1408             :       integer(kind=ik) :: eps
    1409             : 
    1410             :       ! local scalars
    1411             :       real(kind=C_DATATYPE_KIND)    :: epsd,first,second,first_second,estimate
    1412             :       logical          :: accurate
    1413             :       real(kind=C_DATATYPE_KIND)    :: dot11,dot12,dot22
    1414             :       real(kind=C_DATATYPE_KIND)    :: top11,top12,top21,top22
    1415             :       call obj%timer%start("qr_pdlarfg2_1dcomm_check_&
    1416             :           &PRECISION&
    1417           0 :           &")
    1418             : 
    1419           0 :       EPSD = EPS
    1420             : 
    1421           0 :       top11 = seed(1)
    1422           0 :       dot11 = seed(2)
    1423           0 :       top12 = seed(3)
    1424           0 :       dot12 = seed(4)
    1425             : 
    1426           0 :       top21 = seed(5)
    1427           0 :       top22 = seed(6)
    1428           0 :       dot22 = seed(7)
    1429             : 
    1430             :       ! reconstruct the whole inner products
    1431             :       ! (including squares of the top elements)
    1432           0 :       first = dot11 + top11*top11
    1433           0 :       second = dot22 + top22*top22 + top12*top12
    1434           0 :       first_second = dot12 + top11*top12
    1435             : 
    1436             :       ! zero Householder Vector (zero norm) case
    1437             : #ifdef DOUBLE_PRECISION_REAL
    1438           0 :       if (first*second .eq. 0.0_rk8) then
    1439             : #else
    1440           0 :       if (first*second .eq. 0.0_rk4) then
    1441             : #endif
    1442             :         qr_pdlarfg2_1dcomm_check_&
    1443             : &PRECISION &
    1444           0 :  = .false.
    1445             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_check_&
    1446             :           &PRECISION&
    1447           0 :           &")
    1448             : 
    1449           0 :         return
    1450             :       end if
    1451             : 
    1452           0 :       estimate = abs((first_second*first_second)/(first*second))
    1453             : 
    1454             :       !print *,'estimate:',estimate
    1455             : 
    1456             :       ! if accurate the following check holds
    1457             : #ifdef DOUBLE_PRECISION_REAL
    1458           0 :       accurate = (estimate .LE. (epsd/(1.0_rk8+epsd)))
    1459             : #else
    1460           0 :       accurate = (estimate .LE. (epsd/(1.0_rk4+epsd)))
    1461             : #endif
    1462             :       qr_pdlarfg2_1dcomm_check_&
    1463             : &PRECISION &
    1464           0 :  = accurate
    1465             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_check_&
    1466             :           &PRECISION&
    1467           0 :           &")
    1468             : 
    1469           0 :     end function
    1470             : 
    1471             :     ! id=0: first Vector
    1472             :     ! id=1: second Vector
    1473             :     subroutine qr_pdlarfg2_1dcomm_vector_&
    1474           0 : &PRECISION &
    1475             : (obj,x,incx,tau,seed,n,nb,idx,id,rev,mpicomm)
    1476             :       use precision
    1477             :       use elpa1_impl
    1478             :       use qr_utils_mod
    1479             :       use elpa_abstract_impl
    1480             :       implicit none
    1481             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    1482             :       ! input variables (local)
    1483             :       integer(kind=ik)        :: incx
    1484             :       real(kind=C_DATATYPE_KIND)           :: x(*),seed(*),tau
    1485             : 
    1486             :       ! input variables (global)
    1487             :       integer(kind=ik)        :: n,nb,idx,id,rev,mpicomm
    1488             : 
    1489             :       ! output variables (global)
    1490             : 
    1491             :       ! external functions
    1492             : #ifdef DOUBLE_PRECISION_REAL
    1493             :       real(kind=C_DATATYPE_KIND), external :: dlapy2
    1494             :       external                :: dscal
    1495             : #else
    1496             :       real(kind=rk4), external :: slapy2
    1497             :       external                :: sscal
    1498             : #endif
    1499             :       ! local scalars
    1500             :       integer(kind=ik)        :: mpirank,mpirank_top,mpiprocs,mpierr
    1501             :       real(kind=C_DATATYPE_KIND)           :: alpha,dot,beta,xnorm
    1502             :       integer(kind=ik)        :: local_size,baseoffset,local_offset,top,topidx
    1503             :       call obj%timer%start("qr_pdlarfg2_1dcomm_vector_&
    1504             :           &PRECISION&
    1505           0 :           &")
    1506             : 
    1507           0 :       call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    1508           0 :       call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
    1509             :       call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
    1510           0 :                                     local_size,baseoffset,local_offset)
    1511             : 
    1512           0 :       local_offset = local_offset * incx
    1513             : 
    1514             :       ! Processor id for global index of top element
    1515           0 :       mpirank_top = MOD((idx-1)/nb,mpiprocs)
    1516           0 :       if (mpirank .eq. mpirank_top) then
    1517           0 :         topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
    1518           0 :         top = 1+(topidx-1)*incx
    1519             :       else
    1520           0 :         top = -99
    1521           0 :         stop
    1522             :       end if
    1523             : 
    1524           0 :       alpha = seed(id*5+1)
    1525           0 :       dot = seed(id*5+2)
    1526             : 
    1527           0 :       xnorm = sqrt(dot)
    1528             : #ifdef DOUBLE_PRECISION_REAL
    1529           0 :       if (xnorm .eq. 0.0_rk8) then
    1530             :         ! H = I
    1531             : 
    1532           0 :         tau = 0.0_rk8
    1533             : #else
    1534           0 :       if (xnorm .eq. 0.0_rk4) then
    1535             :         ! H = I
    1536             : 
    1537           0 :         tau = 0.0_rk4
    1538             : #endif
    1539             :       else
    1540             :         ! General case
    1541             : #ifdef DOUBLE_PRECISION_REAL
    1542           0 :         beta = sign(dlapy2(alpha, xnorm), alpha)
    1543             : #else
    1544           0 :         beta = sign(slapy2(alpha, xnorm), alpha)
    1545             : #endif
    1546           0 :         tau = (beta+alpha) / beta
    1547             : 
    1548             :         !print *,'hg2',tau,xnorm,alpha
    1549             : #ifdef DOUBLE_PRECISION_REAL
    1550             :         call dscal(local_size, 1.0_rk8/(beta+alpha), &
    1551           0 :                    x(local_offset), incx)
    1552             : #else
    1553             :         call sscal(local_size, 1.0_rk4/(beta+alpha), &
    1554           0 :                    x(local_offset), incx)
    1555             : #endif
    1556             : 
    1557             :         ! TODO: reimplement norm rescale method of
    1558             :         ! original PDLARFG using mpi?
    1559             : 
    1560           0 :         if (mpirank .eq. mpirank_top) then
    1561           0 :           x(top) = -beta
    1562             :         end if
    1563             : 
    1564           0 :         seed(8) = beta
    1565             :       end if
    1566             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_vector_&
    1567             :           &PRECISION&
    1568           0 :           &")
    1569             : 
    1570           0 :     end subroutine
    1571             : 
    1572             :     subroutine qr_pdlarfg2_1dcomm_update_&
    1573           0 : &PRECISION &
    1574           0 : (obj,v,incv,baseidx,a,lda,seed,n,idx,nb,rev,mpicomm)
    1575             :       use precision
    1576             :       use elpa1_impl
    1577             :       use qr_utils_mod
    1578             :       use elpa_abstract_impl
    1579             :       implicit none
    1580             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    1581             :       ! input variables (local)
    1582             :       integer(kind=ik)   :: incv,lda
    1583             :       real(kind=C_DATATYPE_KIND)      :: v(*),a(lda,*),seed(*)
    1584             : 
    1585             :       ! input variables (global)
    1586             :       integer(kind=ik)   :: n,baseidx,idx,nb,rev,mpicomm
    1587             : 
    1588             :       ! output variables (global)
    1589             : 
    1590             :       ! external functions
    1591             :       external daxpy
    1592             : 
    1593             :       ! local scalars
    1594             :       integer(kind=ik)   :: mpirank,mpiprocs,mpierr
    1595             :       integer(kind=ik)   :: local_size,local_offset,baseoffset
    1596             :       real(kind=C_DATATYPE_KIND)      :: z,coeff,beta
    1597             :       real(kind=C_DATATYPE_KIND)      :: dot11,dot12,dot22
    1598             :       real(kind=C_DATATYPE_KIND)      :: top11,top12,top21,top22
    1599             :       call obj%timer%start("qr_pdlarfg2_1dcomm_update_&
    1600             :           &PRECISION&
    1601           0 :           &")
    1602             : 
    1603           0 :       call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    1604           0 :       call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
    1605             : 
    1606             :       ! seed should be updated by previous householder generation
    1607             :       ! Update inner product of this column and next column Vector
    1608           0 :       top11 = seed(1)
    1609           0 :       dot11 = seed(2)
    1610           0 :       top12 = seed(3)
    1611           0 :       dot12 = seed(4)
    1612             : 
    1613           0 :       top21 = seed(5)
    1614           0 :       top22 = seed(6)
    1615           0 :       dot22 = seed(7)
    1616           0 :       beta = seed(8)
    1617             : 
    1618             :       call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, &
    1619           0 :                                 local_size,baseoffset,local_offset)
    1620           0 :       baseoffset = baseoffset * incv
    1621             : 
    1622             :       ! zero Householder Vector (zero norm) case
    1623             : #ifdef DOUBLE_PRECISION_REAL
    1624           0 :       if (beta .eq. 0.0_rk8) then
    1625             : #else
    1626           0 :       if (beta .eq. 0.0_rk4) then
    1627             : #endif
    1628             : 
    1629             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_update_&
    1630             :           &PRECISION&
    1631           0 :           &")
    1632           0 :         return
    1633             :       end if
    1634           0 :       z = (dot12 + top11 * top12) / beta + top12
    1635             : 
    1636             :       !print *,'hg2 update:',baseidx,idx,mpirank,local_size
    1637             : #ifdef DOUBLE_PRECISION_REAL
    1638           0 :       call daxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1)
    1639             : #else
    1640           0 :       call saxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1)
    1641             : #endif
    1642             :       ! prepare a full dot22 for update
    1643           0 :       dot22 = dot22 + top22*top22
    1644             : 
    1645             :       ! calculate coefficient
    1646           0 :       COEFF = z / (top11 + beta)
    1647             : 
    1648             :       ! update inner product of next Vector
    1649           0 :       dot22 = dot22 - coeff * (2*dot12 - coeff*dot11)
    1650             : 
    1651             :       ! update dot12 value to represent update with first Vector
    1652             :       ! (needed for T matrix)
    1653           0 :       dot12 = dot12 - COEFF * dot11
    1654             : 
    1655             :       ! update top element of next Vector
    1656           0 :       top22 = top22 - coeff * top21
    1657           0 :       seed(6) = top22
    1658             : 
    1659             :       ! restore separated dot22 for Vector generation
    1660           0 :       seed(7) = dot22  - top22*top22
    1661             : 
    1662             :       !------------------------------------------------------
    1663             :       ! prepare elements for T matrix
    1664           0 :       seed(4) = dot12
    1665             : 
    1666             :       ! prepare dot matrix for fuse element of T matrix
    1667             :       ! replace top11 value with -beta1
    1668           0 :       seed(1) = beta
    1669             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_update_&
    1670             :           &PRECISION&
    1671           0 :           &")
    1672             : 
    1673             :     end subroutine
    1674             : 
    1675             :     ! run this function after second Vector
    1676             :     subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix_&
    1677           0 : &PRECISION &
    1678           0 : (obj,seed,tau,t,ldt)
    1679             :       use precision
    1680             :       use elpa_abstract_impl
    1681             :       implicit none
    1682             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    1683             : 
    1684             :       integer(kind=ik)  :: ldt
    1685             :       real(kind=C_DATATYPE_KIND)     :: seed(*),t(ldt,*),tau(*)
    1686             :       real(kind=C_DATATYPE_KIND)     :: dot12,beta1,top21,beta2
    1687             :       call obj%timer%start("qr_pdlarfg2_1dcomm_finalize_tmatrix_&
    1688             :           &PRECISION&
    1689           0 :           &")
    1690             : 
    1691           0 :       beta1 = seed(1)
    1692           0 :       dot12 = seed(4)
    1693           0 :       top21 = seed(5)
    1694           0 :       beta2 = seed(8)
    1695             : 
    1696             :       !print *,'beta1 beta2',beta1,beta2
    1697             : 
    1698           0 :       dot12 = dot12 / beta2 + top21
    1699           0 :       dot12 = -(dot12 / beta1)
    1700             : 
    1701           0 :       t(1,1) = tau(1)
    1702           0 :       t(1,2) = dot12
    1703           0 :       t(2,2) = tau(2)
    1704             :       call obj%timer%stop("qr_pdlarfg2_1dcomm_finalize_tmatrix_&
    1705             :           &PRECISION&
    1706           0 :           &")
    1707             : 
    1708           0 :     end subroutine
    1709             : 
    1710             :     subroutine qr_pdlarfgk_1dcomm_&
    1711           0 : &PRECISION &
    1712           0 : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,k,idx,mb,PQRPARAM,rev,mpicomm,actualk)
    1713             :       use precision
    1714             :       use elpa_abstract_impl
    1715             :       implicit none
    1716             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    1717             :       ! parameter setup
    1718             : 
    1719             :       ! input variables (local)
    1720             :       integer(kind=ik)    :: lda,lwork,ldv,ldt
    1721             :       real(kind=C_DATATYPE_KIND)       :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)
    1722             : 
    1723             :       ! input variables (global)
    1724             :       integer(kind=ik)    :: m,k,idx,baseidx,mb,rev,mpicomm
    1725             : #ifdef USE_ASSUMED_SIZE_QR
    1726             :       integer(kind=ik)    ::PQRPARAM(*)
    1727             : #else
    1728             :       integer(kind=ik)    :: PQRPARAM(:)
    1729             : #endif
    1730             :       ! output variables (global)
    1731             :       integer(kind=ik)    :: actualk
    1732             : 
    1733             :       ! local scalars
    1734             :       integer(kind=ik)    :: ivector
    1735             :       real(kind=C_DATATYPE_KIND)       :: pdlarfg_size(1),pdlarf_size(1)
    1736             :       real(kind=C_DATATYPE_KIND)       :: pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1)
    1737             :       real(kind=C_DATATYPE_KIND)       :: pdlarfgk_1dcomm_update_size(1)
    1738             :       integer(kind=ik)    :: seedC_size,seedC_offset
    1739             :       integer(kind=ik)    :: seedD_size,seedD_offset
    1740             :       integer(kind=ik)    :: work_offset
    1741             :       call obj%timer%start("qr_pdlarfgk_1dcomm_&
    1742             :           &PRECISION&
    1743           0 :           &")
    1744             : 
    1745           0 :       seedC_size = k*k
    1746           0 :       seedC_offset = 1
    1747           0 :       seedD_size = k*k
    1748           0 :       seedD_offset = seedC_offset + seedC_size
    1749           0 :       work_offset = seedD_offset + seedD_size
    1750             : 
    1751           0 :       if (lwork .eq. -1) then
    1752             :         call qr_pdlarfg_1dcomm_&
    1753             : &PRECISION &
    1754           0 : (obj, a,1,tau(1),pdlarfg_size(1),-1,m,baseidx,mb,PQRPARAM(4),rev,mpicomm)
    1755             : 
    1756             :         call qr_pdlarfl_1dcomm_&
    1757             : &PRECISION &
    1758           0 : (v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,k,baseidx,mb,rev,mpicomm)
    1759             :         call qr_pdlarfgk_1dcomm_seed_&
    1760             : &PRECISION &
    1761           0 : (obj,a,lda,baseidx,pdlarfgk_1dcomm_seed_size(1),-1,work,work,m,k,mb,mpicomm)
    1762             : #ifdef USE_ASSUMED_SIZE_QR
    1763             :         !call qr_pdlarfgk_1dcomm_check_&
    1764             : !&PRECISION &
    1765             : !(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk)
    1766             :         call qr_pdlarfgk_1dcomm_check_improved_&
    1767             : &PRECISION &
    1768             : (obj,work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk)
    1769             : #else
    1770             :         !call qr_pdlarfgk_1dcomm_check_&
    1771             : !&PRECISION &
    1772             : !(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk)
    1773             :         call qr_pdlarfgk_1dcomm_check_improved_&
    1774             : &PRECISION &
    1775           0 : (obj,work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk)
    1776             : #endif
    1777             :         call qr_pdlarfgk_1dcomm_update_&
    1778             : &PRECISION &
    1779             : (obj,a,lda,baseidx,pdlarfgk_1dcomm_update_size(1), &
    1780           0 :                                               -1,work,work,k,k,1,work,m,mb,rev,mpicomm)
    1781             :         work(1) = max(pdlarfg_size(1),pdlarf_size(1),pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1), &
    1782           0 :                         pdlarfgk_1dcomm_update_size(1)) + real(seedC_size + seedD_size, kind=C_DATATYPE_KIND)
    1783             : 
    1784             :       call obj%timer%stop("qr_pdlarfgk_1dcomm_&
    1785             :           &PRECISION&
    1786           0 :           &")
    1787             : 
    1788           0 :         return
    1789             :       end if
    1790             : 
    1791             :       call qr_pdlarfgk_1dcomm_seed_&
    1792             : &PRECISION &
    1793             : (obj,a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), &
    1794           0 :           work(seedD_offset),m,k,mb,mpicomm)
    1795             : #ifdef USE_ASSUMED_SIZE_QR
    1796             :       !call qr_pdlarfgk_1dcomm_check_&
    1797             : !&PRECISION &
    1798             : !(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk)
    1799             :       call qr_pdlarfgk_1dcomm_check_improved_&
    1800             : &PRECISION &
    1801             : (obj,work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk)
    1802             : 
    1803             : #else
    1804             :       !call qr_pdlarfgk_1dcomm_check_&
    1805             : !&PRECISION &
    1806             : !(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk)
    1807             :       call qr_pdlarfgk_1dcomm_check_improved_&
    1808             : &PRECISION &
    1809             : (obj,work(seedC_offset),work(seedD_offset), &
    1810           0 :           k,PQRPARAM(:),work(work_offset),lwork,actualk)
    1811             : #endif
    1812             :       !print *,'possible rank:', actualk
    1813             : 
    1814             :       ! override useful for debugging
    1815             :       !actualk = 1
    1816             :       !actualk = k
    1817             :       !actualk= min(actualk,2)
    1818           0 :       do ivector=1,actualk
    1819             :         call qr_pdlarfgk_1dcomm_vector_&
    1820             : &PRECISION &
    1821             : (obj,a(1,k-ivector+1),1,idx,tau(k-ivector+1), &
    1822             :                                           work(seedC_offset),work(seedD_offset),k, &
    1823           0 :                                           ivector,m,mb,rev,mpicomm)
    1824             : 
    1825             :         call qr_pdlarfgk_1dcomm_update_&
    1826             : &PRECISION &
    1827             : (obj,a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), &
    1828             :                                           work(seedD_offset),k,actualk,ivector,tau, &
    1829           0 :                                           m,mb,rev,mpicomm)
    1830             : 
    1831             :         call qr_pdlarfg_copy_1dcomm_&
    1832             : &PRECISION &
    1833             : (obj,a(1,k-ivector+1),1, &
    1834             :                                        v(1,k-ivector+1),1, &
    1835           0 :                                        m,baseidx,idx-ivector+1,mb,1,mpicomm)
    1836             :       end do
    1837             : 
    1838             :       ! generate final T matrix and convert preliminary tau values into real ones
    1839             :       call qr_pdlarfgk_1dcomm_generateT_&
    1840             : &PRECISION &
    1841           0 : (obj,work(seedC_offset),work(seedD_offset),k,actualk,tau,t,ldt)
    1842             : 
    1843             :       call obj%timer%stop("qr_pdlarfgk_1dcomm_&
    1844             :           &PRECISION&
    1845           0 :           &")
    1846             :     end subroutine
    1847             : 
    1848             :     subroutine qr_pdlarfgk_1dcomm_seed_&
    1849           0 : &PRECISION &
    1850           0 : (obj,a,lda,baseidx,work,lwork,seedC,seedD,m,k,mb,mpicomm)
    1851             :       use precision
    1852             :       use elpa1_impl
    1853             :       use qr_utils_mod
    1854             :       use elpa_abstract_impl
    1855             :       implicit none
    1856             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    1857             :       ! parameter setup
    1858             : 
    1859             :       ! input variables (local)
    1860             :       integer(kind=ik)   :: lda,lwork
    1861             :       real(kind=C_DATATYPE_KIND)      :: a(lda,*), work(*)
    1862             : 
    1863             :       ! input variables (global)
    1864             :       integer(kind=ik)   :: m,k,baseidx,mb,mpicomm
    1865             :       real(kind=C_DATATYPE_KIND)      :: seedC(k,*),seedD(k,*)
    1866             : 
    1867             :       ! output variables (global)
    1868             : 
    1869             :       ! derived input variables from QR_PQRPARAM
    1870             : 
    1871             :       ! local scalars
    1872             :       integer(kind=ik)   :: mpierr,mpirank,mpiprocs,mpirank_top
    1873             :       integer(kind=ik)   :: icol,irow,lidx,remsize
    1874             :       integer(kind=ik)   :: remaining_rank
    1875             : 
    1876             :       integer(kind=ik)   :: C_size,D_size,sendoffset,recvoffset,sendrecv_size
    1877             :       integer(kind=ik)   :: localoffset,localsize,baseoffset
    1878             :       call obj%timer%start("qr_pdlarfgk_1dcomm_seed_&
    1879             :           &PRECISION&
    1880           0 :           &")
    1881             : 
    1882           0 :       call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    1883           0 :       call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
    1884           0 :       C_size = k*k
    1885           0 :       D_size = k*k
    1886           0 :       sendoffset = 1
    1887           0 :       sendrecv_size = C_size+D_size
    1888           0 :       recvoffset = sendoffset + sendrecv_size
    1889             : 
    1890           0 :       if (lwork .eq. -1) then
    1891             : #ifdef DOUBLE_PRECISION_REAL
    1892           0 :         work(1) = real(2*sendrecv_size,kind=C_DATATYPE_KIND)
    1893             : #else
    1894           0 :         work(1) = real(2*sendrecv_size,kind=rk4)
    1895             : #endif
    1896             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_seed_&
    1897             :           &PRECISION&
    1898           0 :           &")
    1899             : 
    1900           0 :         return
    1901             :       end if
    1902             : 
    1903             :       ! clear buffer
    1904             : #ifdef DOUBLE_PRECISION_REAL
    1905           0 :       work(sendoffset:sendoffset+sendrecv_size-1)=0.0_rk8
    1906             : #else
    1907           0 :       work(sendoffset:sendoffset+sendrecv_size-1)=0.0_rk4
    1908             : #endif
    1909             :       ! collect C part
    1910           0 :       do icol=1,k
    1911             : 
    1912           0 :         remaining_rank = k
    1913           0 :         do while (remaining_rank .gt. 0)
    1914           0 :           irow = k - remaining_rank + 1
    1915           0 :           lidx = baseidx - remaining_rank + 1
    1916             : 
    1917             :           ! determine chunk where the current top element is located
    1918           0 :           mpirank_top = MOD((lidx-1)/mb,mpiprocs)
    1919             : 
    1920             :           ! limit max number of remaining elements of this chunk to the block
    1921             :           ! distribution parameter
    1922           0 :           remsize = min(remaining_rank,mb)
    1923             : 
    1924             :           ! determine the number of needed elements in this chunk
    1925             :           call local_size_offset_1d(lidx+remsize-1,mb, &
    1926             :                                     lidx,lidx,0, &
    1927             :                                     mpirank_top,mpiprocs, &
    1928           0 :                                     localsize,baseoffset,localoffset)
    1929             : 
    1930             :           !print *,'local rank',localsize,localoffset
    1931             : 
    1932           0 :           if (mpirank .eq. mpirank_top) then
    1933             :             ! copy elements to buffer
    1934             :             work(sendoffset+(icol-1)*k+irow-1:sendoffset+(icol-1)*k+irow-1+localsize-1) &
    1935           0 :                           = a(localoffset:localoffset+remsize-1,icol)
    1936             :           end if
    1937             : 
    1938             :           ! jump to next chunk
    1939           0 :           remaining_rank = remaining_rank - localsize
    1940             :         end do
    1941             :       end do
    1942             : 
    1943             :       ! collect D part
    1944             :       call local_size_offset_1d(m,mb,baseidx-k,baseidx-k,1, &
    1945             :                         mpirank,mpiprocs, &
    1946           0 :                         localsize,baseoffset,localoffset)
    1947             : 
    1948             :       !print *,'localsize',localsize,localoffset
    1949             : #ifdef DOUBLE_PRECISION_REAL
    1950           0 :       if (localsize > 0) then
    1951             :         call dsyrk("Upper", "Trans", k, localsize, &
    1952             :                      1.0_rk8, a(localoffset,1), lda, &
    1953           0 :                      0.0_rk8, work(sendoffset+C_size), k)
    1954             :       else
    1955           0 :         work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0_rk8
    1956             :       end if
    1957             : #else
    1958           0 :       if (localsize > 0) then
    1959             :         call ssyrk("Upper", "Trans", k, localsize, &
    1960             :                      1.0_rk4, a(localoffset,1), lda, &
    1961           0 :                      0.0_rk4, work(sendoffset+C_size), k)
    1962             :       else
    1963           0 :         work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0_rk4
    1964             :       end if
    1965             : #endif
    1966             : 
    1967             :       ! TODO: store symmetric part more efficiently
    1968             : 
    1969             :       ! allreduce operation on results
    1970             : #ifdef WITH_MPI
    1971             : 
    1972             : #ifdef DOUBLE_PRECISION_REAL
    1973             :       call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, &
    1974           0 :                          mpi_real8,mpi_sum,mpicomm,mpierr)
    1975             : #else
    1976             :       call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, &
    1977           0 :                          mpi_real4,mpi_sum,mpicomm,mpierr)
    1978             : #endif
    1979             : 
    1980             : #else
    1981           0 :       work(recvoffset:recvoffset+sendrecv_size-1) = work(sendoffset:sendoffset+sendrecv_size-1)
    1982             : #endif
    1983             :       ! unpack result from buffer into seedC and seedD
    1984             : #ifdef DOUBLE_PRECISION_REAL
    1985           0 :       seedC(1:k,1:k) = 0.0_rk8
    1986             : #else
    1987           0 :       seedC(1:k,1:k) = 0.0_rk4
    1988             : #endif
    1989           0 :       do icol=1,k
    1990           0 :         seedC(1:k,icol) = work(recvoffset+(icol-1)*k:recvoffset+icol*k-1)
    1991             :       end do
    1992             : #ifdef DOUBLE_PRECISION_REAL
    1993           0 :       seedD(1:k,1:k) = 0.0_rk8
    1994             : #else
    1995           0 :       seedD(1:k,1:k) = 0.0_rk4
    1996             : #endif
    1997           0 :       do icol=1,k
    1998           0 :         seedD(1:k,icol) = work(recvoffset+C_size+(icol-1)*k:recvoffset+C_size+icol*k-1)
    1999             :       end do
    2000             : 
    2001             :       call obj%timer%stop("qr_pdlarfgk_1dcomm_seed_&
    2002             :           &PRECISION&
    2003           0 :           &")
    2004             : 
    2005             :     end subroutine
    2006             : 
    2007             :     ! k is assumed to be larger than two
    2008             :     subroutine qr_pdlarfgk_1dcomm_check_improved_&
    2009           0 : &PRECISION &
    2010           0 : (obj,seedC,seedD,k,PQRPARAM,work,lwork,possiblerank)
    2011             :       use precision
    2012             :       use elpa_abstract_impl
    2013             :       implicit none
    2014             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2015             :       ! input variables (global)
    2016             :       integer(kind=ik)   :: k,lwork
    2017             : #ifdef USE_ASSUMED_SIZE_QR
    2018             :       integer(kind=ik)   :: PQRPARAM(*)
    2019             : 
    2020             : #else
    2021             :       integer(kind=ik)   :: PQRPARAM(:)
    2022             : #endif
    2023             :       real(kind=C_DATATYPE_KIND)      :: seedC(k,*),seedD(k,*),work(k,*)
    2024             : 
    2025             :       ! output variables (global)
    2026             :       integer(kind=ik)   :: possiblerank
    2027             : 
    2028             :       ! derived input variables from QR_PQRPARAM
    2029             :       integer(kind=ik)   :: eps
    2030             : 
    2031             :       ! local variables
    2032             :       integer(kind=ik)   :: i,j,l
    2033             :       real(kind=C_DATATYPE_KIND)      :: sum_squares,diagonal_square,epsd,diagonal_root
    2034             :       real(kind=C_DATATYPE_KIND)      :: dreverse_matrix_work(1)
    2035             : 
    2036             :       ! external functions
    2037             : #ifdef DOUBLE_PRECISION_REAL
    2038             :       real(kind=C_DATATYPE_KIND), external :: ddot,dlapy2,dnrm2
    2039             :       external                :: dscal
    2040             : #else
    2041             :       real(kind=rk4), external :: sdot,slapy2,snrm2
    2042             :       external                :: sscal
    2043             : #endif
    2044             : 
    2045             :       call obj%timer%start("qr_pdlarfgk_1dcomm_check_improved_&
    2046             :           &PRECISION&
    2047           0 :           &")
    2048             : 
    2049           0 :       if (lwork .eq. -1) then
    2050             :         call reverse_matrix_local_&
    2051             : &PRECISION &
    2052           0 : (1,k,k,work,k,dreverse_matrix_work,-1)
    2053             : #ifdef DOUBLE_PRECISION_REAL
    2054           0 :         work(1,1) = real(k*k,kind=C_DATATYPE_KIND) + dreverse_matrix_work(1)
    2055             : #else
    2056           0 :         work(1,1) = real(k*k,kind=rk4) + dreverse_matrix_work(1)
    2057             : #endif
    2058             : 
    2059             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
    2060             :             &PRECISION&
    2061           0 :             &")
    2062           0 :         return
    2063             :       end if
    2064             : 
    2065           0 :       eps = PQRPARAM(3)
    2066             : 
    2067           0 :       if (eps .eq. 0) then
    2068           0 :         possiblerank = k
    2069             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
    2070             :             &PRECISION&
    2071           0 :             &")
    2072           0 :         return
    2073             :       end if
    2074             : #ifdef DOUBLE_PRECISION_REAL
    2075           0 :       epsd = real(eps,kind=C_DATATYPE_KIND)
    2076             : #else
    2077           0 :       epsd = real(eps,kind=rk4)
    2078             : #endif
    2079             : 
    2080             :       ! build complete inner product from seedC and seedD
    2081             :       ! copy seedD to work
    2082           0 :       work(:,1:k) = seedD(:,1:k)
    2083             : 
    2084             :       ! add inner products of seedC to work
    2085             : #ifdef DOUBLE_PRECISION_REAL
    2086             :       call dsyrk("Upper", "Trans", k, k, &
    2087             :                  1.0_rk8, seedC(1,1), k, &
    2088           0 :                  1.0_rk8, work, k)
    2089             : #else
    2090             :       call ssyrk("Upper", "Trans", k, k, &
    2091             :                  1.0_rk4, seedC(1,1), k, &
    2092           0 :                  1.0_rk4, work, k)
    2093             : 
    2094             : #endif
    2095             : 
    2096             :       ! TODO: optimize this part!
    2097             :       call reverse_matrix_local_&
    2098             : &PRECISION &
    2099           0 : (0,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
    2100             :       call reverse_matrix_local_&
    2101             : &PRECISION &
    2102           0 : (1,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
    2103             : 
    2104             :       ! transpose matrix
    2105           0 :       do i=1,k
    2106           0 :         do j=i+1,k
    2107           0 :           work(i,j) = work(j,i)
    2108             :         end do
    2109             :       end do
    2110             : 
    2111             : 
    2112             :       ! do cholesky decomposition
    2113           0 :       i = 0
    2114           0 :       do while ((i .lt. k))
    2115           0 :         i = i + 1
    2116             : 
    2117           0 :         diagonal_square = abs(work(i,i))
    2118           0 :         diagonal_root  = sqrt(diagonal_square)
    2119             : 
    2120             :         ! zero Householder Vector (zero norm) case
    2121             : #ifdef DOUBLE_PRECISION_REAL
    2122           0 :         if ((abs(diagonal_square) .eq. 0.0_rk8) .or. (abs(diagonal_root) .eq. 0.0_rk8)) then
    2123             : #else
    2124           0 :         if ((abs(diagonal_square) .eq. 0.0_rk4) .or. (abs(diagonal_root) .eq. 0.0_rk4)) then
    2125             : #endif
    2126           0 :           possiblerank = max(i-1,1)
    2127             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
    2128             :             &PRECISION&
    2129           0 :             &")
    2130           0 :           return
    2131             :         end if
    2132             : 
    2133             :         ! check if relative error is bounded for each Householder Vector
    2134             :         ! Householder i is stable iff Househoulder i-1 is "stable" and the accuracy criterion
    2135             :         ! holds.
    2136             :         ! first Householder Vector is considered as "stable".
    2137             : 
    2138           0 :         do j=i+1,k
    2139           0 :           work(i,j) = work(i,j) / diagonal_root
    2140           0 :           do l=i+1,j
    2141           0 :             work(l,j) = work(l,j) - work(i,j) * work(i,l)
    2142             :           end do
    2143             :         end do
    2144             :         !print *,'cholesky step done'
    2145             : 
    2146             :         ! build sum of squares
    2147             : #ifdef DOUBLE_PRECISION_REAL
    2148           0 :         if (i .eq. 1) then
    2149           0 :           sum_squares = 0.0_rk8
    2150             :         else
    2151           0 :           sum_squares = ddot(i-1,work(1,i),1,work(1,i),1)
    2152             :         end if
    2153             : #else
    2154           0 :         if (i .eq. 1) then
    2155           0 :           sum_squares = 0.0_rk4
    2156             :         else
    2157           0 :           sum_squares = sdot(i-1,work(1,i),1,work(1,i),1)
    2158             :         end if
    2159             : #endif
    2160           0 :         if (sum_squares .ge. (epsd * diagonal_square)) then
    2161           0 :           possiblerank = max(i-1,1)
    2162             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
    2163             :             &PRECISION&
    2164           0 :             &")
    2165           0 :           return
    2166             :         end if
    2167             :       end do
    2168             : 
    2169           0 :       possiblerank = i
    2170             :       !print *,'possible rank', possiblerank
    2171             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
    2172             :             &PRECISION&
    2173           0 :             &")
    2174             : 
    2175             :     end subroutine
    2176             : 
    2177             :     ! TODO: zero Householder Vector (zero norm) case
    2178             :     ! - check alpha values as well (from seedC)
    2179             :     subroutine qr_pdlarfgk_1dcomm_check_&
    2180           0 : &PRECISION &
    2181           0 : (obj,seedC,seedD,k,PQRPARAM,work,lwork,possiblerank)
    2182             :       use precision
    2183             :       use qr_utils_mod
    2184             :       use elpa_abstract_impl
    2185             :       implicit none
    2186             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2187             :       ! parameter setup
    2188             : 
    2189             :       ! input variables (local)
    2190             : 
    2191             :       ! input variables (global)
    2192             :       integer(kind=ik)   :: k,lwork
    2193             : #ifdef USE_ASSUMED_SIZE_QR
    2194             :       integer(kind=ik)   :: PQRPARAM(*)
    2195             : #else
    2196             :       integer(kind=ik)   :: PQRPARAM(:)
    2197             : #endif
    2198             :       real(kind=C_DATATYPE_KIND)      :: seedC(k,*),seedD(k,*),work(k,*)
    2199             : 
    2200             :       ! output variables (global)
    2201             :       integer(kind=ik)   :: possiblerank
    2202             : 
    2203             :       ! derived input variables from QR_PQRPARAM
    2204             :       integer(kind=ik)   :: eps
    2205             : 
    2206             :       ! local scalars
    2207             :       integer(kind=ik)   :: icol,isqr,iprod
    2208             :       real(kind=C_DATATYPE_KIND)      :: epsd,sum_sqr,sum_products,diff,temp,ortho,ortho_sum
    2209             :       real(kind=C_DATATYPE_KIND)      :: dreverse_matrix_work(1)
    2210             :         call obj%timer%start("qr_pdlarfgk_1dcomm_check_&
    2211             :             &PRECISION&
    2212           0 :             &")
    2213           0 :       if (lwork .eq. -1) then
    2214             :         call reverse_matrix_local_&
    2215             : &PRECISION &
    2216           0 : (1,k,k,work,k,dreverse_matrix_work,-1)
    2217             : #ifdef DOUBLE_PRECISION_REAL
    2218           0 :         work(1,1) = real(k*k,kind=C_DATATYPE_KIND) + dreverse_matrix_work(1)
    2219             : #else
    2220           0 :         work(1,1) = real(k*k,kind=rk4) + dreverse_matrix_work(1)
    2221             : #endif
    2222             : 
    2223             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
    2224             :             &PRECISION&
    2225           0 :             &")
    2226             : 
    2227           0 :         return
    2228             :       end if
    2229             : 
    2230           0 :       eps = PQRPARAM(3)
    2231             : 
    2232           0 :       if (eps .eq. 0) then
    2233           0 :         possiblerank = k
    2234             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
    2235             :             &PRECISION&
    2236           0 :             &")
    2237           0 :         return
    2238             :       end if
    2239             : #ifdef DOUBLE_PRECISION_REAL
    2240           0 :       epsd = real(eps,kind=C_DATATYPE_KIND)
    2241             : #else
    2242           0 :       epsd = real(eps,kind=rk4)
    2243             : #endif
    2244             : 
    2245             :       ! copy seedD to work
    2246           0 :       work(:,1:k) = seedD(:,1:k)
    2247             : 
    2248             :       ! add inner products of seedC to work
    2249             : #ifdef DOUBLE_PRECISION_REAL
    2250             :       call dsyrk("Upper", "Trans", k, k, &
    2251             :                  1.0_rk8, seedC(1,1), k, &
    2252           0 :                  1.0_rk8, work, k)
    2253             : #else
    2254             :       call ssyrk("Upper", "Trans", k, k, &
    2255             :                  1.0_rk4, seedC(1,1), k, &
    2256           0 :                  1.0_rk4, work, k)
    2257             : #endif
    2258             : 
    2259             :       ! TODO: optimize this part!
    2260             :       call reverse_matrix_local_&
    2261             : &PRECISION &
    2262           0 : (0,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
    2263             :       call reverse_matrix_local_&
    2264             : &PRECISION &
    2265           0 : (1,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
    2266             : 
    2267             :       ! transpose matrix
    2268           0 :       do icol=1,k
    2269           0 :         do isqr=icol+1,k
    2270           0 :           work(icol,isqr) = work(isqr,icol)
    2271             :         end do
    2272             :       end do
    2273             : 
    2274             :       ! work contains now the full inner product of the global (sub-)matrix
    2275           0 :       do icol=1,k
    2276             :         ! zero Householder Vector (zero norm) case
    2277             : #ifdef DOUBLE_PRECISION_REAL
    2278           0 :         if (abs(work(icol,icol)) .eq. 0.0_rk8) then
    2279             : #else
    2280           0 :         if (abs(work(icol,icol)) .eq. 0.0_rk4) then
    2281             : #endif
    2282             :           !print *,'too small ', icol, work(icol,icol)
    2283           0 :           possiblerank = max(icol,1)
    2284             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
    2285             :             &PRECISION&
    2286           0 :             &")
    2287           0 :           return
    2288             :         end if
    2289             : 
    2290             : #ifdef DOUBLE_PRECISION_REAL
    2291           0 :         sum_sqr = 0.0_rk8
    2292           0 :         do isqr=1,icol-1
    2293           0 :           sum_products = 0.0_rk8
    2294             : #else
    2295           0 :         sum_sqr = 0.0_rk4
    2296           0 :         do isqr=1,icol-1
    2297           0 :           sum_products = 0.0_rk4
    2298             : #endif
    2299           0 :           do iprod=1,isqr-1
    2300           0 :             sum_products = sum_products + work(iprod,isqr)*work(iprod,icol)
    2301             :           end do
    2302             : 
    2303             :           !print *,'divisor',icol,isqr,work(isqr,isqr)
    2304           0 :           temp = (work(isqr,icol) - sum_products)/work(isqr,isqr)
    2305           0 :           work(isqr,icol) = temp
    2306           0 :           sum_sqr = sum_sqr + temp*temp
    2307             :         end do
    2308             : 
    2309             :         ! calculate diagonal value
    2310           0 :         diff = work(icol,icol) - sum_sqr
    2311             : #ifdef DOUBLE_PRECISION_REAL
    2312           0 :         if (diff .lt. 0.0_rk8) then
    2313             : #else
    2314           0 :         if (diff .lt. 0.0_rk4) then
    2315             : #endif
    2316             :           ! we definitely have a problem now
    2317           0 :           possiblerank = icol-1 ! only decompose to previous column (including)
    2318             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
    2319             :             &PRECISION&
    2320           0 :             &")
    2321           0 :           return
    2322             :         end if
    2323           0 :         work(icol,icol) = sqrt(diff)
    2324             :         ! calculate orthogonality
    2325             : #ifdef DOUBLE_PRECISION_REAL
    2326           0 :         ortho = 0.0_rk8
    2327           0 :         do isqr=1,icol-1
    2328           0 :           ortho_sum = 0.0_rk8
    2329             : #else
    2330           0 :         ortho = 0.0_rk4
    2331           0 :         do isqr=1,icol-1
    2332           0 :           ortho_sum = 0.0_rk4
    2333             : #endif
    2334           0 :           do iprod=isqr,icol-1
    2335           0 :             temp = work(isqr,iprod)*work(isqr,iprod)
    2336             :             !print *,'ortho ', work(iprod,iprod)
    2337           0 :             temp = temp / (work(iprod,iprod)*work(iprod,iprod))
    2338           0 :             ortho_sum = ortho_sum + temp
    2339             :           end do
    2340           0 :           ortho = ortho + ortho_sum * (work(isqr,icol)*work(isqr,icol))
    2341             :         end do
    2342             : 
    2343             :         ! ---------------- with division by zero ----------------------- !
    2344             : 
    2345             :         !ortho = ortho / diff;
    2346             : 
    2347             :         ! if current estimate is not accurate enough, the following check holds
    2348             :         !if (ortho .gt. epsd) then
    2349             :         !    possiblerank = icol-1 ! only decompose to previous column (including)
    2350             :         !    return
    2351             :         !end if
    2352             : 
    2353             :         ! ---------------- without division by zero ----------------------- !
    2354             : 
    2355             :         ! if current estimate is not accurate enough, the following check holds
    2356           0 :         if (ortho .gt. epsd * diff) then
    2357           0 :           possiblerank = icol-1 ! only decompose to previous column (including)
    2358             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
    2359             :             &PRECISION&
    2360           0 :             &")
    2361           0 :           return
    2362             :         end if
    2363             :       end do
    2364             : 
    2365             :       ! if we get to this point, the accuracy condition holds for the whole block
    2366           0 :       possiblerank = k
    2367             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
    2368             :             &PRECISION&
    2369           0 :             &")
    2370             :       end subroutine
    2371             : 
    2372             :     !sidx: seed idx
    2373             :     !k: max rank used during seed phase
    2374             :     !rank: actual rank (k >= rank)
    2375             :     subroutine qr_pdlarfgk_1dcomm_vector_&
    2376           0 : &PRECISION &
    2377           0 : (obj,x,incx,baseidx,tau,seedC,seedD,k,sidx,n,nb,rev,mpicomm)
    2378             :       use precision
    2379             :       use elpa1_impl
    2380             :       use qr_utils_mod
    2381             :       use elpa_abstract_impl
    2382             :       implicit none
    2383             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2384             :       ! input variables (local)
    2385             :       integer(kind=ik)  :: incx
    2386             :       real(kind=C_DATATYPE_KIND)     :: x(*),tau
    2387             : 
    2388             :       ! input variables (global)
    2389             :       integer(kind=ik)  :: n,nb,baseidx,rev,mpicomm,k,sidx
    2390             :       real(kind=C_DATATYPE_KIND)     :: seedC(k,*),seedD(k,*)
    2391             : 
    2392             :       ! output variables (global)
    2393             : 
    2394             :       ! external functions
    2395             : #ifdef DOUBLE_PRECISION_REAL
    2396             :       real(kind=C_DATATYPE_KIND), external :: dlapy2,dnrm2
    2397             :       external                :: dscal
    2398             : #else
    2399             :       real(kind=rk4), external :: slapy2,snrm2
    2400             :       external                :: sscal
    2401             : #endif
    2402             : 
    2403             :       ! local scalars
    2404             :       integer(kind=ik)   :: mpirank,mpirank_top,mpiprocs,mpierr
    2405             :       real(kind=C_DATATYPE_KIND)      :: alpha,dot,beta,xnorm
    2406             :       integer(kind=ik)   :: local_size,baseoffset,local_offset,top,topidx
    2407             :       integer(kind=ik)   :: lidx
    2408             :         call obj%timer%start("qr_pdlarfgk_1dcomm_vector_&
    2409             :             &PRECISION&
    2410           0 :             &")
    2411           0 :       call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    2412           0 :       call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
    2413           0 :       lidx = baseidx-sidx+1
    2414             :       call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, &
    2415           0 :                         local_size,baseoffset,local_offset)
    2416             : 
    2417           0 :       local_offset = local_offset * incx
    2418             : 
    2419             :       ! Processor id for global index of top element
    2420           0 :       mpirank_top = MOD((lidx-1)/nb,mpiprocs)
    2421           0 :       if (mpirank .eq. mpirank_top) then
    2422           0 :         topidx = local_index((lidx),mpirank_top,mpiprocs,nb,0)
    2423           0 :         top = 1+(topidx-1)*incx
    2424             :       end if
    2425             : 
    2426           0 :       alpha = seedC(k-sidx+1,k-sidx+1)
    2427           0 :       dot = seedD(k-sidx+1,k-sidx+1)
    2428             :       ! assemble actual norm from both seed parts
    2429             : #ifdef DOUBLE_PRECISION_REAL
    2430           0 :       xnorm = dlapy2(sqrt(dot), dnrm2(k-sidx,seedC(1,k-sidx+1),1))
    2431             : 
    2432           0 :       if (xnorm .eq. 0.0_rk8) then
    2433           0 :         tau = 0.0_rk8
    2434             :       else
    2435             : 
    2436             :         ! General case
    2437             : 
    2438           0 :         beta = sign(dlapy2(alpha, xnorm), alpha)
    2439             :         ! store a preliminary version of beta in tau
    2440           0 :         tau = beta
    2441             : 
    2442             :         ! update global part
    2443             :         call dscal(local_size, 1.0_rk8/(beta+alpha), &
    2444           0 :                      x(local_offset), incx)
    2445             : #else
    2446           0 :       xnorm = slapy2(sqrt(dot), snrm2(k-sidx,seedC(1,k-sidx+1),1))
    2447             : 
    2448           0 :       if (xnorm .eq. 0.0_rk4) then
    2449           0 :         tau = 0.0_rk4
    2450             :       else
    2451             : 
    2452             :         ! General case
    2453             : 
    2454           0 :         beta = sign(slapy2(alpha, xnorm), alpha)
    2455             :         ! store a preliminary version of beta in tau
    2456           0 :         tau = beta
    2457             : 
    2458             :         ! update global part
    2459             :         call sscal(local_size, 1.0_rk4/(beta+alpha), &
    2460           0 :                      x(local_offset), incx)
    2461             : 
    2462             : #endif
    2463             :         ! do not update local part here due to
    2464             :         ! dependency of c Vector during update process
    2465             : 
    2466             :         ! TODO: reimplement norm rescale method of
    2467             :         ! original PDLARFG using mpi?
    2468             : 
    2469           0 :         if (mpirank .eq. mpirank_top) then
    2470           0 :           x(top) = -beta
    2471             :         end if
    2472             :       end if
    2473             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_vector_&
    2474             :             &PRECISION&
    2475           0 :             &")
    2476             : 
    2477           0 :     end subroutine
    2478             : 
    2479             :     !k: original max rank used during seed function
    2480             :     !rank: possible rank as from check function
    2481             :     ! TODO: if rank is less than k, reduce buffersize in such a way
    2482             :     ! that only the required entries for the next pdlarfg steps are
    2483             :     ! computed
    2484             :     subroutine qr_pdlarfgk_1dcomm_update_&
    2485           0 : &PRECISION &
    2486           0 : (obj,a,lda,baseidx,work,lwork,seedC,seedD,k,rank,sidx,tau,n,nb,rev,mpicomm)
    2487             :       use precision
    2488             :       use elpa1_impl
    2489             :       use qr_utils_mod
    2490             :       use elpa_abstract_impl
    2491             :       implicit none
    2492             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2493             :       ! parameter setup
    2494             :       INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4
    2495             : 
    2496             :       ! input variables (local)
    2497             :       integer(kind=ik)            :: lda,lwork
    2498             :       real(kind=C_DATATYPE_KIND)               :: a(lda,*),work(*)
    2499             : 
    2500             :       ! input variables (global)
    2501             :       integer(kind=ik)            :: k,rank,sidx,n,baseidx,nb,rev,mpicomm
    2502             :       real(kind=C_DATATYPE_KIND)               :: beta
    2503             : 
    2504             :       ! output variables (global)
    2505             :       real(kind=C_DATATYPE_KIND)               :: seedC(k,*),seedD(k,*),tau(*)
    2506             : 
    2507             :       ! derived input variables from QR_PQRPARAM
    2508             : 
    2509             :       ! local scalars
    2510             :       real(kind=C_DATATYPE_KIND)               :: alpha
    2511             :       integer(kind=ik)            :: coffset,zoffset,yoffset,voffset,buffersize
    2512             :       integer(kind=ik)            :: mpirank,mpierr,mpiprocs,mpirank_top
    2513             :       integer(kind=ik)            :: localsize,baseoffset,localoffset,topidx
    2514             :       integer(kind=ik)            :: lidx
    2515             :         call obj%timer%start("qr_pdlarfgk_1dcomm_update_&
    2516             :             &PRECISION&
    2517           0 :             &")
    2518           0 :       if (lwork .eq. -1) then
    2519             :         ! buffer for c,z,y,v
    2520           0 :         work(1) = 4*k
    2521             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
    2522             :             &PRECISION&
    2523           0 :             &")
    2524             : 
    2525           0 :         return
    2526             :       end if
    2527             : 
    2528             :       ! nothing to update anymore
    2529           0 :       if (sidx .gt. rank) then
    2530             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
    2531             :             &PRECISION&
    2532           0 :             &")
    2533           0 :         return
    2534             :       endif
    2535           0 :       call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    2536           0 :       call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
    2537           0 :       lidx = baseidx-sidx
    2538           0 :       if (lidx .lt. 1) then
    2539             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
    2540             :             &PRECISION&
    2541           0 :             &")
    2542           0 :         return
    2543             :       endif
    2544             : 
    2545             :       call local_size_offset_1d(n,nb,baseidx,lidx,rev,mpirank,mpiprocs, &
    2546           0 :                                 localsize,baseoffset,localoffset)
    2547             : 
    2548           0 :       coffset = 1
    2549           0 :       zoffset = coffset + k
    2550           0 :       yoffset = zoffset + k
    2551           0 :       voffset = yoffset + k
    2552           0 :       buffersize = k - sidx
    2553             : 
    2554             :       ! finalize tau values
    2555           0 :       alpha = seedC(k-sidx+1,k-sidx+1)
    2556           0 :       beta = tau(k-sidx+1)
    2557             : 
    2558             :       ! zero Householder Vector (zero norm) case
    2559             :       !print *,'k update: alpha,beta',alpha,beta
    2560             : #ifdef DOUBLE_PRECISION_REAL
    2561           0 :       if ((beta .eq. 0.0_rk8) .or. (alpha .eq. 0.0_rk8))  then
    2562           0 :         tau(k-sidx+1) = 0.0_rk8
    2563           0 :         seedC(k,k-sidx+1) = 0.0_rk8
    2564             : #else
    2565           0 :       if ((beta .eq. 0.0_rk4) .or. (alpha .eq. 0.0_rk4))  then
    2566           0 :         tau(k-sidx+1) = 0.0_rk4
    2567           0 :         seedC(k,k-sidx+1) = 0.0_rk4
    2568             : #endif
    2569             : 
    2570             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
    2571             :             &PRECISION&
    2572           0 :             &")
    2573           0 :         return
    2574             :       end if
    2575             : 
    2576           0 :       tau(k-sidx+1) = (beta+alpha) / beta
    2577             : 
    2578             :       ! ---------------------------------------
    2579             :       ! calculate c Vector (extra Vector or encode in seedC/seedD?
    2580           0 :       work(coffset:coffset+buffersize-1) = seedD(1:buffersize,k-sidx+1)
    2581             : #ifdef DOUBLE_PRECISION_REAL
    2582             :       call dgemv("Trans", buffersize+1, buffersize, &
    2583             :                  1.0_rk8,seedC(1,1),k,seedC(1,k-sidx+1),1, &
    2584           0 :                  1.0_rk8,work(coffset),1)
    2585             : 
    2586             :       ! calculate z using tau,seedD,seedC and c Vector
    2587           0 :       work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize)
    2588           0 :       call daxpy(buffersize, 1.0_rk8/beta, work(coffset), 1, work(zoffset), 1)
    2589             : 
    2590             :       ! update A1(local copy) and generate part of householder vectors for use
    2591           0 :       call daxpy(buffersize, -1.0_rk8, work(zoffset),1,seedC(k-sidx+1,1),k)
    2592           0 :       call dscal(buffersize, 1.0_rk8/(alpha+beta), seedC(1,k-sidx+1),1)
    2593           0 :       call dger(buffersize, buffersize, -1.0_rk8, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k)
    2594             : 
    2595             :       ! update A global (householder Vector already generated by pdlarfgk)
    2596           0 :       mpirank_top = MOD(lidx/nb,mpiprocs)
    2597           0 :       if (mpirank .eq. mpirank_top) then
    2598             :         ! handle first row separately
    2599           0 :         topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0)
    2600           0 :         call daxpy(buffersize,-1.0_rk8,work(zoffset),1,a(topidx,1),lda)
    2601             :       end if
    2602             : 
    2603             :       call dger(localsize, buffersize,-1.0_rk8, &
    2604             :                 a(localoffset,k-sidx+1),1,work(zoffset),1, &
    2605           0 :                 a(localoffset,1),lda)
    2606             : 
    2607             :       ! update D (symmetric) => two buffer vectors of size rank
    2608             :       ! generate y Vector
    2609           0 :       work(yoffset:yoffset+buffersize-1) = 0._rk8
    2610           0 :       call daxpy(buffersize,1.0_rk8/(alpha+beta),work(zoffset),1,work(yoffset),1)
    2611             : 
    2612             :       ! generate v Vector
    2613           0 :       work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1)
    2614           0 :       call daxpy(buffersize, -0.5_rk8*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1)
    2615             : 
    2616             :       ! symmetric update of D using y and v
    2617             :       call dsyr2("Upper", buffersize,-1.0_rk8, &
    2618             :                      work(yoffset),1,work(voffset),1, &
    2619           0 :                      seedD(1,1), k)
    2620             : 
    2621             :       ! prepare T matrix inner products
    2622             :       ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y'
    2623             :       ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements
    2624           0 :       call dger(k-sidx,sidx,-1.0_rk8,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k)
    2625           0 :       seedC(k,k-sidx+1) = 1.0_rk8/(alpha+beta)
    2626             : #else /* DOUBLE_PRECISION_REAL */
    2627             :       call sgemv("Trans", buffersize+1, buffersize, &
    2628             :                  1.0_rk4,seedC(1,1),k,seedC(1,k-sidx+1),1, &
    2629           0 :                  1.0_rk4,work(coffset),1)
    2630             : 
    2631             :       ! calculate z using tau,seedD,seedC and c Vector
    2632           0 :       work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize)
    2633           0 :       call saxpy(buffersize, 1.0_rk4/beta, work(coffset), 1, work(zoffset), 1)
    2634             : 
    2635             :       ! update A1(local copy) and generate part of householder vectors for use
    2636           0 :       call saxpy(buffersize, -1.0_rk4, work(zoffset),1,seedC(k-sidx+1,1),k)
    2637           0 :       call sscal(buffersize, 1.0_rk4/(alpha+beta), seedC(1,k-sidx+1),1)
    2638           0 :       call sger(buffersize, buffersize, -1.0_rk4, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k)
    2639             : 
    2640             :       ! update A global (householder Vector already generated by pdlarfgk)
    2641           0 :       mpirank_top = MOD(lidx/nb,mpiprocs)
    2642           0 :       if (mpirank .eq. mpirank_top) then
    2643             :         ! handle first row separately
    2644           0 :         topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0)
    2645           0 :         call saxpy(buffersize,-1.0_rk4,work(zoffset),1,a(topidx,1),lda)
    2646             :       end if
    2647             : 
    2648             :       call sger(localsize, buffersize,-1.0_rk4, &
    2649             :                 a(localoffset,k-sidx+1),1,work(zoffset),1, &
    2650           0 :                 a(localoffset,1),lda)
    2651             : 
    2652             :       ! update D (symmetric) => two buffer vectors of size rank
    2653             :       ! generate y Vector
    2654           0 :       work(yoffset:yoffset+buffersize-1) = 0._rk4
    2655           0 :       call saxpy(buffersize,1.0_rk4/(alpha+beta),work(zoffset),1,work(yoffset),1)
    2656             : 
    2657             :       ! generate v Vector
    2658           0 :       work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1)
    2659           0 :       call saxpy(buffersize, -0.5_rk4*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1)
    2660             : 
    2661             :       ! symmetric update of D using y and v
    2662             :       call ssyr2("Upper", buffersize,-1.0_rk4, &
    2663             :                      work(yoffset),1,work(voffset),1, &
    2664           0 :                      seedD(1,1), k)
    2665             : 
    2666             :       ! prepare T matrix inner products
    2667             :       ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y'
    2668             :       ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements
    2669           0 :       call sger(k-sidx,sidx,-1.0_rk4,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k)
    2670           0 :       seedC(k,k-sidx+1) = 1.0_rk4/(alpha+beta)
    2671             : #endif /* DOUBLE_PRECISION_REAL */
    2672             : 
    2673             :         call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
    2674             :             &PRECISION&
    2675           0 :             &")
    2676             :     end subroutine
    2677             : 
    2678             :     subroutine qr_pdlarfgk_1dcomm_generateT_&
    2679           0 :           &PRECISION &
    2680           0 :           (obj,seedC,seedD,k,actualk,tau,t,ldt)
    2681             :       use precision
    2682             :       use elpa_abstract_impl
    2683             :       implicit none
    2684             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2685             :       integer(kind=ik)  :: k,actualk,ldt
    2686             :       real(kind=C_DATATYPE_KIND)     :: seedC(k,*),seedD(k,*),tau(*),t(ldt,*)
    2687             : 
    2688             :       integer(kind=ik)  :: irow,icol
    2689             :       real(kind=C_DATATYPE_KIND)     :: column_coefficient
    2690             :         call obj%timer%start("qr_pdlarfgk_1dcomm_generateT_&
    2691             :             &PRECISION&
    2692           0 :             &")
    2693             : 
    2694             :       !print *,'reversed on the fly T generation NYI'
    2695             : 
    2696           0 :       do icol=1,actualk-1
    2697             :         ! calculate inner product of householder Vector parts in seedC
    2698             :         ! (actually calculating more than necessary, if actualk < k)
    2699             :         ! => a lot of junk from row 1 to row k-actualk
    2700             : #ifdef DOUBLE_PRECISION_REAL
    2701           0 :         call dtrmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1)
    2702             : #else
    2703           0 :         call strmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1)
    2704             : #endif
    2705             :         ! add scaled D parts to current column of C (will become later T rows)
    2706           0 :         column_coefficient = seedC(k,k-icol+1)
    2707           0 :         do irow=k-actualk+1,k-1
    2708           0 :           seedC(irow,k-icol+1) = ( seedC(irow,k-icol+1) ) +  ( seedD(irow,k-icol+1) * column_coefficient * seedC(k,irow) )
    2709             :         end do
    2710             :       end do
    2711             : 
    2712             :       call qr_dlarft_kernel_&
    2713             :              &PRECISION &
    2714           0 :              (actualk,tau(k-actualk+1),seedC(k-actualk+1,k-actualk+2),k,t(k-actualk+1,k-actualk+1),ldt)
    2715             :       call obj%timer%stop("qr_pdlarfgk_1dcomm_generateT_&
    2716             :              &PRECISION&
    2717           0 :              &")
    2718             : 
    2719           0 :     end subroutine
    2720             : 
    2721             :     !direction=0: pack into work buffer
    2722             :     !direction=1: unpack from work buffer
    2723             :     subroutine qr_pdgeqrf_pack_unpack_&
    2724           0 : &PRECISION &
    2725           0 : (obj,v,ldv,work,lwork,m,n,mb,baseidx,rowidx,rev,direction,mpicomm)
    2726             :       use precision
    2727             :       use elpa1_impl
    2728             :       use qr_utils_mod
    2729             :       use elpa_abstract_impl
    2730             :       implicit none
    2731             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2732             :       ! input variables (local)
    2733             :       integer(kind=ik)   :: ldv,lwork
    2734             :       real(kind=C_DATATYPE_KIND)      :: v(ldv,*), work(*)
    2735             : 
    2736             :       ! input variables (global)
    2737             :       integer(kind=ik)   :: m,n,mb,baseidx,rowidx,rev,direction,mpicomm
    2738             : 
    2739             :       ! output variables (global)
    2740             : 
    2741             :       ! local scalars
    2742             :       integer(kind=ik)   :: mpierr,mpirank,mpiprocs
    2743             :       integer(kind=ik)   :: buffersize,icol
    2744             :       integer(kind=ik)   :: local_size,baseoffset,offset
    2745             : 
    2746             :       ! external functions
    2747             :         call obj%timer%start("qr_pdgeqrf_pack_unpack_&
    2748             :             &PRECISION&
    2749           0 :             &")
    2750           0 :       call mpi_comm_rank(mpicomm,mpirank,mpierr)
    2751           0 :       call mpi_comm_size(mpicomm,mpiprocs,mpierr)
    2752             :       call local_size_offset_1d(m,mb,baseidx,rowidx,rev,mpirank,mpiprocs, &
    2753           0 :                                     local_size,baseoffset,offset)
    2754             : 
    2755             :       !print *,'pack/unpack',local_size,baseoffset,offset
    2756             : 
    2757             :       ! rough approximate for buffer size
    2758           0 :       if (lwork .eq. -1) then
    2759           0 :         buffersize = local_size * n ! Vector elements
    2760           0 :         work(1) = DBLE(buffersize)
    2761             :         call obj%timer%stop("qr_pdgeqrf_pack_unpack_&
    2762             :             &PRECISION&
    2763           0 :             &")
    2764             : 
    2765           0 :         return
    2766             :       end if
    2767             : 
    2768           0 :       if (direction .eq. 0) then
    2769             :         ! copy v part to buffer (including zeros)
    2770           0 :         do icol=1,n
    2771           0 :           work(1+local_size*(icol-1):local_size*icol) = v(baseoffset:baseoffset+local_size-1,icol)
    2772             :         end do
    2773             :       else
    2774             :         ! copy v part from buffer (including zeros)
    2775           0 :         do icol=1,n
    2776           0 :           v(baseoffset:baseoffset+local_size-1,icol) = work(1+local_size*(icol-1):local_size*icol)
    2777             :         end do
    2778             :       end if
    2779             :         call obj%timer%stop("qr_pdgeqrf_pack_unpack_&
    2780             :             &PRECISION&
    2781           0 :             &")
    2782             : 
    2783           0 :       return
    2784             : 
    2785             :     end subroutine
    2786             : 
    2787             :     !direction=0: pack into work buffer
    2788             :     !direction=1: unpack from work buffer
    2789             :     subroutine qr_pdgeqrf_pack_unpack_tmatrix_&
    2790           0 :           &PRECISION &
    2791           0 :           (obj,tau,t,ldt,work,lwork,n,direction)
    2792             :       use precision
    2793             :       use elpa1_impl
    2794             :       use qr_utils_mod
    2795             :       use elpa_abstract_impl
    2796             :       implicit none
    2797             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2798             :       ! input variables (local)
    2799             :       integer(kind=ik)  :: ldt,lwork
    2800             :       real(kind=C_DATATYPE_KIND)     :: work(*), t(ldt,*),tau(*)
    2801             : 
    2802             :       ! input variables (global)
    2803             :       integer(kind=ik)  :: n,direction
    2804             : 
    2805             :       ! output variables (global)
    2806             : 
    2807             :       ! local scalars
    2808             :       integer(kind=ik)  :: icol
    2809             : 
    2810             :       ! external functions
    2811             :         call obj%timer%start("qr_pdgeqrf_pack_unpack_tmatrix_&
    2812             :             &PRECISION&
    2813           0 :             &")
    2814             : 
    2815           0 :       if (lwork .eq. -1) then
    2816             : #ifdef DOUBLE_PRECISION_REAL
    2817           0 :         work(1) = real(n*n,kind=C_DATATYPE_KIND)
    2818             : #else
    2819           0 :         work(1) = real(n*n,kind=rk4)
    2820             : #endif
    2821             : 
    2822             :         call obj%timer%stop("qr_pdgeqrf_pack_unpack_tmatrix_&
    2823             :             &PRECISION&
    2824           0 :             &")
    2825           0 :         return
    2826             :       end if
    2827             : 
    2828           0 :       if (direction .eq. 0) then
    2829             :         ! append t matrix to buffer (including zeros)
    2830           0 :         do icol=1,n
    2831           0 :           work(1+(icol-1)*n:icol*n) = t(1:n,icol)
    2832             :         end do
    2833             :       else
    2834             :         ! append t matrix from buffer (including zeros)
    2835           0 :         do icol=1,n
    2836           0 :           t(1:n,icol) = work(1+(icol-1)*n:icol*n)
    2837           0 :           tau(icol) = t(icol,icol)
    2838             :         end do
    2839             :       end if
    2840             :         call obj%timer%stop("qr_pdgeqrf_pack_unpack_tmatrix_&
    2841             :             &PRECISION&
    2842           0 :             &")
    2843             :     end subroutine
    2844             : 
    2845             : 
    2846             : #ifndef ALREADY_DEFINED
    2847             :     ! TODO: encode following functionality
    2848             :     !   - Direction? BOTTOM UP or TOP DOWN ("Up", "Down")
    2849             :     !        => influences all related kernels (including DLARFT / DLARFB)
    2850             :     !   - rank-k parameter (k=1,2,...,b)
    2851             :     !        => influences possible update strategies
    2852             :     !        => parameterize the function itself? (FUNCPTR, FUNCARG)
    2853             :     !   - Norm mode? Allreduce, Allgather, AlltoAll, "AllHouse", (ALLNULL = benchmarking local kernels)
    2854             :     !   - subblocking
    2855             :     !         (maximum block size bounded by data distribution along rows)
    2856             :     !   - blocking method (householder vectors only or compact WY?)
    2857             :     !   - update strategy of trailing parts (incremental, complete)
    2858             :     !        - difference for subblocks and normal blocks? (UPDATE and UPDATESUB)
    2859             :     !        o "Incremental"
    2860             :     !        o "Full"
    2861             :     !   - final T generation (recursive: subblock wise, block wise, end) (TMERGE)
    2862             :     !        ' (implicitly given by / influences update strategies?)
    2863             :     !        => alternative: during update: iterate over sub t parts
    2864             :     !           => advantage: smaller (cache aware T parts)
    2865             :     !           => disadvantage: more memory write backs
    2866             :     !                (number of T parts * matrix elements)
    2867             :     !   - partial/sub T generation (TGEN)
    2868             :     !        o add vectors right after creation (Vector)
    2869             :     !        o add set of vectors (Set)
    2870             :     !   - bcast strategy of householder vectors to other process columns
    2871             :     !        (influences T matrix generation and trailing update
    2872             :     !         in other process columns)
    2873             :     !        o no broadcast (NONE = benchmarking?,
    2874             :     !            or not needed due to 1D process grid)
    2875             :     !        o after every housegen (VECTOR)
    2876             :     !        o after every subblk   (SUBBLOCK)
    2877             :     !        o after full local column block decomposition (BLOCK)
    2878             :     !  LOOP Housegen -> BCAST -> GENT/EXTENDT -> LOOP HouseLeft
    2879             : 
    2880             :     !subroutine qr_pqrparam_init(PQRPARAM, DIRECTION, RANK, NORMMODE, &
    2881             :     !                             SUBBLK, UPDATE, TGEN, BCAST)
    2882             :     ! gmode: control communication pattern of dlarfg
    2883             :     ! maxrank: control max number of householder vectors per communication
    2884             :     ! eps: error threshold (integer)
    2885             :     ! update*: control update pattern in pdgeqr2_1dcomm ('incremental','full','merge')
    2886             :     !               merging = full update with tmatrix merging
    2887             :     ! tmerge*: 0: do not merge, 1: incremental merge, >1: recursive merge
    2888             :     !               only matters if update* == full
    2889           0 :     subroutine qr_pqrparam_init(obj,pqrparam,size2d,update2d,tmerge2d,size1d,update1d,tmerge1d,maxrank,update,eps,hgmode)
    2890             :       use precision
    2891             :       use elpa_abstract_impl
    2892             :       implicit none
    2893             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2894             :       ! input
    2895             :       CHARACTER         :: update2d,update1d,update,hgmode
    2896             :       INTEGER(kind=ik)  :: size2d,size1d,maxrank,eps,tmerge2d,tmerge1d
    2897             : 
    2898             :       ! output
    2899             : #ifdef USE_ASSUMED_SIZE_QR
    2900             :       INTEGER(kind=ik)  :: PQRPARAM(*)
    2901             : #else
    2902             :       INTEGER(kind=ik)  :: PQRPARAM(1:11)
    2903             : #endif
    2904             : 
    2905           0 :         call obj%timer%start("qr_pqrparam_init")
    2906             : 
    2907           0 :       PQRPARAM(1) = size2d
    2908           0 :       PQRPARAM(2) = ichar(update2d)
    2909           0 :       PQRPARAM(3) = tmerge2d
    2910             :       ! TODO: broadcast T yes/no
    2911             : 
    2912           0 :       PQRPARAM(4) = size1d
    2913           0 :       PQRPARAM(5) = ichar(update1d)
    2914           0 :       PQRPARAM(6) = tmerge1d
    2915             : 
    2916           0 :       PQRPARAM(7) = maxrank
    2917           0 :       PQRPARAM(8) = ichar(update)
    2918           0 :       PQRPARAM(9) = eps
    2919           0 :       PQRPARAM(10) = ichar(hgmode)
    2920           0 :         call obj%timer%stop("qr_pqrparam_init")
    2921             : 
    2922           0 :     end subroutine qr_pqrparam_init
    2923             : #endif /* ALREADY_DEFINED */
    2924             : 
    2925             :     subroutine qr_pdlarfg_copy_1dcomm_&
    2926           0 : &PRECISION &
    2927             : (obj,x,incx,v,incv,n,baseidx,idx,nb,rev,mpicomm)
    2928             :       use precision
    2929             :       use elpa1_impl
    2930             :       use qr_utils_mod
    2931             :       use elpa_abstract_impl
    2932             :       implicit none
    2933             :       class(elpa_abstract_impl_t), intent(inout) :: obj
    2934             :       ! input variables (local)
    2935             :       integer(kind=ik)  :: incx,incv
    2936             :       real(kind=C_DATATYPE_KIND)     :: x(*), v(*)
    2937             : 
    2938             :       ! input variables (global)
    2939             :       integer(kind=ik)  :: baseidx,idx,rev,nb,n
    2940             :       integer(kind=ik)  :: mpicomm
    2941             : 
    2942             :       ! output variables (global)
    2943             : 
    2944             :       ! local scalars
    2945             :       integer(kind=ik)  :: mpierr,mpiprocs
    2946             :       integer(kind=ik)  :: mpirank,mpirank_top
    2947             :       integer(kind=ik)  :: irow,x_offset
    2948             :       integer(kind=ik)  :: v_offset,local_size
    2949             : 
    2950             :         call obj%timer%start("qr_pdlarfg_copy_1dcomm_&
    2951             :             &PRECISION&
    2952           0 :             &")
    2953           0 :       call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    2954           0 :       call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
    2955             :       call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, &
    2956           0 :                                 local_size,v_offset,x_offset)
    2957           0 :       v_offset = v_offset * incv
    2958             : 
    2959             :       !print *,'copy:',mpirank,baseidx,v_offset,x_offset,local_size
    2960             : 
    2961             :       ! copy elements
    2962           0 :       do irow=1,local_size
    2963           0 :         v((irow-1)*incv+v_offset) = x((irow-1)*incx+x_offset)
    2964             :       end do
    2965             : 
    2966             :       ! replace top element to build an unitary Vector
    2967           0 :       mpirank_top = MOD((idx-1)/nb,mpiprocs)
    2968           0 :       if (mpirank .eq. mpirank_top) then
    2969             : #ifdef DOUBLE_PRECISION_REAL
    2970           0 :         v(local_size*incv) = 1.0_rk8
    2971             : #else
    2972           0 :         v(local_size*incv) = 1.0_rk4
    2973             : #endif
    2974             :       end if
    2975             :         call obj%timer%stop("qr_pdlarfg_copy_1dcomm_&
    2976             :             &PRECISION&
    2977           0 :             &")
    2978             : 
    2979           0 :     end subroutine
    2980             : 
    2981             : ! vim: syntax=fortran

Generated by: LCOV version 1.12