LCOV - code coverage report
Current view: top level - src/elpa2 - elpa2_tridiag_band_template.F90 (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53
Coverage:  Lines: 298 of 438 hit (68.0 %)   Functions: 4 of 10 hit (40.0 %)

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
       12             : !      Schwerpunkt Wissenschaftliches Rechnen,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
       14             : !    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
       15             : !      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !    This particular source code file contains additions, changes and
       20             : !    enhancements authored by Intel Corporation, which is not part of
      21             : !    the ELPA consortium.
      22             : !
      23             : !    More information can be found here:
      24             : !    http://elpa.mpcdf.mpg.de/
      25             : !
      26             : !    ELPA is free software: you can redistribute it and/or modify
      27             : !    it under the terms of the version 3 of the license of the
      28             : !    GNU Lesser General Public License as published by the Free
      29             : !    Software Foundation.
      30             : !
      31             : !    ELPA is distributed in the hope that it will be useful,
      32             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      33             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      34             : !    GNU Lesser General Public License for more details.
      35             : !
      36             : !    You should have received a copy of the GNU Lesser General Public License
      37             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      38             : !
      39             : !    ELPA reflects a substantial effort on the part of the original
      40             : !    ELPA consortium, and we ask you to respect the spirit of the
      41             : !    license that we chose: i.e., please contribute any changes you
      42             : !    may have back to the original ELPA library distribution, and keep
      43             : !    any derivatives of ELPA under the same license that we chose for
      44             : !    the original distribution, the GNU Lesser General Public License.
      45             : !
      46             : ! Copyright of the original code rests with the authors inside the ELPA
      47             : ! consortium. The copyright of any additional modifications shall rest
      48             : ! with their original authors, but shall adhere to the licensing terms
      49             : ! distributed along with the original code in the file "COPYING".
      50             : #endif
      51             : 
      52             : #include "../general/sanity.F90"
      53             : 
      54             : #if REALCASE == 1
      55             :     subroutine tridiag_band_real_&
      56             : #endif
      57             : #if COMPLEXCASE == 1
      58             :     subroutine tridiag_band_complex_&
      59             : #endif
      60       47448 :     &PRECISION &
      61       47448 :     (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
      62             :     hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug)
      63             :     !-------------------------------------------------------------------------------
      64             :     ! tridiag_band_real/complex:
       65             : ! Reduces a real symmetric (or complex Hermitian) band matrix to tridiagonal form
      66             :     !
      67             :     !  na          Order of matrix a
      68             :     !
       69             : !  nb          Semi bandwidth
      70             :     !
      71             :     !  nblk        blocksize of cyclic distribution, must be the same in both directions!
      72             :     !
       73             : !  aMatrix(lda,matrixCols)    Distributed system matrix reduced to banded form, with the band stored in the upper triangle
      74             :     !
      75             :     !  lda         Leading dimension of a
      76             :     !  matrixCols  local columns of matrix a
      77             :     !
       78             : ! hh_trans : Householder vectors
      79             :     !
      80             :     !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
      81             :     !
      82             :     !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
      83             :     !
      84             :     !  mpi_comm_rows
      85             :     !  mpi_comm_cols
      86             :     !              MPI-Communicators for rows/columns
      87             :     !  communicator
      88             :     !              MPI-Communicator for the total processor set
      89             :     !-------------------------------------------------------------------------------
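                      : ! As an illustration only (not taken from the ELPA sources): the template is
                      : ! expanded once per data type and precision, so a double-precision real build
                      : ! produces a routine that a caller would invoke roughly as
                      : !
                      : !   call tridiag_band_real_double(obj, na, nb, nblk, aMatrix, a_dev, lda,    &
                      : !                                 d, e, matrixCols, hh_trans, mpi_comm_rows, &
                      : !                                 mpi_comm_cols, communicator, useGPU, wantDebug)
                      : !
                      : ! where the exact routine name depends on the PRECISION suffix and the actual
                      : ! arguments are placeholders matching the dummy arguments documented above.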
      90             :       use elpa_abstract_impl
      91             :       use elpa2_workload
      92             :       use precision
      93             :       use iso_c_binding
      94             :       use redist
      95             :       implicit none
      96             : #include "../general/precision_kinds.F90"
      97             :       class(elpa_abstract_impl_t), intent(inout)   :: obj
      98             :       logical, intent(in)                          :: useGPU, wantDebug
      99             :       integer(kind=ik), intent(in)                 :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
     100             : #ifdef USE_ASSUMED_SIZE
     101             :       MATH_DATATYPE(kind=rck), intent(in)         :: aMatrix(lda,*)
     102             : #else
     103             :       MATH_DATATYPE(kind=rck), intent(in)         :: aMatrix(lda,matrixCols)
     104             : #endif
     105             :       integer(kind=c_intptr_t)                     :: a_dev
     106             :       real(kind=rk), intent(out)        :: d(na), e(na) ! set only on PE 0
     107             :       MATH_DATATYPE(kind=rck), intent(out), allocatable   :: hh_trans(:,:)
     108             : 
     109             :       real(kind=rk)                     :: vnorm2
     110      284688 :       MATH_DATATYPE(kind=rck)                     :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
     111      189792 :       MATH_DATATYPE(kind=rck)                     :: hd(nb), hs(nb)
     112             : 
     113             :       integer(kind=ik)                             :: i, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
     114             :       integer(kind=ik)                             :: my_pe, n_pes, mpierr
     115             :       integer(kind=ik)                             :: my_prow, np_rows, my_pcol, np_cols
     116             :       integer(kind=ik)                             :: ireq_ab, ireq_hv
     117             :       integer(kind=ik)                             :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
     118             : #ifdef WITH_OPENMP
     119             :       integer(kind=ik)                             :: max_threads, my_thread, my_block_s, my_block_e, iter
     120             : #ifdef WITH_MPI
     121             : !      integer(kind=ik)                            :: my_mpi_status(MPI_STATUS_SIZE)
     122             : #endif
     123             : !      integer(kind=ik), allocatable               :: mpi_statuses(:,:), global_id_tmp(:,:)
     124       23724 :       integer(kind=ik), allocatable                :: global_id_tmp(:,:)
     125       23724 :       integer(kind=ik), allocatable                :: omp_block_limits(:)
     126       47448 :       MATH_DATATYPE(kind=rck), allocatable        :: hv_t(:,:), tau_t(:)
     127             :       integer(kind=ik)                             :: omp_get_max_threads
     128             : #endif /* WITH_OPENMP */
     129      142344 :       integer(kind=ik), allocatable                :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
     130       71172 :       integer(kind=ik), allocatable                :: limits(:), snd_limits(:,:)
     131       47448 :       integer(kind=ik), allocatable                :: block_limits(:)
     132       94896 :       MATH_DATATYPE(kind=rck), allocatable        :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
     133             :       integer                                      :: istat
     134             :       character(200)                               :: errorMessage
     135             : 
     136             : #ifndef WITH_MPI
     137             :       integer(kind=ik)                             :: startAddr
     138             : #endif
     139             :       call obj%timer%start("tridiag_band_&
     140             :       &MATH_DATATYPE&
     141             :       &" // &
     142             :       &PRECISION_SUFFIX &
     143       47448 :       )
     144       47448 :       if (wantDebug) call obj%timer%start("mpi_communication")
     145       47448 :       call mpi_comm_rank(communicator,my_pe,mpierr)
     146       47448 :       call mpi_comm_size(communicator,n_pes,mpierr)
     147             : 
     148       47448 :       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
     149       47448 :       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
     150       47448 :       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
     151       47448 :       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
     152       47448 :       if (wantDebug) call obj%timer%stop("mpi_communication")
     153             : 
      154             :       ! Get global_id, mapping 2D processor coordinates to global id
     155             : 
     156       47448 :       allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
     157       47448 :       if (istat .ne. 0) then
     158             :         print *,"tridiag_band_&
     159             :                  &MATH_DATATYPE&
     160           0 :                  &: error when allocating global_id "//errorMessage
     161           0 :         stop 1
     162             :       endif
     163             : 
     164       47448 :       global_id(:,:) = 0
     165       47448 :       global_id(my_prow, my_pcol) = my_pe
     166             : 
     167             : #ifdef WITH_OPENMP
     168       23724 :       allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
     169       23724 :       if (istat .ne. 0) then
     170             :         print *,"tridiag_band_&
     171             :                 &MATH_DATATYPE&
     172           0 :                 &: error when allocating global_id_tmp "//errorMessage
     173           0 :         stop 1
     174             :       endif
     175             : #endif
     176             : 
     177             : #ifdef WITH_MPI
     178       31632 :       if (wantDebug) call obj%timer%start("mpi_communication")
     179             : #ifndef WITH_OPENMP
     180       15816 :       call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
     181             : #else
     182       15816 :       global_id_tmp(:,:) = global_id(:,:)
     183       15816 :       call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
     184       15816 :       deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
     185       15816 :       if (istat .ne. 0) then
     186             :         print *,"tridiag_band_&
     187             :                  &MATH_DATATYPE&
     188           0 :                  &: error when deallocating global_id_tmp "//errorMessage
     189           0 :         stop 1
     190             :       endif
     191             : #endif /* WITH_OPENMP */
     192       31632 :       if (wantDebug) call obj%timer%stop("mpi_communication")
     193             : #endif /* WITH_MPI */
     194             : 
     195             :       ! Total number of blocks in the band:
     196             : 
     197       47448 :       nblocks_total = (na-1)/nb + 1
     198             : 
     199             :       ! Set work distribution
     200             : 
     201       47448 :       allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
     202       47448 :       if (istat .ne. 0) then
     203             :         print *,"tridiag_band_&
     204             :                  &MATH_DATATYPE&
     205           0 :                  &: error when allocating block_limits"//errorMessage
     206           0 :         stop 1
     207             :       endif
     208             : 
     209       47448 :       call divide_band(obj,nblocks_total, n_pes, block_limits)
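                      :       ! Illustration only (divide_band is implemented elsewhere): an even split of
                      :       ! the band blocks over the tasks would give roughly
                      :       !   block_limits(i) = (i * nblocks_total) / n_pes,   i = 0 .. n_pes,
                      :       ! so that task p owns blocks block_limits(p)+1 .. block_limits(p+1).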
     210             : 
     211             :       ! nblocks: the number of blocks for my task
     212       47448 :       nblocks = block_limits(my_pe+1) - block_limits(my_pe)
     213             : 
     214             :       ! allocate the part of the band matrix which is needed by this PE
     215             :       ! The size is 1 block larger than needed to avoid extensive shifts
     216       47448 :       allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
     217       47448 :       if (istat .ne. 0) then
     218             :         print *,"tridiag_band_&
     219             :                  &MATH_DATATYPE&
     220           0 :                  &: error when allocating ab"//errorMessage
     221           0 :         stop 1
     222             :       endif
     223             : 
     224       47448 :       ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety
     225             : 
     226             :       ! n_off: Offset of ab within band
     227       47448 :       n_off = block_limits(my_pe)*nb
     228             : 
     229             :       ! Redistribute band in a to ab
     230             :       call redist_band_&
     231             :       &MATH_DATATYPE&
     232             :       &_&
     233             :       &PRECISION&
     234       47448 :       &(obj,aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
     235             : 
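                      :       ! Layout sketch (an assumption based on how d/e are extracted below, not a
                      :       ! quote from redist_band): ab holds the band in lower band storage, i.e. for
                      :       ! the columns owned by this task roughly
                      :       !
                      :       !   do j = 1, na                        ! global column index
                      :       !     do i = j, min(j+nb, na)           ! rows inside the band
                      :       !       ab(1+i-j, j-n_off) = a(i,j)     ! a() = illustrative dense matrix
                      :       !     enddo
                      :       !   enddo
                      :       !
                      :       ! so ab(1,:) carries the diagonal and ab(2,:) the subdiagonal; the leading
                      :       ! dimension 2*nb leaves room below the band for the block shifts done later.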
     236             :       ! Calculate the workload for each sweep in the back transformation
     237             :       ! and the space requirements to hold the HH vectors
     238             : 
     239       47448 :       allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
     240       47448 :       if (istat .ne. 0) then
     241             :         print *,"tridiag_band_&
     242             :                  &MATH_DATATYPE&
     243           0 :                  &: error when allocating limits"//errorMessage
     244           0 :         stop 1
     245             :       endif
     246             : 
     247       47448 :       call determine_workload(obj,na, nb, np_rows, limits)
     248       47448 :       max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
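                      :       ! limits(0:np_rows) partitions the rows of the (remaining) matrix over the
                      :       ! process rows; max_blk_size is thus the largest number of Householder vectors
                      :       ! any process row has to hold for a single block column.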
     249             : 
     250       47448 :       num_hh_vecs = 0
     251       47448 :       num_chunks  = 0
     252       47448 :       nx = na
     253      278784 :       do n = 1, nblocks_total
     254      231336 :         call determine_workload(obj, nx, nb, np_rows, limits)
     255      231336 :         local_size = limits(my_prow+1) - limits(my_prow)
     256             :         ! add to number of householder vectors
      257             :         ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor sent below!
     258      231336 :         if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
     259      215520 :           num_hh_vecs = num_hh_vecs + local_size
     260      215520 :           num_chunks  = num_chunks+1
     261             :         endif
     262      231336 :         nx = nx - nb
     263             :       enddo
     264             : 
     265             :       ! Allocate space for HH vectors
     266             : 
     267       47448 :       allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
     268       47448 :       if (istat .ne. 0) then
     269             : #if REALCASE == 1
     270           0 :         print *,"tridiag_band_real: error when allocating hh_trans"//errorMessage
     271             : #endif
     272             : #if COMPLEXCASE == 1
     273           0 :         print *,"tridiag_band_complex: error when allocating hh_trans "//errorMessage
     274             : #endif
     275           0 :         stop 1
     276             :       endif
     277             : 
     278             :       ! Allocate and init MPI requests
     279             : 
     280       47448 :       allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
     281       47448 :       if (istat .ne. 0) then
     282             :         print *,"tridiag_band_&
     283             :                  &MATH_DATATYPE&
     284           0 :                  &: error when allocating ireq_hhr"//errorMessage
     285           0 :         stop 1
     286             :       endif
     287       47448 :       allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
     288       47448 :       if (istat .ne. 0) then
     289             :         print *,"tridiag_band_&
      290             :                  &MATH_DATATYPE&
     291           0 :                  &: error when allocating ireq_hhs"//errorMessage
     292           0 :         stop 1
     293             :       endif
     294             : 
     295       47448 :       num_hh_vecs = 0
     296       47448 :       num_chunks  = 0
     297       47448 :       nx = na
     298       47448 :       nt = 0
     299      278784 :       do n = 1, nblocks_total
     300      231336 :         call determine_workload(obj,nx, nb, np_rows, limits)
     301      231336 :         local_size = limits(my_prow+1) - limits(my_prow)
     302      231336 :         if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
     303      215520 :           num_chunks  = num_chunks+1
     304             : #ifdef WITH_MPI
     305      138408 :           if (wantDebug) call obj%timer%start("mpi_communication")
     306             :           call mpi_irecv(hh_trans(1,num_hh_vecs+1),          &
     307             :                    nb*local_size,                           &
     308             : #if REALCASE == 1
     309             :        MPI_REAL_PRECISION,                     &
     310             : #endif
     311             : #if COMPLEXCASE == 1
     312             :                         MPI_COMPLEX_EXPLICIT_PRECISION,           &
     313             : #endif
     314      138408 :                         nt, 10+n-block_limits(nt), communicator, ireq_hhr(num_chunks), mpierr)
     315      138408 :           if (wantDebug) call obj%timer%stop("mpi_communication")
     316             : 
     317             : #else /* WITH_MPI */
      318             :           ! careful: non-blocking recv, so the data copy must be done at the wait or send
     319             :           ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
     320             : 
     321             : #endif /* WITH_MPI */
     322      215520 :           num_hh_vecs = num_hh_vecs + local_size
     323             :         endif
     324      231336 :         nx = nx - nb
     325      231336 :         if (n == block_limits(nt+1)) then
     326       79080 :           nt = nt + 1
     327             :         endif
     328             :       enddo
     329             : #ifdef WITH_MPI
     330       31632 :       ireq_hhs(:) = MPI_REQUEST_NULL
     331             : #endif
     332             :       ! Buffers for gathering/sending the HH vectors
     333             : 
     334       47448 :       allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
     335       47448 :       if (istat .ne. 0) then
     336             :         print *,"tridiag_band_&
     337             :                  &MATH_DATATYPE&
     338           0 :                  &: error when allocating hh_gath"//errorMessage
     339           0 :         stop 1
     340             :       endif
     341             : 
     342       47448 :       allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
     343       47448 :       if (istat .ne. 0) then
     344             :         print *,"tridiag_band_&
     345             :                 &MATH_DATATYPE&
     346           0 :                 &: error when allocating hh_send"//errorMessage
     347           0 :         stop 1
     348             :       endif
     349             : 
     350       47448 :       hh_gath(:,:,:) = 0.0_rck
     351       47448 :       hh_send(:,:,:) = 0.0_rck
     352             : 
     353             :       ! Some counters
     354             : 
     355       47448 :       allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
     356       47448 :       if (istat .ne. 0) then
     357             :         print *,"tridiag_band_&
     358             :                 &MATH_DATATYPE&
     359           0 :                 &: error when allocating hh_cnt"//errorMessage
     360           0 :         stop 1
     361             :       endif
     362             : 
     363       47448 :       allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
     364       47448 :       if (istat .ne. 0) then
     365             :         print *,"tridiag_band_&
     366             :                 &MATH_DATATYPE&
     367           0 :                 &: error when allocating hh_dst"//errorMessage
     368           0 :         stop 1
     369             :       endif
     370             : 
      371       47448 :       hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
     372       47448 :       hh_dst(:) = 0 ! PE number for receive
     373             : #ifdef WITH_MPI
     374       31632 :       ireq_ab = MPI_REQUEST_NULL
     375       31632 :       ireq_hv = MPI_REQUEST_NULL
     376             : #endif
     377             :       ! Limits for sending
     378             : 
     379       47448 :       allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
     380       47448 :       if (istat .ne. 0) then
     381             :         print *,"tridiag_band_&
     382             :                 &MATH_DATATYPE&
     383           0 :                 &: error when allocating snd_limits"//errorMessage
     384           0 :         stop 1
     385             :       endif
     386      201672 :       do iblk=1,nblocks
     387      154224 :         call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
     388             :       enddo
     389             : 
     390             : #ifdef WITH_OPENMP
     391             :       ! OpenMP work distribution:
     392             : 
     393       23724 :       max_threads = 1
     394             : #if REALCASE == 1
     395       14292 :       max_threads = omp_get_max_threads()
     396             : #endif
     397             : #if COMPLEXCASE == 1
     398        9432 : !$      max_threads = omp_get_max_threads()
     399             : #endif
     400             :       ! For OpenMP we need at least 2 blocks for every thread
     401       23724 :       max_threads = MIN(max_threads, nblocks/2)
     402       23724 :       if (max_threads==0) max_threads = 1
     403             : 
     404       23724 :       allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
     405       23724 :       if (istat .ne. 0) then
     406             :         print *,"tridiag_band_&
     407             :                  &MATH_DATATYPE&
     408           0 :                  &: error when allocating omp_block_limits"//errorMessage
     409           0 :         stop 1
     410             :       endif
     411             : 
     412             :       ! Get the OpenMP block limits
     413       23724 :       call divide_band(obj,nblocks, max_threads, omp_block_limits)
     414             : 
     415       23724 :       allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
     416       23724 :       if (istat .ne. 0) then
     417             :         print *,"tridiag_band_&
     418             :                 &MATH_DATATYPE&
     419           0 :                 &: error when allocating hv_t, tau_t"//errorMessage
     420           0 :         stop 1
     421             :       endif
     422             : 
     423       23724 :       hv_t = 0.0_rck
     424       23724 :       tau_t = 0.0_rck
     425             : #endif /* WITH_OPENMP */
     426             : 
     427             :       ! ---------------------------------------------------------------------------
     428             :       ! Start of calculations
     429             : 
     430       47448 :       na_s = block_limits(my_pe)*nb + 1
     431             : 
     432       47448 :       if (my_pe>0 .and. na_s<=na) then
     433             :         ! send first column to previous PE
     434             :         ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
     435       15816 :         ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
     436             : #ifdef WITH_MPI
     437       15816 :         if (wantDebug) call obj%timer%start("mpi_communication")
     438             :         call mpi_isend(ab_s, nb+1,  &
     439             : #if REALCASE == 1
     440             :                  MPI_REAL_PRECISION,  &
     441             : #endif
     442             : #if COMPLEXCASE == 1
     443             :                        MPI_COMPLEX_EXPLICIT_PRECISION, &
     444             : #endif
     445       15816 :            my_pe-1, 1, communicator, ireq_ab, mpierr)
     446       15816 :         if (wantDebug) call obj%timer%stop("mpi_communication")
     447             : #endif /* WITH_MPI */
     448             :       endif
     449             : 
     450             : #ifndef WITH_MPI
     451       15816 :           startAddr = ubound(hh_trans,dim=2)
     452             : #endif /* WITH_MPI */
     453             : 
     454             : #ifdef WITH_OPENMP
     455     3857352 :       do istep=1,na-1-block_limits(my_pe)*nb
     456             : #else
     457     4537800 :       do istep=1,na-1
     458             : #endif
     459             : 
     460     8347704 :         if (my_pe==0) then
     461     6018768 :           n = MIN(na-na_s,nb) ! number of rows to be reduced
     462     6018768 :           hv(:) = 0.0_rck
     463     6018768 :           tau = 0.0_rck
     464             : 
     465             :           ! Transform first column of remaining matrix
     466             : #if REALCASE == 1
     467             :           ! The last step (istep=na-1) is only needed for sending the last HH vectors.
     468             :           ! We don't want the sign of the last element flipped (analogous to the other sweeps)
     469             : #endif
     470             : #if COMPLEXCASE == 1
      471             :          ! As opposed to the real case, the last step (istep=na-1) is needed here for making
     472             :          ! the last subdiagonal element a real number
     473             : #endif
     474             : 
     475             : #if REALCASE == 1
     476     3492144 :           if (istep < na-1) then
     477             :             ! Transform first column of remaining matrix
     478     3473088 :             vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
     479             : #endif
     480             : #if COMPLEXCASE == 1
     481             : #ifdef DOUBLE_PRECISION_COMPLEX
     482     1748352 :           vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
     483             : #else
     484      778272 :           vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
     485             : #endif
     486     2526624 :           if (n<2) vnorm2 = 0. ! Safety only
     487             : #endif /* COMPLEXCASE */
     488             : 
     489             : #if REALCASE == 1
     490             :             call hh_transform_real_&
     491             : #endif
     492             : #if COMPLEXCASE == 1
     493             :             call hh_transform_complex_&
     494             : #endif
     495             : &PRECISION &
     496     5999712 :                                (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)
     497             : 
     498     5999712 :             hv(1) = 1.0_rck
     499     5999712 :             hv(2:n) = ab(3:n+1,na_s-n_off)*hf
     500             : #if REALCASE == 1
     501             :           endif
     502             : #endif
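                      :             ! hh_transform_real/_complex (a separate template) generates the
                      :             ! Householder reflector for this column: given alpha = ab(2,na_s-n_off)
                      :             ! and vnorm2 = |x|^2 of the entries below it, it returns tau and a
                      :             ! scaling factor hf such that, as a sketch,
                      :             !   (I - tau*hv*hv^H) * [alpha; x] = [beta; 0]   with   hv = [1; x*hf],
                      :             ! and overwrites alpha with the new subdiagonal element beta
                      :             ! (the exact sign convention is that of the ELPA implementation).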
     503             : 
     504             : #if REALCASE == 1
     505     3492144 :           d(istep) = ab(1,na_s-n_off)
     506     3492144 :           e(istep) = ab(2,na_s-n_off)
     507             : #endif
     508             : #if COMPLEXCASE == 1
     509     2526624 :           d(istep) = real(ab(1,na_s-n_off), kind=rk)
     510     2526624 :           e(istep) = real(ab(2,na_s-n_off), kind=rk)
     511             : #endif
     512             : 
     513     6018768 :           if (istep == na-1) then
     514             : #if REALCASE == 1
     515       19056 :             d(na) = ab(1,na_s+1-n_off)
     516             : #endif
     517             : #if COMPLEXCASE == 1
     518       12576 :             d(na) = real(ab(1,na_s+1-n_off),kind=rk)
     519             : #endif
     520       31632 :             e(na) = 0.0_rck
     521             :           endif
     522             :         else
     523     2328936 :           if (na>na_s) then
     524             :             ! Receive Householder Vector from previous task, from PE owning subdiagonal
     525             : 
     526             : #ifdef WITH_OPENMP
     527             : 
     528             : #ifdef WITH_MPI
     529      824244 :             if (wantDebug) call obj%timer%start("mpi_communication")
     530             :             call mpi_recv(hv, nb,       &
     531             : #if REALCASE == 1
     532             :                           MPI_REAL_PRECISION, &
     533             : #endif
     534             : #if COMPLEXCASE == 1
     535             :                           MPI_COMPLEX_EXPLICIT_PRECISION, &
     536             : #endif
     537      824244 :                           my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
     538      824244 :             if (wantDebug) call obj%timer%stop("mpi_communication")
     539             : 
     540             : #else /* WITH_MPI */
     541             : 
     542           0 :             hv(1:nb) = hv_s(1:nb)
     543             : 
     544             : #endif /* WITH_MPI */
     545             : 
     546             : #else /* WITH_OPENMP */
     547             : 
     548             : #ifdef WITH_MPI
     549      824244 :             if (wantDebug) call obj%timer%start("mpi_communication")
     550             : 
     551             :             call mpi_recv(hv, nb,          &
     552             : #if REALCASE == 1
     553             :                           MPI_REAL_PRECISION, &
     554             : #endif
     555             : #if COMPLEXCASE == 1
     556             :                           MPI_COMPLEX_EXPLICIT_PRECISION, &
     557             : #endif
     558      824244 :                           my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
     559      824244 :             if (wantDebug) call obj%timer%stop("mpi_communication")
     560             : 
     561             : #else /* WITH_MPI */
     562           0 :             hv(1:nb) = hv_s(1:nb)
     563             : #endif /* WITH_MPI */
     564             : 
     565             : #endif /* WITH_OPENMP */
     566     1648488 :             tau = hv(1)
     567     1648488 :             hv(1) = 1.0_rck
     568             :           endif
     569             :         endif
     570             : 
     571     8347704 :         na_s = na_s+1
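                      :         ! Once a full block of nb columns has been processed, shift the stored
                      :         ! band one block to the left, clear the freed last block and advance the
                      :         ! offset n_off accordingly.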
     572     8347704 :         if (na_s-n_off > nb) then
     573      168300 :           ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
     574      168300 :           ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rck
     575      168300 :           n_off = n_off + nb
     576             :         endif
     577             : 
     578             : #ifdef WITH_OPENMP
     579     3833628 :         if (max_threads > 1) then
     580             : 
     581             :           ! Codepath for OpenMP
     582             : 
     583             :           ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
     584             :           ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
     585             :           ! This simulates the behaviour of the MPI tasks which also work after each other.
     586             :           ! The code would be considerably easier, if the MPI communication would be made within
     587             :           ! the parallel region - this is avoided here since this would require
     588             :           ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
     589             : 
     590           0 :           hv_t(:,1) = hv
     591           0 :           tau_t(1) = tau
     592             : 
     593           0 :           do iter = 1, 2
     594             : 
     595             :             ! iter=1 : work on first block
     596             :             ! iter=2 : work on remaining blocks
     597             :             ! This is done in 2 iterations so that we have a barrier in between:
     598             :             ! After the first iteration, it is guaranteed that the last row of the last block
     599             :             ! is completed by the next thread.
      600             :             ! The end of the first iteration is also the place to exchange the last row
      601             :             ! with MPI calls
     602           0 :             call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
     603             : 
     604             : !$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
     605           0 : !$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
     606             :             do my_thread = 1, max_threads
     607             : 
     608           0 :               if (iter == 1) then
     609           0 :                 my_block_s = omp_block_limits(my_thread-1) + 1
     610           0 :                 my_block_e = my_block_s
     611             :               else
     612           0 :                 my_block_s = omp_block_limits(my_thread-1) + 2
     613           0 :                 my_block_e = omp_block_limits(my_thread)
     614             :               endif
     615             : 
     616           0 :               do iblk = my_block_s, my_block_e
     617             : 
     618           0 :                 ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
     619           0 :                 ne = ns+nb-1                    ! last column in block
     620             : 
     621           0 :                 if (istep<my_thread .or. ns+n_off>na) exit
     622             : 
     623           0 :                 hv = hv_t(:,my_thread)
     624           0 :                 tau = tau_t(my_thread)
     625             : 
     626             :                 ! Store Householder Vector for back transformation
     627             : 
     628           0 :                 hh_cnt(iblk) = hh_cnt(iblk) + 1
     629             : 
     630           0 :                 hh_gath(1   ,hh_cnt(iblk),iblk) = tau
     631           0 :                 hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
     632             : 
     633           0 :                 nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
     634           0 :                 nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
     635             :                                           ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
     636             : 
     637             :                 ! Transform diagonal block
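                      :                 ! The SYMV/HEMV, dot_product and SYR2/HER2 below apply the two-sided
                      :                 ! update  A := (I - tau*hv*hv^H) * A * (I - tau*hv*hv^H)^H  to the
                      :                 ! diagonal block:  hd = tau*A*hv,  x = tau*(hv^H*hd)  (conjg(tau) in
                      :                 ! the complex case),  hd := hd - 0.5*x*hv,  and finally
                      :                 ! A := A - hd*hv^H - hv*hd^H.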
     638           0 :                 if (wantDebug) call obj%timer%start("blas")
     639             : #if REALCASE == 1
     640           0 :                 call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
     641             : #endif
     642             : #if COMPLEXCASE == 1
     643           0 :                 call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
     644             : #endif
     645           0 :                 if (wantDebug) call obj%timer%stop("blas")
     646             : #if REALCASE == 1
     647           0 :                 x = dot_product(hv(1:nc),hd(1:nc))*tau
     648             : #endif
     649             : #if COMPLEXCASE == 1
     650           0 :                 x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
     651             : #endif
     652           0 :                 hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc)
     653           0 :                 if (wantDebug) call obj%timer%start("blas")
     654             : #if REALCASE == 1
     655           0 :                 call PRECISION_SYR2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
     656             : #endif
     657             : #if COMPLEXCASE == 1
     658           0 :                 call PRECISION_HER2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
     659             : #endif
     660           0 :                 if (wantDebug) call obj%timer%stop("blas")
     661           0 :                 hv_t(:,my_thread) = 0.0_rck
     662           0 :                 tau_t(my_thread)  = 0.0_rck
     663           0 :                 if (nr<=0) cycle ! No subdiagonal block present any more
     664             : 
     665             :                 ! Transform subdiagonal block
     666           0 :                 if (wantDebug) call obj%timer%start("blas")
     667           0 :                 call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
     668           0 :                 if (wantDebug) call obj%timer%stop("blas")
     669           0 :                 if (nr>1) then
     670             : 
     671             :                   ! complete (old) Householder transformation for first column
     672             : 
     673           0 :                   ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
     674             : 
     675             :                   ! calculate new Householder transformation for first column
     676             :                   ! (stored in hv_t(:,my_thread) and tau_t(my_thread))
     677             : 
     678             : #if REALCASE == 1
     679           0 :                   vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
     680             : #endif
     681             : #if COMPLEXCASE == 1
     682             : #ifdef  DOUBLE_PRECISION_COMPLEX
     683           0 :                   vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
     684             : #else
     685           0 :                   vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
     686             : #endif
     687             : #endif /* COMPLEXCASE */
     688             : 
     689             : #if REALCASE == 1
     690             :                   call hh_transform_real_&
     691             : #endif
     692             : #if COMPLEXCASE == 1
     693             :                   call hh_transform_complex_&
     694             : #endif
     695             :                   &PRECISION &
     696           0 :                         (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)
     697             : 
     698           0 :                   hv_t(1   ,my_thread) = 1.0_rck
     699           0 :                   hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
     700           0 :                   ab(nb+2:,ns) = 0.0_rck
     701             :                   ! update subdiagonal block for old and new Householder transformation
     702             :                   ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
     703           0 :                   if (wantDebug) call obj%timer%start("blas")
     704             :                   call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,            &
     705           0 :                           nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, ZERO, h(2), 1)
     706           0 :                   if (wantDebug) call obj%timer%stop("blas")
     707             : 
     708           0 :                   x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
     709           0 :                   h(2:nb) = h(2:nb) - x*hv(2:nb)
     710             :                   ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
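                      :                   ! so the update  A_sub := A_sub - hv_new*h^H - hs*hv^H  (plain
                      :                   ! transposes in the real case) is applied column by column in the
                      :                   ! loop below; hv_new/tau_new live in hv_t(:,my_thread) and
                      :                   ! tau_t(my_thread) in this OpenMP code path.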
     711           0 :                   do i=2,nb
     712             :                     ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*  &
     713             : #if REALCASE == 1
     714           0 :                                       h(i) - hs(1:nr)*hv(i)
     715             : #endif
     716             : #if COMPLEXCASE == 1
     717           0 :                                                   conjg(h(i)) - hs(1:nr)*conjg(hv(i))
     718             : #endif
     719             :                   enddo
     720             : 
     721             :                 else
     722             : 
     723             :                   ! No new Householder transformation for nr=1, just complete the old one
     724           0 :                   ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
     725           0 :                   do i=2,nb
     726             : #if REALCASE == 1
     727           0 :                     ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
     728             : #endif
     729             : #if COMPLEXCASE == 1
     730           0 :                     ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
     731             : #endif
     732             :                   enddo
     733             :                   ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
     734           0 :                   hv_t(1,my_thread) = 1.0_rck
     735             :                 endif
     736             : 
     737             :               enddo
     738             : 
     739             :             enddo ! my_thread
     740             : !$omp end parallel do
     741             : 
     742           0 :             call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
     743             : 
     744           0 :             if (iter==1) then
     745             :               ! We are at the end of the first block
     746             : 
     747             :               ! Send our first column to previous PE
     748           0 :               if (my_pe>0 .and. na_s <= na) then
     749             : #ifdef WITH_MPI
     750           0 :                 if (wantDebug) call obj%timer%start("mpi_communication")
     751           0 :                 call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
     752           0 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
     753             : 
     754             : #endif
     755           0 :                 ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
     756             : #ifdef WITH_MPI
     757           0 :                 if (wantDebug) call obj%timer%start("mpi_communication")
     758             :                 call mpi_isend(ab_s, nb+1,  &
     759             : #if REALCASE == 1
     760             :                    MPI_REAL_PRECISION, &
     761             : #endif
     762             : #if COMPLEXCASE == 1
     763             :                                MPI_COMPLEX_EXPLICIT_PRECISION, &
     764             : #endif
     765           0 :              my_pe-1, 1, communicator, ireq_ab, mpierr)
     766           0 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
     767             : 
     768             : #endif /* WITH_MPI */
     769             :               endif
     770             : 
     771             :               ! Request last column from next PE
     772           0 :               ne = na_s + nblocks*nb - (max_threads-1) - 1
     773             : #ifdef WITH_MPI
     774           0 :               if (wantDebug) call obj%timer%start("mpi_communication")
     775             : 
     776           0 :               if (istep>=max_threads .and. ne <= na) then
     777             :                 call mpi_recv(ab(1,ne-n_off), nb+1,       &
     778             : #if REALCASE == 1
     779             :                   MPI_REAL_PRECISION, &
     780             : #endif
     781             : #if COMPLEXCASE == 1
     782             :                               MPI_COMPLEX_EXPLICIT_PRECISION,  &
     783             : #endif
     784           0 :                               my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
     785             :               endif
     786           0 :               if (wantDebug) call obj%timer%stop("mpi_communication")
     787             : #else /* WITH_MPI */
     788           0 :               if (istep>=max_threads .and. ne <= na) then
     789           0 :                 ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
     790             :               endif
     791             : #endif /* WITH_MPI */
     792             :             else
     793             :               ! We are at the end of all blocks
     794             : 
     795             :               ! Send last HH Vector and TAU to next PE if it has been calculated above
     796           0 :               ne = na_s + nblocks*nb - (max_threads-1) - 1
     797           0 :               if (istep>=max_threads .and. ne < na) then
     798             : #ifdef WITH_MPI
     799           0 :                 if (wantDebug) call obj%timer%start("mpi_communication")
     800           0 :                 call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
     801           0 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
     802             : #endif
     803           0 :                 hv_s(1) = tau_t(max_threads)
     804           0 :                 hv_s(2:) = hv_t(2:,max_threads)
     805             : 
     806             : #ifdef WITH_MPI
     807           0 :                 if (wantDebug) call obj%timer%start("mpi_communication")
     808             :                 call mpi_isend(hv_s, nb,     &
     809             : #if REALCASE == 1
     810             :                    MPI_REAL_PRECISION, &
     811             : #endif
     812             : #if COMPLEXCASE == 1
     813             :                                MPI_COMPLEX_EXPLICIT_PRECISION, &
     814             : #endif
     815           0 :                                my_pe+1, 2, communicator, ireq_hv, mpierr)
     816           0 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
     817             : 
     818             : #endif /* WITH_MPI */
     819             :               endif
     820             : 
     821             :               ! "Send" HH Vector and TAU to next OpenMP thread
     822           0 :               do my_thread = max_threads, 2, -1
     823           0 :                 hv_t(:,my_thread) = hv_t(:,my_thread-1)
     824           0 :                 tau_t(my_thread)  = tau_t(my_thread-1)
     825             :               enddo
     826             : 
     827             :             endif
     828             :           enddo ! iter
     829             : 
     830             :         else
     831             : 
     832             :           ! Codepath for 1 thread without OpenMP
     833             : 
     834             :           ! The following code is structured in a way to keep waiting times for
     835             :           ! other PEs at a minimum, especially if there is only one block.
     836             :           ! For this reason, it requests the last column as late as possible
     837             :           ! and sends the Householder Vector and the first column as early
     838             :           ! as possible.
     839             : 
     840             : #endif /* WITH_OPENMP */
     841             : 
     842    37016424 :           do iblk=1,nblocks
     843    33908016 :             ns = na_s + (iblk-1)*nb - n_off ! first column in block
     844    33908016 :             ne = ns+nb-1                    ! last column in block
     845             : 
     846    33908016 :             if (ns+n_off>na) exit
     847             : 
     848             :             ! Store Householder Vector for back transformation
     849             : 
     850    28668720 :             hh_cnt(iblk) = hh_cnt(iblk) + 1
     851             : 
     852    28668720 :             hh_gath(1   ,hh_cnt(iblk),iblk) = tau
     853    28668720 :             hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
     854             : 
     855             : #ifndef WITH_OPENMP
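                      :             ! Once a complete chunk of Householder vectors for this block has been
                      :             ! gathered (chunk size given by snd_limits), send it to the process row
                      :             ! that posted the matching mpi_irecv into hh_trans further above.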
     856    14334360 :             if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
     857             :               ! Wait for last transfer to finish
     858             : #ifdef WITH_MPI
     859       69204 :               if (wantDebug) call obj%timer%start("mpi_communication")
     860             : 
     861       69204 :               call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
     862       69204 :               if (wantDebug) call obj%timer%stop("mpi_communication")
     863             : #endif
     864             :               ! Copy vectors into send buffer
     865      107760 :               hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
     866             :               ! Send to destination
     867             : 
     868             : #ifdef WITH_MPI
     869       69204 :               if (wantDebug) call obj%timer%start("mpi_communication")
     870             :               call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk),  &
     871             : #if REALCASE == 1
     872             :                        MPI_REAL_PRECISION, &
     873             : #endif
     874             : #if COMPLEXCASE == 1
     875             :                              MPI_COMPLEX_EXPLICIT_PRECISION, &
     876             : #endif
     877             :                              global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
     878       69204 :                              10+iblk, communicator, ireq_hhs(iblk), mpierr)
     879       69204 :               if (wantDebug) call obj%timer%stop("mpi_communication")
     880             : #else /* WITH_MPI */
      881             :              ! do the postponed irecv here
     882       38556 :              startAddr = startAddr - hh_cnt(iblk)
     883       38556 :              hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
     884             : #endif /* WITH_MPI */
     885             : 
     886             :             ! Reset counter and increase destination row
     887      107760 :               hh_cnt(iblk) = 0
     888      107760 :               hh_dst(iblk) = hh_dst(iblk)+1
     889             :             endif
     890             : 
     891             :             ! The following code is structured in a way to keep waiting times for
     892             :             ! other PEs at a minimum, especially if there is only one block.
     893             :             ! For this reason, it requests the last column as late as possible
     894             :             ! and sends the Householder Vector and the first column as early
     895             :             ! as possible.
     896             : #endif /* WITH_OPENMP */
     897    28668720 :             nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
     898    28668720 :             nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
     899             :                                           ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
     900             : 
     901             :             ! Multiply diagonal block and subdiagonal block with Householder Vector
     902             : 
     903    28668720 :             if (iblk==nblocks .and. nc==nb) then
     904             : 
     905             :               ! We need the last column from the next PE.
     906             :               ! First do the matrix multiplications without last column ...
     907             : 
     908             :               ! Diagonal block, the contribution of the last element is added below!
     909     1664304 :               ab(1,ne) = 0.0_rck
     910     1664304 :               if (wantDebug) call obj%timer%start("blas")
     911             : 
     912             : #if REALCASE == 1
     913      971472 :               call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
     914             : #endif
     915             : #if COMPLEXCASE == 1
     916      692832 :               call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd,1)
     917             : #endif
     918             :               ! Subdiagonal block
     919     1664304 :               if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
     920     1664304 :               if (wantDebug) call obj%timer%stop("blas")
     921             : 
     922             :               ! ... then request last column ...
     923             : #ifdef WITH_MPI
     924     1664304 :               if (wantDebug) call obj%timer%start("mpi_communication")
     925             : #ifdef WITH_OPENMP
     926             :               call mpi_recv(ab(1,ne), nb+1,    &
     927             : #if REALCASE == 1
     928             :                       MPI_REAL_PRECISION, &
     929             : #endif
     930             : #if COMPLEXCASE == 1
     931             :                             MPI_COMPLEX_EXPLICIT_PRECISION,  &
     932             : #endif
     933      832152 :           my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
     934             : #else /* WITH_OPENMP */
     935             :               call mpi_recv(ab(1,ne), nb+1,     &
     936             : #if REALCASE == 1
     937             :                       MPI_REAL_PRECISION, &
     938             : #endif
     939             : #if COMPLEXCASE == 1
     940             :                             MPI_COMPLEX_EXPLICIT_PRECISION,  &
     941             : #endif
     942      832152 :                       my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
     943             : #endif /* WITH_OPENMP */
     944     1664304 :               if (wantDebug) call obj%timer%stop("mpi_communication")
     945             : #else /* WITH_MPI */
     946             : 
     947           0 :               ab(1:nb+1,ne) = ab_s(1:nb+1)
     948             : 
     949             : #endif /* WITH_MPI */
     950             : 
     951             :               ! ... and complete the result
     952     1664304 :               hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
     953     1664304 :               hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
     954             : 
     955             :             else
     956             : 
     957             :               ! Normal matrix multiply
     958    27004416 :               if (wantDebug) call obj%timer%start("blas")
     959             : #if REALCASE == 1
     960    10217760 :               call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
     961             : #endif
     962             : #if COMPLEXCASE == 1
     963    16786656 :               call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
     964             : #endif
     965    27004416 :               if (nr>0) call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
     966    27004416 :               if (wantDebug) call obj%timer%stop("blas")
     967             :             endif
     968             : 
     969             :             ! Calculate first column of subdiagonal block and calculate new
     970             :             ! Householder transformation for this column
     971    28668720 :             hv_new(:) = 0.0_rck ! Needed, last rows must be 0 for nr < nb
     972    28668720 :             tau_new = 0.0_rck
     973    28668720 :             if (nr>0) then
     974             : 
     975             :               ! complete (old) Householder transformation for first column
     976             : 
     977    22649952 :               ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
     978             : 
     979             :               ! calculate new Householder transformation ...
     980    22649952 :               if (nr>1) then
     981             : #if  REALCASE == 1
     982     7646976 :                 vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
     983             : #endif
     984             : #if COMPLEXCASE == 1
     985             : #ifdef DOUBLE_PRECISION_COMPLEX
     986    10884480 :                 vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2)
     987             : #else
     988     3995904 :                 vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2)
     989             : #endif
     990             : #endif /* COMPLEXCASE */
     991             : 
     992             : #if REALCASE == 1
     993             :                 call hh_transform_real_&
     994             : #endif
     995             : #if COMPLEXCASE == 1
     996             :                 call hh_transform_complex_&
     997             : #endif
     998             :                 &PRECISION &
     999    22527360 :                              (obj, ab(nb+1,ns), vnorm2, hf, tau_new, wantDebug)
    1000    22527360 :                 hv_new(1) = 1.0_rck
    1001    22527360 :                 hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
    1002    22527360 :                 ab(nb+2:,ns) = 0.0_rck
    1003             :               endif ! nr > 1
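The hh_transform_real_/hh_transform_complex_&PRECISION call above returns tau_new together with the scaling hf that normalises the reflector so that hv_new(1) == 1; its exact interface is not reproduced here. The standalone sketch below (program and variable names are illustrative assumptions) uses the standard LAPACK-style construction of such a reflector for a real vector and verifies that applying I - tau*v*v^T leaves only the first entry nonzero; vnorm2 is the same sum of squares of the trailing entries that is computed a few lines above.

    program householder_sketch
      implicit none
      integer, parameter :: n = 5
      real(kind=8) :: x(n), v(n), tau, alpha, vnorm2, beta
      ! example vector standing in for the first column of the subdiagonal block
      x = [4.0d0, 1.0d0, 2.0d0, 2.0d0, 0.5d0]
      alpha  = x(1)
      vnorm2 = sum(x(2:n)**2)          ! sum of squares of the entries to be annihilated
      beta   = -sign(sqrt(alpha**2 + vnorm2), alpha)
      tau    = (beta - alpha) / beta
      v(1)   = 1.0d0
      v(2:n) = x(2:n) / (alpha - beta) ! scaling of the trailing entries (role of hf)
      ! applying I - tau*v*v^T must leave (beta, 0, ..., 0)
      x = x - tau * v * dot_product(v, x)
      print '(5f10.6)', x
    end program householder_sketch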
    1004             : 
     1005             :               ! ... and send it away immediately if this is the last block
    1006             : 
    1007    22649952 :               if (iblk==nblocks) then
    1008             : #ifdef WITH_MPI
    1009     1648488 :                 if (wantDebug) call obj%timer%start("mpi_communication")
    1010             : #ifdef WITH_OPENMP
    1011      824244 :                 call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
    1012             : #else
    1013      824244 :                 call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
    1014             : #endif
    1015     1648488 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
    1016             : 
    1017             : #endif /* WITH_MPI */
    1018     1648488 :                 hv_s(1) = tau_new
    1019     1648488 :                 hv_s(2:) = hv_new(2:)
    1020             : 
    1021             : #ifdef WITH_MPI
    1022     1648488 :                 if (wantDebug) call obj%timer%start("mpi_communication")
    1023             :                 call mpi_isend(hv_s, nb,       &
    1024             : #if REALCASE == 1
    1025             :                    MPI_REAL_PRECISION, &
    1026             : #endif
    1027             : #if COMPLEXCASE == 1
    1028             :                                MPI_COMPLEX_EXPLICIT_PRECISION, &
    1029             : #endif
    1030     1648488 :              my_pe+1, 2, communicator, ireq_hv, mpierr)
    1031     1648488 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
    1032             : 
    1033             : #endif /* WITH_MPI */
    1034             :               endif
    1035             : 
    1036             :             endif
    1037             : 
    1038             :             ! Transform diagonal block
    1039             : #if REALCASE == 1
    1040    11189232 :             x = dot_product(hv(1:nc),hd(1:nc))*tau
    1041             : #endif
    1042             : #if COMPLEXCASE == 1
    1043    17479488 :             x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
    1044             : #endif
    1045    28668720 :             hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc)
    1046    28668720 :             if (my_pe>0 .and. iblk==1) then
    1047             : 
     1048             :               ! The first column of the diagonal block has to be sent to the previous PE
    1049             :               ! Calculate first column only ...
    1050             : #if REALCASE == 1
    1051      961944 :               ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1)
    1052             : #endif
    1053             : #if COMPLEXCASE == 1
    1054      686544 :               ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1))
    1055             : #endif
    1056             :               ! ... send it away ...
    1057             : #ifdef WITH_MPI
    1058     1648488 :               if (wantDebug) call obj%timer%start("mpi_communication")
    1059             : 
    1060             : #ifdef WITH_OPENMP
    1061      824244 :               call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
    1062             : #else
    1063      824244 :               call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
    1064             : #endif
    1065     1648488 :               if (wantDebug) call obj%timer%stop("mpi_communication")
    1066             : 
    1067             : #endif /* WITH_MPI */
    1068     1648488 :               ab_s(1:nb+1) = ab(1:nb+1,ns)
    1069             : 
    1070             : #ifdef WITH_MPI
    1071     1648488 :               if (wantDebug) call obj%timer%start("mpi_communication")
    1072             : 
    1073             :               call mpi_isend(ab_s, nb+1,    &
    1074             : #if REALCASE == 1
    1075             :                        MPI_REAL_PRECISION, &
    1076             : #endif
    1077             : #if COMPLEXCASE == 1
    1078             :                              MPI_COMPLEX_EXPLICIT_PRECISION, &
    1079             : #endif
    1080     1648488 :           my_pe-1, 1, communicator, ireq_ab, mpierr)
    1081     1648488 :               if (wantDebug) call obj%timer%stop("mpi_communication")
    1082             : 
    1083             : #endif /* WITH_MPI */
    1084             :               ! ... and calculate remaining columns with rank-2 update
    1085     1648488 :               if (wantDebug) call obj%timer%start("blas")
    1086             : #if REALCASE == 1
    1087      961944 :               if (nc>1) call PRECISION_SYR2('L', nc-1, -ONE, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
    1088             : #endif
    1089             : #if COMPLEXCASE == 1
    1090      686544 :               if (nc>1) call PRECISION_HER2('L', nc-1, -ONE, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
    1091             : #endif
    1092     1648488 :               if (wantDebug) call obj%timer%stop("blas")
    1093             : 
    1094             :             else
     1095             :               ! No need to send, just a rank-2 update
    1096    27020232 :               if (wantDebug) call obj%timer%start("blas")
    1097             : #if REALCASE == 1
    1098    10227288 :               call PRECISION_SYR2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
    1099             : #endif
    1100             : #if COMPLEXCASE == 1
    1101    16792944 :               call PRECISION_HER2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
    1102             : #endif
    1103    27020232 :               if (wantDebug) call obj%timer%stop("blas")
    1104             : 
    1105             :             endif
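Both branches above apply the same two-sided transformation (I - tau*hv*hv^H) A (I - tau*hv*hv^H) to the diagonal block: hd already contains tau*A*hv, and the correction hd = hd - 0.5*x*hv folds the hv^H*A*hv term in, so the whole update collapses to one symmetric/Hermitian rank-2 update (SYR2/HER2). A small real-valued sketch with assumed test data, checking that the rank-2 form reproduces the explicit two-sided product:

    program rank2_update_sketch
      implicit none
      integer, parameter :: n = 4
      real(kind=8) :: a(n,n), h(n,n), full(n,n), hv(n), hd(n), tau, x
      integer :: i, j
      ! small symmetric test matrix and an arbitrary reflector-like vector
      do j = 1, n
        do i = 1, n
          a(i,j) = 1.0d0 / (i + j)
        end do
      end do
      hv  = [1.0d0, 0.5d0, -0.25d0, 2.0d0]
      tau = 0.3d0
      ! explicit two-sided update: (I - tau*hv*hv^T) * A * (I - tau*hv*hv^T)
      h = -tau * spread(hv, 2, n) * spread(hv, 1, n)
      do i = 1, n
        h(i,i) = h(i,i) + 1.0d0
      end do
      full = matmul(h, matmul(a, h))
      ! same update in rank-2 form, mirroring the template:
      !   hd = tau*A*hv;  x = tau*hv^T*hd;  hd = hd - 0.5*x*hv;  A = A - hd*hv^T - hv*hd^T
      hd = tau * matmul(a, hv)
      x  = dot_product(hv, hd) * tau
      hd = hd - 0.5d0 * x * hv
      a  = a - spread(hd,2,n)*spread(hv,1,n) - spread(hv,2,n)*spread(hd,1,n)
      print *, 'max difference:', maxval(abs(a - full))
    end program rank2_update_sketch

The template does the same thing on the band-stored block ab(1,ns) with leading dimension 2*nb-1, and computes only the first column explicitly when it has to be sent to the previous PE.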
    1106             : 
    1107             :             ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb
    1108             : 
    1109    28668720 :             if (nr>0) then
    1110    22649952 :               if (nr>1) then
    1111    22527360 :                 if (wantDebug) call obj%timer%start("blas")
    1112             :                 call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, &
    1113    22527360 :                                     hv_new, 1, ZERO, h(2), 1)
    1114    22527360 :                 if (wantDebug) call obj%timer%stop("blas")
    1115             : 
    1116    22527360 :                 x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
    1117    22527360 :                 h(2:nb) = h(2:nb) - x*hv(2:nb)
     1118             :                 ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank-2 update
    1119   954298368 :                 do i=2,nb
    1120             : #if REALCASE == 1
    1121   473299200 :                   ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i)
    1122             : #endif
    1123             : #if COMPLEXCASE == 1
    1124   458471808 :                   ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
    1125             : #endif
    1126             :                 enddo
    1127             :               else
    1128             :                 ! No double Householder transformation for nr=1, just complete the row
    1129     5360640 :                 do i=2,nb
    1130             : #if REALCASE == 1
    1131     3032640 :                   ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
    1132             : #endif
    1133             : #if COMPLEXCASE == 1
    1134     2205408 :                   ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
    1135             : #endif
    1136             :                 enddo
    1137             :               endif
    1138             :             endif
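As the comment inside the loop notes, there is no SYR2-like BLAS routine for the general (nonsymmetric) rank-2 update A = A - hv_new*h^H - hs*hv^H, so the template applies it column by column with the band-storage shift 2+nb-i. A dense, real-valued sketch (test data and names assumed) showing that the column loop and the rank-2 form agree; with dense storage this could also be written as two GER calls with alpha = -1:

    program nonsym_rank2_sketch
      implicit none
      integer, parameter :: nr = 3, nb = 4
      real(kind=8) :: a(nr,nb), b(nr,nb), hv_new(nr), hs(nr), h(nb), hv(nb)
      integer :: i
      ! arbitrary data standing in for the subdiagonal block and the four vectors
      a = reshape([(dble(i), i = 1, nr*nb)], [nr, nb])
      b = a
      hv_new = [1.0d0, 0.5d0, -1.0d0]
      hs     = [0.2d0, 0.4d0, 0.8d0]
      h      = [0.0d0, 1.0d0, 2.0d0, 3.0d0]
      hv     = [1.0d0, -0.5d0, 0.25d0, 0.125d0]
      ! column-by-column form, as in the template's do-loop (dense storage here)
      do i = 2, nb
        a(:,i) = a(:,i) - hv_new*h(i) - hs*hv(i)
      end do
      ! equivalent dense formulation: two general rank-1 updates
      b(:,2:nb) = b(:,2:nb) - spread(hv_new,2,nb-1)*spread(h(2:nb),1,nr) &
                            - spread(hs,2,nb-1)*spread(hv(2:nb),1,nr)
      print *, 'max difference:', maxval(abs(a - b))
    end program nonsym_rank2_sketch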
    1139             : 
    1140             :             ! Use new HH Vector for the next block
    1141    28668720 :             hv(:) = hv_new(:)
    1142    28668720 :             tau = tau_new
    1143             : 
    1144             :           enddo
    1145             : 
    1146             : #ifdef WITH_OPENMP
    1147             :         endif
    1148             : #endif
    1149             : 
     1150             : #ifdef WITH_OPENMP
    1151    18167988 :         do iblk = 1, nblocks
    1152             : 
    1153    16613784 :           if (hh_dst(iblk) >= np_rows) exit
    1154    14735256 :           if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit
    1155             : 
    1156    14334360 :           if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
    1157             :             ! Wait for last transfer to finish
    1158             : #ifdef WITH_MPI
    1159       69204 :             if (wantDebug) call obj%timer%start("mpi_communication")
    1160       69204 :             call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
    1161       69204 :             if (wantDebug) call obj%timer%stop("mpi_communication")
    1162             : #endif
    1163             :             ! Copy vectors into send buffer
    1164      107760 :             hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
    1165             :             ! Send to destination
    1166             : 
    1167             : #ifdef WITH_MPI
    1168       69204 :             if (wantDebug) call obj%timer%start("mpi_communication")
    1169             :             call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk),    &
    1170             : #if REALCASE == 1
    1171             :                      MPI_REAL_PRECISION, &
    1172             : #endif
    1173             : #if COMPLEXCASE == 1
    1174             :                            MPI_COMPLEX_EXPLICIT_PRECISION, &
    1175             : #endif
    1176             :                            global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
    1177       69204 :                            10+iblk, communicator, ireq_hhs(iblk), mpierr)
    1178       69204 :             if (wantDebug) call obj%timer%stop("mpi_communication")
    1179             : #else /* WITH_MPI */
     1180             :             ! do the postponed irecv here
    1181       38556 :             startAddr = startAddr - hh_cnt(iblk)
    1182       38556 :             hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
    1183             : #endif /* WITH_MPI */
    1184             : 
    1185             :             ! Reset counter and increase destination row
    1186      107760 :             hh_cnt(iblk) = 0
    1187      107760 :             hh_dst(iblk) = hh_dst(iblk)+1
    1188             :           endif
    1189             : 
    1190             :         enddo
    1191             : #endif /* WITH_OPENMP */
    1192             :       enddo ! istep
    1193             : 
    1194             :       ! Finish the last outstanding requests
    1195             : 
    1196             : #ifdef WITH_OPENMP
    1197             : 
    1198             : #ifdef WITH_MPI
    1199       15816 :       if (wantDebug) call obj%timer%start("mpi_communication")
    1200       15816 :       call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
    1201       15816 :       call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
    1202             : 
    1203             : !      allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
    1204             : !      if (istat .ne. 0) then
    1205             : !        print *,"tridiag_band_real: error when allocating mpi_statuses"//errorMessage
    1206             : !        stop 1
    1207             : !      endif
    1208             : 
    1209       15816 :       call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
    1210       15816 :       call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
    1211             : !      deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
    1212             : !      if (istat .ne. 0) then
    1213             : !        print *,"tridiag_band_real: error when deallocating mpi_statuses"//errorMessage
    1214             : !        stop 1
    1215             : !      endif
    1216       15816 :       if (wantDebug) call obj%timer%stop("mpi_communication")
    1217             : #endif /* WITH_MPI */
    1218             : 
    1219             : #else /* WITH_OPENMP */
    1220             : 
    1221             : #ifdef WITH_MPI
    1222       15816 :       if (wantDebug) call obj%timer%start("mpi_communication")
    1223       15816 :       call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
    1224       15816 :       call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
    1225             : 
    1226       15816 :       call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
    1227       15816 :       call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
    1228       15816 :       if (wantDebug) call obj%timer%stop("mpi_communication")
    1229             : #endif
    1230             : 
    1231             : #endif /* WITH_OPENMP */
    1232             : 
     1233             : #ifdef WITH_MPI
    1234       31632 :       if (wantDebug) call obj%timer%start("mpi_communication")
    1235       31632 :       call mpi_barrier(communicator,mpierr)
    1236       31632 :       if (wantDebug) call obj%timer%stop("mpi_communication")
    1237             : #endif
    1238       47448 :       deallocate(ab, stat=istat, errmsg=errorMessage)
    1239       47448 :       if (istat .ne. 0) then
    1240             :         print *,"tridiag_band_&
    1241             :                 &MATH_DATATYPE&
    1242           0 :                 &: error when deallocating ab"//errorMessage
    1243           0 :         stop 1
    1244             :       endif
    1245             : 
    1246       47448 :       deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
    1247       47448 :       if (istat .ne. 0) then
    1248             :         print *,"tridiag_band_&
    1249             :                  &MATH_DATATYPE&
    1250           0 :                  &: error when deallocating ireq_hhr, ireq_hhs"//errorMessage
    1251           0 :         stop 1
    1252             :       endif
    1253             : 
    1254       47448 :       deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
    1255       47448 :        if (istat .ne. 0) then
    1256             :          print *,"tridiag_band_&
    1257             :                  &MATH_DATATYPE&
    1258           0 :                  &: error when deallocating hh_cnt, hh_dst"//errorMessage
    1259           0 :          stop 1
    1260             :        endif
    1261             : 
    1262       47448 :       deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
    1263       47448 :        if (istat .ne. 0) then
    1264             :          print *,"tridiag_band_&
    1265             :                  &MATH_DATATYPE&
    1266           0 :                  &: error when deallocating hh_gath, hh_send"//errorMessage
    1267           0 :          stop 1
    1268             :        endif
    1269             : 
    1270       47448 :       deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
    1271       47448 :        if (istat .ne. 0) then
    1272             :          print *,"tridiag_band_&
    1273             :                  &MATH_DATATYPE&
     1274           0 :                  &: error when deallocating limits, snd_limits"//errorMessage
    1275           0 :          stop 1
    1276             :        endif
    1277             : 
    1278       47448 :       deallocate(block_limits, stat=istat, errmsg=errorMessage)
    1279       47448 :        if (istat .ne. 0) then
    1280             :          print *,"tridiag_band_&
    1281             :                  &MATH_DATATYPE&
    1282           0 :                  &: error when deallocating block_limits"//errorMessage
    1283           0 :          stop 1
    1284             :        endif
    1285             : 
    1286       47448 :       deallocate(global_id, stat=istat, errmsg=errorMessage)
    1287       47448 :        if (istat .ne. 0) then
    1288             :          print *,"tridiag_band_&
    1289             :                   &MATH_DATATYPE&
     1290           0 :                   &: error when deallocating global_id"//errorMessage
    1291           0 :          stop 1
    1292             :        endif
    1293             : 
    1294             :       call obj%timer%stop("tridiag_band_&
    1295             :       &MATH_DATATYPE&
    1296             :       &" // &
    1297             :       &PRECISION_SUFFIX&
    1298       47448 :       )
    1299             : 
     1300             : ! Intel compiler bug makes these ifdefs necessary
    1301             : #if REALCASE == 1
    1302             :     end subroutine tridiag_band_real_&
    1303             : #endif
    1304             : #if COMPLEXCASE == 1
    1305             :     end subroutine tridiag_band_complex_&
    1306             : #endif
    1307      189792 :     &PRECISION
    1308             : 

Generated by: LCOV version 1.12