LCOV - code coverage report
Current view: top level - src/elpa1 - elpa1_merge_systems_real_template.F90 (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53
Coverage:   Lines: 319 of 503 hit (63.4 %)   Functions: 120 of 152 hit (78.9 %)

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
       12             : !      Schwerpunkt Wissenschaftliches Rechnen,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
       14             : !    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
       15             : !      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !    This particular source code file contains additions, changes and
      20             : !    enhancements authored by Intel Corporation which is not part of
      21             : !    the ELPA consortium.
      22             : !
      23             : !    More information can be found here:
      24             : !    http://elpa.mpcdf.mpg.de/
      25             : !
      26             : !    ELPA is free software: you can redistribute it and/or modify
      27             : !    it under the terms of the version 3 of the license of the
      28             : !    GNU Lesser General Public License as published by the Free
      29             : !    Software Foundation.
      30             : !
      31             : !    ELPA is distributed in the hope that it will be useful,
      32             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      33             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      34             : !    GNU Lesser General Public License for more details.
      35             : !
      36             : !    You should have received a copy of the GNU Lesser General Public License
      37             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      38             : !
      39             : !    ELPA reflects a substantial effort on the part of the original
      40             : !    ELPA consortium, and we ask you to respect the spirit of the
      41             : !    license that we chose: i.e., please contribute any changes you
      42             : !    may have back to the original ELPA library distribution, and keep
      43             : !    any derivatives of ELPA under the same license that we chose for
      44             : !    the original distribution, the GNU Lesser General Public License.
      45             : !
      46             : !
      47             : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
      48             : !
      49             : ! Copyright of the original code rests with the authors inside the ELPA
      50             : ! consortium. The copyright of any additional modifications shall rest
      51             : ! with their original authors, but shall adhere to the licensing terms
      52             : ! distributed along with the original code in the file "COPYING".
      53             : #endif
      54             : 
      55             : #include "../general/sanity.F90"
      56             : 
      57             :     subroutine merge_systems_&
      58       39504 :     &PRECISION &
      59       79008 :                          (obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
      60       79008 :                           l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success)
      61             :       use cuda_functions
      62             :       use iso_c_binding
      63             :       use precision
      64             :       use elpa_abstract_impl
      65             :       implicit none
      66             : #include "../general/precision_kinds.F90"
      67             :       class(elpa_abstract_impl_t), intent(inout) :: obj
      68             :       integer(kind=ik), intent(in)               :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
      69             :                                                     mpi_comm_cols, npc_0, npc_n
      70             :       integer(kind=ik), intent(in)                :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
      71             :       real(kind=REAL_DATATYPE), intent(inout)     :: d(na), e
      72             : #ifdef USE_ASSUMED_SIZE
      73             :       real(kind=REAL_DATATYPE), intent(inout)     :: q(ldq,*)
      74             : #else
      75             :       real(kind=REAL_DATATYPE), intent(inout)     :: q(ldq,matrixCols)
      76             : #endif
      77             :       logical, intent(in)           :: useGPU, wantDebug
      78             :       logical, intent(out)          :: success
      79             : 
       80             :       ! TODO: play with max_strip. If it were larger, the matrices being multiplied
      81             :       ! might be larger as well!
      82             :       integer(kind=ik), parameter   :: max_strip=128
      83             : 
      84             :       real(kind=REAL_DATATYPE)                 :: PRECISION_LAMCH, PRECISION_LAPY2
      85             :       real(kind=REAL_DATATYPE)                 :: beta, sig, s, c, t, tau, rho, eps, tol, &
      86             :                                        qtrans(2,2), dmax, zmax, d1new, d2new
      87      316032 :       real(kind=REAL_DATATYPE)                 :: z(na), d1(na), d2(na), z1(na), delta(na),  &
      88      237024 :                                        dbase(na), ddiff(na), ev_scale(na), tmp(na)
      89      158016 :       real(kind=REAL_DATATYPE)                 :: d1u(na), zu(na), d1l(na), zl(na)
      90       79008 :       real(kind=REAL_DATATYPE), allocatable    :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
      91             : #ifdef WITH_OPENMP
      92       19752 :       real(kind=REAL_DATATYPE), allocatable    :: z_p(:,:)
      93             : #endif
      94             : 
      95             :       integer(kind=ik)              :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
      96             :                                        l_rqm, ns, info
      97             :       integer(kind=ik)              :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
      98             :                                        l_cols_qreorg, np, l_idx, nqcols1, nqcols2
      99             :       integer(kind=ik)              :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
     100             :                                        np_cols, mpierr
     101             :       integer(kind=ik)              :: np_next, np_prev, np_rem
     102       79008 :       integer(kind=ik)              :: idx(na), idx1(na), idx2(na)
     103      158016 :       integer(kind=ik)              :: coltyp(na), idxq1(na), idxq2(na)
     104             : 
     105             :       integer(kind=ik)              :: istat
     106             :       character(200)                :: errorMessage
     107             :       integer(kind=ik)              :: gemm_dim_k, gemm_dim_l, gemm_dim_m
     108             : 
     109             :       integer(kind=C_intptr_T)                        :: qtmp1_dev, qtmp2_dev, ev_dev
     110             :       logical                                         :: successCUDA
     111             :       integer(kind=c_intptr_t), parameter             :: size_of_datatype = size_of_&
     112             :                                                                           &PRECISION&
     113             :                                                                           &_real
     114             : #ifdef WITH_OPENMP
     115             :       integer(kind=ik)              :: max_threads, my_thread
     116             :       integer(kind=ik)              :: omp_get_max_threads, omp_get_thread_num
     117             : 
     118             : 
     119       19752 :       max_threads = omp_get_max_threads()
     120             : 
     121       19752 :       allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
     122       19752 :       if (istat .ne. 0) then
     123           0 :         print *,"merge_systems: error when allocating z_p "//errorMessage
     124           0 :         stop 1
     125             :       endif
     126             : #endif
     127             : 
     128       39504 :       call obj%timer%start("merge_systems" // PRECISION_SUFFIX)
     129       39504 :       success = .true.
     130       39504 :       call obj%timer%start("mpi_communication")
     131       39504 :       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
     132       39504 :       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
     133       39504 :       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
     134       39504 :       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
     135       39504 :       call obj%timer%stop("mpi_communication")
     136             : 
     137             :       ! If my processor column isn't in the requested set, do nothing
     138             : 
     139       39504 :       if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
     140           0 :         call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
     141           0 :         return
     142             :       endif
      143             :       ! Determine the "next" and "prev" processor column numbers for the ring sends
     144             : 
     145       39504 :       if (my_pcol == npc_0+npc_n-1) then
     146       39504 :         np_next = npc_0
     147             :       else
     148           0 :         np_next = my_pcol + 1
     149             :       endif
     150             : 
     151       39504 :       if (my_pcol == npc_0) then
     152       39504 :         np_prev = npc_0+npc_n-1
     153             :       else
     154           0 :         np_prev = my_pcol - 1
     155             :       endif
     156             :       call check_monotony_&
     157             :       &PRECISION&
     158       39504 :       &(obj, nm,d,'Input1',wantDebug, success)
     159       39504 :       if (.not.(success)) then
     160           0 :         call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
     161           0 :         return
     162             :       endif
     163             :       call check_monotony_&
     164             :       &PRECISION&
     165       39504 :       &(obj,na-nm,d(nm+1),'Input2',wantDebug, success)
     166       39504 :       if (.not.(success)) then
     167           0 :         call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
     168           0 :         return
     169             :       endif
     170             :       ! Get global number of processors and my processor number.
     171             :       ! Please note that my_proc does not need to match any real processor number,
     172             :       ! it is just used for load balancing some loops.
     173             : 
     174       39504 :       n_procs = np_rows*npc_n
     175       39504 :       my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
     176             : 
     177             : 
     178             :       ! Local limits of the rows of Q
     179             : 
     180       39504 :       l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
     181       39504 :       l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
     182       39504 :       l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
     183             : 
     184       39504 :       l_rnm  = l_rqm-l_rqs+1 ! Number of local rows <= nm
     185       39504 :       l_rows = l_rqe-l_rqs+1 ! Total number of local rows
     186             : 
     187             : 
     188             :       ! My number of local columns
     189             : 
     190       39504 :       l_cols = COUNT(p_col(1:na)==my_pcol)
     191             : 
     192             :       ! Get max number of local columns
     193             : 
     194       39504 :       max_local_cols = 0
     195       79008 :       do np = npc_0, npc_0+npc_n-1
     196       39504 :         max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np))
     197             :       enddo
     198             : 
     199             :       ! Calculations start here
     200             : 
     201       39504 :       beta = abs(e)
     202       39504 :       sig  = sign(1.0_rk,e)
     203             : 
     204             :       ! Calculate rank-1 modifier z
     205             : 
     206       39504 :       z(:) = 0
     207             : 
     208       39504 :       if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then
     209             :         ! nm is local on my row
     210     5198376 :         do i = 1, na
     211     5178000 :           if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
     212             :          enddo
     213             :       endif
     214             : 
     215       39504 :       if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then
     216             :         ! nm+1 is local on my row
     217     5198376 :         do i = 1, na
     218     5178000 :           if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
     219             :         enddo
     220             :       endif
     221             : 
     222             :       call global_gather_&
     223             :       &PRECISION&
     224       39504 :       &(obj, z, na)
     225             :       ! Normalize z so that norm(z) = 1.  Since z is the concatenation of
     226             :       ! two normalized vectors, norm2(z) = sqrt(2).
     227       39504 :       z = z/sqrt(2.0_rk)
     228       39504 :       rho = 2.0_rk*beta
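                      :       ! After this normalization the merged problem is equivalent to the rank-1
                      :       ! modified diagonal system diag(d) + rho*z*z^T (with rho = 2*|e| and
                      :       ! norm(z) = 1); its deflation and secular equation are handled below.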
     229             :       ! Calculate index for merging both systems by ascending eigenvalues
     230       39504 :       call obj%timer%start("blas")
     231       39504 :       call PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx )
     232       39504 :       call obj%timer%stop("blas")
     233             : 
     234             : ! Calculate the allowable deflation tolerance
     235             : 
     236       39504 :       zmax = maxval(abs(z))
     237       39504 :       dmax = maxval(abs(d))
     238       39504 :       EPS = PRECISION_LAMCH( 'Epsilon' )
     239       39504 :       TOL = 8.0_rk*EPS*MAX(dmax,zmax)
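                      :       ! (essentially the same deflation threshold as used by LAPACK's xLAED2)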
     240             : 
     241             :       ! If the rank-1 modifier is small enough, no more needs to be done
     242             :       ! except to reorganize D and Q
     243             : 
     244       39504 :       IF ( RHO*zmax <= TOL ) THEN
     245             : 
     246             :         ! Rearrange eigenvalues
     247             : 
     248           0 :         tmp = d
     249           0 :         do i=1,na
     250           0 :           d(i) = tmp(idx(i))
     251             :         enddo
     252             : 
     253             :         ! Rearrange eigenvectors
     254             :         call resort_ev_&
     255             :         &PRECISION &
     256           0 :                        (obj, idx, na)
     257             : 
     258           0 :         call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
     259             : 
     260           0 :         return
     261             :       ENDIF
     262             : 
     263             :       ! Merge and deflate system
     264             : 
     265       39504 :       na1 = 0
     266       39504 :       na2 = 0
     267             : 
     268             :       ! COLTYP:
     269             :       ! 1 : non-zero in the upper half only;
     270             :       ! 2 : dense;
     271             :       ! 3 : non-zero in the lower half only;
     272             :       ! 4 : deflated.
     273             : 
     274       39504 :       coltyp(1:nm) = 1
     275       39504 :       coltyp(nm+1:na) = 3
     276             : 
     277     9147504 :       do i=1,na
     278             : 
     279     9108000 :         if (rho*abs(z(idx(i))) <= tol) then
     280             : 
     281             :           ! Deflate due to small z component.
     282             : 
     283      919536 :           na2 = na2+1
     284      919536 :           d2(na2)   = d(idx(i))
     285      919536 :           idx2(na2) = idx(i)
     286      919536 :           coltyp(idx(i)) = 4
     287             : 
     288     8188464 :         else if (na1>0) then
     289             : 
     290             :           ! Check if eigenvalues are close enough to allow deflation.
     291             : 
     292     8148960 :           S = Z(idx(i))
     293     8148960 :           C = Z1(na1)
     294             : 
     295             :           ! Find sqrt(a**2+b**2) without overflow or
     296             :           ! destructive underflow.
     297     8148960 :           TAU = PRECISION_LAPY2( C, S )
     298     8148960 :           T = D1(na1) - D(idx(i))
     299     8148960 :           C = C / TAU
     300     8148960 :           S = -S / TAU
     301     8148960 :           IF ( ABS( T*C*S ) <= TOL ) THEN
     302             : 
     303             :             ! Deflation is possible.
     304             : 
     305       45024 :             na2 = na2+1
     306             : 
     307       45024 :             Z1(na1) = TAU
     308             : 
     309       45024 :             d2new = D(idx(i))*C**2 + D1(na1)*S**2
     310       45024 :             d1new = D(idx(i))*S**2 + D1(na1)*C**2
     311             : 
     312             :             ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0
     313             :             ! This means that after the above transformation it must be
     314             :             !    D1(na1) <= d1new <= D(idx(i))
     315             :             !    D1(na1) <= d2new <= D(idx(i))
     316             :             !
     317             :             ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))
     318             :             ! so there is no problem with sorting here.
     319             :             ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)
     320             :             ! which makes a check (and possibly a resort) necessary.
     321             :             !
     322             :             ! The above relations may not hold exactly due to numeric differences
      323             :             ! so they have to be enforced to avoid problems with the sorting.
     324             : 
     325             : 
     326       45024 :             if (d1new<D1(na1)  ) d1new = D1(na1)
     327       45024 :             if (d1new>D(idx(i))) d1new = D(idx(i))
     328             : 
     329       45024 :             if (d2new<D1(na1)  ) d2new = D1(na1)
     330       45024 :             if (d2new>D(idx(i))) d2new = D(idx(i))
     331             : 
     332       45024 :             D1(na1) = d1new
     333             : 
     334       45024 :             do j=na2-1,1,-1
     335       45024 :               if (d2new<d2(j)) then
     336           0 :                 d2(j+1)   = d2(j)
     337           0 :                 idx2(j+1) = idx2(j)
     338             :               else
     339       45024 :                 exit ! Loop
     340             :               endif
     341             :             enddo
     342             : 
     343       45024 :             d2(j+1)   = d2new
     344       45024 :             idx2(j+1) = idx(i)
     345             : 
     346       45024 :             qtrans(1,1) = C; qtrans(1,2) =-S
     347       45024 :             qtrans(2,1) = S; qtrans(2,2) = C
     348             :             call transform_columns_&
     349             :             &PRECISION &
     350       45024 :                         (obj, idx(i), idx1(na1))
     351       45024 :             if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
     352       45024 :             if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
     353             : 
     354       45024 :             coltyp(idx(i)) = 4
     355             : 
     356             :           else
     357     8103936 :             na1 = na1+1
     358     8103936 :             d1(na1) = d(idx(i))
     359     8103936 :             z1(na1) = z(idx(i))
     360     8103936 :             idx1(na1) = idx(i)
     361             :           endif
     362             :         else
     363       39504 :           na1 = na1+1
     364       39504 :           d1(na1) = d(idx(i))
     365       39504 :           z1(na1) = z(idx(i))
     366       39504 :           idx1(na1) = idx(i)
     367             :         endif
     368             : 
     369             :       enddo
     370             :       call check_monotony_&
     371             :       &PRECISION&
     372       39504 :       &(obj, na1,d1,'Sorted1', wantDebug, success)
     373       39504 :       if (.not.(success)) then
     374           0 :         call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
     375           0 :         return
     376             :       endif
     377             :       call check_monotony_&
     378             :       &PRECISION&
     379       39504 :       &(obj, na2,d2,'Sorted2', wantDebug, success)
     380       39504 :       if (.not.(success)) then
     381           0 :         call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
     382           0 :         return
     383             :       endif
     384             : 
     385       39504 :       if (na1==1 .or. na1==2) then
     386             :         ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid
     387             : 
     388           0 :         if (na1==1) then
     389           0 :           d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
     390             :         else ! na1==2
     391           0 :           call obj%timer%start("blas")
     392           0 :           call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
     393           0 :           call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
     394           0 :           call obj%timer%stop("blas")
     395             :           call transform_columns_&
     396             :           &PRECISION&
     397           0 :           &(obj, idx1(1), idx1(2))
     398             :         endif
     399             : 
     400             :         ! Add the deflated eigenvalues
     401           0 :         d(na1+1:na) = d2(1:na2)
     402             : 
     403             :         ! Calculate arrangement of all eigenvalues  in output
     404           0 :         call obj%timer%start("blas")
     405           0 :         call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
     406           0 :         call obj%timer%stop("blas")
     407             :         ! Rearrange eigenvalues
     408             : 
     409           0 :         tmp = d
     410           0 :         do i=1,na
     411           0 :           d(i) = tmp(idx(i))
     412             :         enddo
     413             : 
     414             :         ! Rearrange eigenvectors
     415             : 
     416           0 :         do i=1,na
     417           0 :           if (idx(i)<=na1) then
     418           0 :             idxq1(i) = idx1(idx(i))
     419             :           else
     420           0 :             idxq1(i) = idx2(idx(i)-na1)
     421             :           endif
     422             :         enddo
     423             :         call resort_ev_&
     424             :         &PRECISION&
     425           0 :         &(obj, idxq1, na)
     426       39504 :       else if (na1>2) then
     427             : 
     428             :         ! Solve secular equation
     429             : 
     430       39504 :         z(1:na1) = 1
     431             : #ifdef WITH_OPENMP
     432       19752 :         z_p(1:na1,:) = 1
     433             : #endif
     434       39504 :         dbase(1:na1) = 0
     435       39504 :         ddiff(1:na1) = 0
     436             : 
     437       39504 :         info = 0
     438             : #ifdef WITH_OPENMP
     439             : 
     440       19752 :         call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
     441             : 
     442       39504 : !$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
     443       19752 :         my_thread = omp_get_thread_num()
     444       19752 : !$OMP DO
     445             : #endif
     446     2325804 :         DO i = my_proc+1, na1, n_procs ! work distributed over all processors
     447     4612104 :           call obj%timer%start("blas")
     448     4612104 :           call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
     449     4612104 :           call obj%timer%stop("blas")
     450     4612104 :           if (info/=0) then
     451             :             ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
     452             :             ! use the more stable bisection algorithm in solve_secular_equation
     453             :             ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
     454             :             call solve_secular_equation_&
     455             :             &PRECISION&
     456           0 :             &(obj, na1, i, d1, z1, delta, rho, s)
     457             :           endif
     458             : 
     459             :           ! Compute updated z
     460             : 
     461             : #ifdef WITH_OPENMP
     462  1109017344 :           do j=1,na1
     463  2211116532 :             if (i/=j)  z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
     464             :           enddo
     465     2306052 :           z_p(i,my_thread) = z_p(i,my_thread)*delta(i)
     466             : #else
     467  1109017344 :           do j=1,na1
     468  1106711292 :             if (i/=j)  z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
     469             :           enddo
     470     2306052 :           z(i) = z(i)*delta(i)
     471             : #endif
     472             :           ! store dbase/ddiff
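                      :           ! dbase(i) is the original pole d1(.) closest to the new eigenvalue
                      :           ! lambda_i and ddiff(i) = dbase(i) - lambda_i. Keeping this pair instead
                      :           ! of lambda_i itself lets d1(j) - lambda_i be formed later as
                      :           ! (d1(j) - dbase(i)) + ddiff(i) without cancellation.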
     473             : 
     474     4612104 :           if (i<na1) then
     475     4591728 :             if (abs(delta(i+1)) < abs(delta(i))) then
     476     2147352 :               dbase(i) = d1(i+1)
     477     2147352 :               ddiff(i) = delta(i+1)
     478             :             else
     479     2444376 :               dbase(i) = d1(i)
     480     2444376 :               ddiff(i) = delta(i)
     481             :             endif
     482             :           else
     483       20376 :             dbase(i) = d1(i)
     484       20376 :             ddiff(i) = delta(i)
     485             :           endif
     486             :         enddo
     487             : #ifdef WITH_OPENMP
     488             : !$OMP END PARALLEL
     489             : 
     490       19752 :         call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
     491             : 
     492       39504 :         do i = 0, max_threads-1
     493       19752 :           z(1:na1) = z(1:na1)*z_p(1:na1,i)
     494             :         enddo
     495             : #endif
     496             : 
     497             :         call global_product_&
     498             :         &PRECISION&
     499       39504 :         (obj, z, na1)
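                      :         ! After the global product, z(j) = prod_i (d1(j)-lambda_i) / prod_{i/=j} (d1(j)-d1(i)),
                      :         ! which is non-positive by interlacing. SIGN(SQRT(-z),z1) therefore rebuilds
                      :         ! the rank-1 vector consistently with the computed eigenvalues, so that the
                      :         ! eigenvectors formed below are numerically orthogonal (in the spirit of xLAED3).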
     500       39504 :         z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) )
     501             : 
     502             :         call global_gather_&
     503             :         &PRECISION&
     504       39504 :         &(obj, dbase, na1)
     505             :         call global_gather_&
     506             :         &PRECISION&
     507       39504 :         &(obj, ddiff, na1)
     508       39504 :         d(1:na1) = dbase(1:na1) - ddiff(1:na1)
     509             : 
     510             :         ! Calculate scale factors for eigenvectors
     511       39504 :         ev_scale(:) = 0.0_rk
     512             : 
     513             : #ifdef WITH_OPENMP
     514             : 
     515       19752 :         call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
     516             : 
     517             : !$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs,  &
     518             : !$OMP d1,dbase, ddiff, z, ev_scale, obj) &
     519     2325804 : !$OMP DEFAULT(NONE)
     520             : 
     521             : #endif
     522     2325804 :         DO i = my_proc+1, na1, n_procs ! work distributed over all processors
     523             : 
     524             :           ! tmp(1:na1) = z(1:na1) / delta(1:na1,i)  ! original code
     525             :           ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
     526             : 
     527             :           ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
     528             :           ! in exactly this order, but we want to prevent compiler optimization
     529             : !         ev_scale_val = ev_scale(i)
     530             :           call add_tmp_&
     531             :           &PRECISION&
     532     4651608 :           &(obj, d1, dbase, ddiff, z, ev_scale(i), na1,i)
     533             : !         ev_scale(i) = ev_scale_val
     534             :         enddo
     535             : #ifdef WITH_OPENMP
     536             : !$OMP END PARALLEL DO
     537             : 
     538       19752 :         call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
     539             : 
     540             : #endif
     541             : 
     542             :         call global_gather_&
     543             :         &PRECISION&
     544       39504 :         &(obj, ev_scale, na1)
     545             :         ! Add the deflated eigenvalues
     546       39504 :         d(na1+1:na) = d2(1:na2)
     547             : 
     548       39504 :         call obj%timer%start("blas")
     549             :         ! Calculate arrangement of all eigenvalues  in output
     550       39504 :         call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
     551       39504 :         call obj%timer%stop("blas")
     552             :         ! Rearrange eigenvalues
     553       39504 :         tmp = d
     554     9147504 :         do i=1,na
     555     9108000 :           d(i) = tmp(idx(i))
     556             :         enddo
     557             :         call check_monotony_&
     558             :         &PRECISION&
     559       39504 :         &(obj, na,d,'Output', wantDebug, success)
     560             : 
     561       39504 :         if (.not.(success)) then
     562           0 :           call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
     563           0 :           return
     564             :         endif
     565             :         ! Eigenvector calculations
     566             : 
     567             : 
     568             :         ! Calculate the number of columns in the new local matrix Q
     569             :         ! which are updated from non-deflated/deflated eigenvectors.
     570             :         ! idxq1/2 stores the global column numbers.
     571             : 
     572       39504 :         nqcols1 = 0 ! number of non-deflated eigenvectors
     573       39504 :         nqcols2 = 0 ! number of deflated eigenvectors
     574     9147504 :         DO i = 1, na
     575     9108000 :           if (p_col_out(i)==my_pcol) then
     576     7236000 :             if (idx(i)<=na1) then
     577     6526704 :               nqcols1 = nqcols1+1
     578     6526704 :               idxq1(nqcols1) = i
     579             :             else
     580      709296 :               nqcols2 = nqcols2+1
     581      709296 :               idxq2(nqcols2) = i
     582             :             endif
     583             :           endif
     584             :         enddo
     585             : 
     586       39504 :         gemm_dim_k = MAX(1,l_rows)
     587       39504 :         gemm_dim_l = max_local_cols
     588       39504 :         gemm_dim_m = MIN(max_strip,MAX(1,nqcols1))
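                      :         ! Work-array shapes for the strip-mined GEMM updates below:
                      :         ! qtmp1(gemm_dim_k,gemm_dim_l) holds the gathered old Q columns,
                      :         ! ev(gemm_dim_l,gemm_dim_m) the eigenvector coefficients of one strip, and
                      :         ! qtmp2(gemm_dim_k,gemm_dim_m) the partial result; at most max_strip
                      :         ! columns are processed per strip.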
     589             : 
     590       39504 :         allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errorMessage)
     591       39504 :         if (istat .ne. 0) then
     592           0 :           print *,"merge_systems: error when allocating qtmp1 "//errorMessage
     593           0 :           stop 1
     594             :         endif
     595             : 
     596       39504 :         allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errorMessage)
     597       39504 :         if (istat .ne. 0) then
     598           0 :           print *,"merge_systems: error when allocating ev "//errorMessage
     599           0 :           stop 1
     600             :         endif
     601             : 
     602       39504 :         allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errorMessage)
     603       39504 :         if (istat .ne. 0) then
     604           0 :           print *,"merge_systems: error when allocating qtmp2 "//errorMessage
     605           0 :           stop 1
     606             :         endif
     607             : 
     608       39504 :         qtmp1 = 0 ! May contain empty (unset) parts
     609       39504 :         qtmp2 = 0 ! Not really needed
     610             : 
     611       39504 :         if (useGPU) then
     612           0 :           successCUDA = cuda_malloc(qtmp1_dev, gemm_dim_k * gemm_dim_l * size_of_datatype)
     613           0 :           check_alloc_cuda("merge_systems: qtmp1_dev", successCUDA)
     614             : 
     615           0 :           successCUDA = cuda_malloc(ev_dev, gemm_dim_l * gemm_dim_m * size_of_datatype)
     616           0 :           check_alloc_cuda("merge_systems: ev_dev", successCUDA)
     617             : 
     618           0 :           successCUDA = cuda_malloc(qtmp2_dev, gemm_dim_k * gemm_dim_m * size_of_datatype)
     619           0 :           check_alloc_cuda("merge_systems: qtmp2_dev", successCUDA)
     620             : 
     621           0 :           successCUDA = cuda_memset(qtmp1_dev, 0, gemm_dim_k * gemm_dim_l * size_of_datatype)
     622           0 :           check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
     623             : 
     624           0 :           successCUDA = cuda_memset(qtmp2_dev, 0, gemm_dim_k * gemm_dim_m * size_of_datatype)
     625           0 :           check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
     626             :         endif
     627             : 
     628             :         ! Gather nonzero upper/lower components of old matrix Q
     629             :         ! which are needed for multiplication with new eigenvectors
     630             : 
     631       39504 :         nnzu = 0
     632       39504 :         nnzl = 0
     633     8182944 :         do i = 1, na1
     634     8143440 :           l_idx = l_col(idx1(i))
     635     8143440 :           if (p_col(idx1(i))==my_pcol) then
     636     8143440 :             if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
     637     3914160 :               nnzu = nnzu+1
     638     3914160 :               qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
     639             :             endif
     640     8143440 :             if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
     641     4261776 :               nnzl = nnzl+1
     642     4261776 :               qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
     643             :             endif
     644             :           endif
     645             :         enddo
     646             : 
     647             :         ! Gather deflated eigenvalues behind nonzero components
     648             : 
     649       39504 :         ndef = max(nnzu,nnzl)
     650     1004064 :         do i = 1, na2
     651      964560 :           l_idx = l_col(idx2(i))
     652      964560 :           if (p_col(idx2(i))==my_pcol) then
     653      964560 :             ndef = ndef+1
     654      964560 :             qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
     655             :           endif
     656             :         enddo
     657             : 
     658       39504 :         l_cols_qreorg = ndef ! Number of columns in reorganized matrix
     659             : 
      660             :         ! Set (output) Q to 0; it will accumulate the new Q
     661             : 
     662     9147504 :         DO i = 1, na
     663     9108000 :           if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
     664             :         enddo
     665             : 
     666       39504 :         np_rem = my_pcol
     667             : 
     668       79008 :         do np = 1, npc_n
     669             :           ! Do a ring send of qtmp1
     670             : 
     671       39504 :           if (np>1) then
     672             : 
     673           0 :             if (np_rem==npc_0) then
     674           0 :               np_rem = npc_0+npc_n-1
     675             :             else
     676           0 :               np_rem = np_rem-1
     677             :             endif
     678             : #ifdef WITH_MPI
     679           0 :             call obj%timer%start("mpi_communication")
     680             :             call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL_PRECISION, &
     681             :                                         np_next, 1111, np_prev, 1111, &
     682           0 :                                         mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
     683           0 :             call obj%timer%stop("mpi_communication")
     684             : #endif /* WITH_MPI */
     685             :           endif
     686             : 
     687       39504 :           if (useGPU) then
     688             :             successCUDA = cuda_memcpy(qtmp1_dev, loc(qtmp1(1,1)), &
     689           0 :                  gemm_dim_k * gemm_dim_l  * size_of_datatype, cudaMemcpyHostToDevice)
     690           0 :             check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
     691             :           endif
     692             : 
      693             :           ! Gather the parts in d1 and z that correspond to qtmp1.
     694             :           ! This also delivers nnzu/nnzl for proc np_rem
     695             : 
     696       39504 :           nnzu = 0
     697       39504 :           nnzl = 0
     698     8182944 :           do i=1,na1
     699     8143440 :             if (p_col(idx1(i))==np_rem) then
     700     8143440 :               if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
     701     3914160 :                 nnzu = nnzu+1
     702     3914160 :                 d1u(nnzu) = d1(i)
     703     3914160 :                 zu (nnzu) = z (i)
     704             :               endif
     705     8143440 :               if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
     706     4261776 :                 nnzl = nnzl+1
     707     4261776 :                 d1l(nnzl) = d1(i)
     708     4261776 :                 zl (nnzl) = z (i)
     709             :               endif
     710             :             endif
     711             :           enddo
     712             : 
      713             :           ! Set the deflated eigenvectors in Q (coming from proc np_rem)
     714             : 
     715       39504 :           ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix
     716     9147504 :           do i = 1, na
     717     9108000 :             j = idx(i)
     718     9108000 :             if (j>na1) then
     719      964560 :               if (p_col(idx2(j-na1))==np_rem) then
     720      964560 :                 ndef = ndef+1
     721      964560 :                 if (p_col_out(i)==my_pcol) &
     722      709296 :                       q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
     723             :               endif
     724             :             endif
     725             :           enddo
     726             : 
      727      114624 :           do ns = 0, nqcols1-1, max_strip ! strip-mining loop
     728             : 
     729       75120 :             ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip
     730             : 
     731             :             ! Get partial result from (output) Q
     732             : 
     733     6601824 :             do i = 1, ncnt
     734     6526704 :               qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns)))
     735             :             enddo
     736             : 
     737             :             ! Compute eigenvectors of the rank-1 modified matrix.
     738             :             ! Parts for multiplying with upper half of Q:
     739             : 
     740       75120 :             call obj%timer%start("strange_loop")
     741     6601824 :             do i = 1, ncnt
     742     6526704 :               j = idx(idxq1(i+ns))
     743             :               ! Calculate the j-th eigenvector of the deflated system
     744             :               ! See above why we are doing it this way!
     745     6526704 :               tmp(1:nnzu) = d1u(1:nnzu)-dbase(j)
     746             :               call v_add_s_&
     747             :               &PRECISION&
     748     6526704 :               &(obj,tmp,nnzu,ddiff(j))
     749     6526704 :               ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j)
     750             :             enddo
     751       75120 :             call obj%timer%stop("strange_loop")
     752             : 
     753       75120 :             if(useGPU) then
     754             :               !TODO: it should be enough to copy l_rows x ncnt
     755             :               successCUDA = cuda_memcpy(qtmp2_dev, loc(qtmp2(1,1)), &
     756           0 :                                  gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
     757           0 :               check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
     758             : 
      759             :               !TODO the previous loop could possibly be done on the device and thus
     760             :               !copy less
     761             :               successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
     762           0 :                                  gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
     763           0 :               check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
     764             :             endif
     765             : 
     766             :             ! Multiply old Q with eigenvectors (upper half)
     767             : 
     768       75120 :             if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
     769       75120 :               if (useGPU) then
     770           0 :                 call obj%timer%start("cublas")
     771             :                 call cublas_PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu,   &
     772             :                                     1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1),    &
     773             :                                     ev_dev, ubound(ev,dim=1), &
     774           0 :                                     1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1))
     775           0 :                 call obj%timer%stop("cublas")
     776             :               else
     777       75120 :                 call obj%timer%start("blas")
     778       75120 :                 call obj%timer%start("gemm")
     779             :                 call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu,   &
     780             :                                     1.0_rk, qtmp1, ubound(qtmp1,dim=1),    &
     781             :                                     ev, ubound(ev,dim=1), &
     782       75120 :                                     1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1))
     783       75120 :                 call obj%timer%stop("gemm")
     784       75120 :                 call obj%timer%stop("blas")
     785             :               endif ! useGPU
     786             :             endif
     787             : 
     788             :             if(useGPU) then
     789             :               !TODO: it should be enough to copy l_rows x ncnt
     790             :               !TODO: actually this will be done after the second mutiplication
     791             : 
     792             :               !TODO or actually maybe I should copy the half of the qtmp2 array
     793             :               !here and the rest after the next gemm
     794             :               !TODO either copy only half of the matrix here, and half after the
     795             :               !second gemm, or copy whole array after the next gemm
      796             :               !TODO: actually this will be done after the second multiplication
     797             : !              successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
     798             : !                                 gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
     799             : !              check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
     800             :             endif
     801             : 
     802             :             ! Compute eigenvectors of the rank-1 modified matrix.
     803             :             ! Parts for multiplying with lower half of Q:
     804             : 
     805       75120 :             call obj%timer%start("strange_loop")
     806     6601824 :             do i = 1, ncnt
     807     6526704 :               j = idx(idxq1(i+ns))
     808             :               ! Calculate the j-th eigenvector of the deflated system
     809             :               ! See above why we are doing it this way!
     810     6526704 :               tmp(1:nnzl) = d1l(1:nnzl)-dbase(j)
     811             :               call v_add_s_&
     812             :               &PRECISION&
     813     6526704 :               &(obj,tmp,nnzl,ddiff(j))
     814     6526704 :               ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j)
     815             :             enddo
     816       75120 :             call obj%timer%stop("strange_loop")
     817             : 
     818       75120 :             if(useGPU) then
      819             :               !TODO the previous loop could possibly be done on the device and thus
     820             :               !copy less
     821             :               successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
     822           0 :                                  gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
     823           0 :               check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
     824             :             endif
     825             : 
     826             :             ! Multiply old Q with eigenvectors (lower half)
     827             : 
     828       75120 :             if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then
     829       75120 :               if (useGPU) then
     830           0 :                 call obj%timer%start("cublas")
     831             :                 call cublas_PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl,   &
     832             :                                     1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1),    &
     833             :                                     ev_dev, ubound(ev,dim=1), &
     834           0 :                                     1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1))
     835           0 :                 call obj%timer%stop("cublas")
     836             :               else
     837       75120 :                 call obj%timer%start("blas")
     838       75120 :                 call obj%timer%start("gemm")
     839             :                 call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl,   &
     840             :                                      1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1),    &
     841             :                                      ev,  ubound(ev,dim=1),   &
     842       75120 :                                      1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
     843       75120 :                 call obj%timer%stop("gemm")
     844       75120 :                 call obj%timer%stop("blas")
     845             :               endif ! useGPU
     846             :             endif
     847             : 
     848       75120 :             if(useGPU) then
     849             :               !TODO either copy only half of the matrix here, and get rid of the
     850             :               !previous copy or copy whole array here
     851             :               successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
     852           0 :                                  gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
     853           0 :               check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
     854             :             endif
     855             : 
     856             :              ! Put partial result into (output) Q
     857             : 
     858     6601824 :             do i = 1, ncnt
     859     6526704 :               q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
     860             :             enddo
     861             : 
      862             :           enddo   !ns = 0, nqcols1-1, max_strip ! strip-mining loop
     863             :         enddo    !do np = 1, npc_n
     864             : 
     865       39504 :         deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage)
     866       39504 :         if (istat .ne. 0) then
     867           0 :           print *,"merge_systems: error when deallocating ev "//errorMessage
     868           0 :           stop 1
     869             :         endif
     870             : 
     871       39504 :         if(useGPU) then
     872           0 :           successCUDA = cuda_free(qtmp1_dev)
     873           0 :           check_dealloc_cuda("merge_systems: qtmp1_dev", successCUDA)
     874           0 :           successCUDA = cuda_free(qtmp2_dev)
     875           0 :           check_dealloc_cuda("merge_systems: qtmp2_dev", successCUDA)
     876           0 :           successCUDA = cuda_free(ev_dev)
     877           0 :           check_dealloc_cuda("merge_systems: ev_dev", successCUDA)
     878             :         endif
     879             : 
     880             :       endif !very outer test (na1==1 .or. na1==2) 
     881             : #ifdef WITH_OPENMP
     882       19752 :       deallocate(z_p, stat=istat, errmsg=errorMessage)
     883       19752 :       if (istat .ne. 0) then
     884           0 :         print *,"merge_systems: error when deallocating z_p "//errorMessage
     885           0 :         stop 1
     886             :       endif
     887             : #endif
     888             : 
     889       39504 :       call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
     890             : 
     891      138264 :       return
     892             : 
     893             :       contains
     894             :         subroutine add_tmp_&
     895     4612104 :         &PRECISION&
     896     4612104 :         &(obj, d1, dbase, ddiff, z, ev_scale_value, na1,i)
     897             :           use precision
      898             :           use elpa_abstract_impl
     899             :           implicit none
     900             :           class(elpa_abstract_impl_t), intent(inout) :: obj
     901             :           integer(kind=ik), intent(in) :: na1, i
     902             : 
     903             :           real(kind=REAL_DATATYPE), intent(in)    :: d1(:), dbase(:), ddiff(:), z(:)
     904             :           real(kind=REAL_DATATYPE), intent(inout) :: ev_scale_value
     905     9224208 :           real(kind=REAL_DATATYPE)                :: tmp(1:na1)
     906             : 
     907             :                ! tmp(1:na1) = z(1:na1) / delta(1:na1,i)  ! original code
     908             :                ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
     909             : 
     910             :                ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
     911             :                ! in exactly this order, but we want to prevent compiler optimization
     912             : 
     913     4612104 :           tmp(1:na1) = d1(1:na1) -dbase(i)
     914             :           call v_add_s_&
     915             :           &PRECISION&
     916     4612104 :           &(obj, tmp(1:na1),na1,ddiff(i))
     917     4612104 :           tmp(1:na1) = z(1:na1) / tmp(1:na1)
     918     4612104 :           ev_scale_value = 1.0_rk/sqrt(dot_product(tmp(1:na1),tmp(1:na1)))
     919             : 
     920             :         end subroutine add_tmp_&
     921     4612104 :         &PRECISION
     922             : 
     923             :         subroutine resort_ev_&
     924           0 :         &PRECISION&
     925           0 :         &(obj, idx_ev, nLength)
     926             :           use precision
      927             :           use elpa_abstract_impl
     928             :           implicit none
     929             :           class(elpa_abstract_impl_t), intent(inout) :: obj
     930             :           integer(kind=ik), intent(in) :: nLength
     931             :           integer(kind=ik)             :: idx_ev(nLength)
     932             :           integer(kind=ik)             :: i, nc, pc1, pc2, lc1, lc2, l_cols_out
     933             : 
     934           0 :           real(kind=REAL_DATATYPE), allocatable   :: qtmp(:,:)
     935             :           integer(kind=ik)             :: istat
     936             :           character(200)               :: errorMessage
     937             : 
     938           0 :           if (l_rows==0) return ! My processor column has no work to do
     939             : 
     940             :           ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i))
     941             : 
     942           0 :           l_cols_out = COUNT(p_col_out(1:na)==my_pcol)
     943           0 :           allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage)
     944           0 :           if (istat .ne. 0) then
     945           0 :             print *,"resort_ev: error when allocating qtmp "//errorMessage
     946           0 :             stop 1
     947             :           endif
     948             : 
     949           0 :           nc = 0
     950             : 
     951           0 :           do i=1,na
     952             : 
     953           0 :             pc1 = p_col(idx_ev(i))
     954           0 :             lc1 = l_col(idx_ev(i))
     955           0 :             pc2 = p_col_out(i)
     956             : 
     957           0 :             if (pc2<0) cycle ! This column is not needed in output
     958             : 
     959           0 :             if (pc2==my_pcol) nc = nc+1 ! Counter for output columns
     960             : 
     961           0 :             if (pc1==my_pcol) then
     962           0 :               if (pc2==my_pcol) then
      963             :                 ! send and receive columns are local
     964           0 :                 qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1)
     965             :               else
     966             : #ifdef WITH_MPI
     967           0 :                 call obj%timer%start("mpi_communication")
     968           0 :                 call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, mod(i,4096), mpi_comm_cols, mpierr)
     969           0 :                 call obj%timer%stop("mpi_communication")
     970             : #endif /* WITH_MPI */
     971             :               endif
     972           0 :             else if (pc2==my_pcol) then
     973             : #ifdef WITH_MPI
     974           0 :               call obj%timer%start("mpi_communication")
     975           0 :               call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL_PRECISION, pc1, mod(i,4096), mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
     976           0 :               call obj%timer%stop("mpi_communication")
     977             : #else /* WITH_MPI */
     978           0 :               qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1)
     979             : #endif /* WITH_MPI */
     980             :             endif
     981             :           enddo
     982             : 
     983             :           ! Insert qtmp into (output) q
     984             : 
     985           0 :           nc = 0
     986             : 
     987           0 :           do i=1,na
     988             : 
     989           0 :             pc2 = p_col_out(i)
     990           0 :             lc2 = l_col_out(i)
     991             : 
     992           0 :             if (pc2==my_pcol) then
     993           0 :               nc = nc+1
     994           0 :               q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc)
     995             :             endif
     996             :           enddo
     997             : 
     998           0 :           deallocate(qtmp, stat=istat, errmsg=errorMessage)
     999           0 :           if (istat .ne. 0) then
    1000           0 :             print *,"resort_ev: error when deallocating qtmp "//errorMessage
    1001           0 :             stop 1
    1002             :           endif
    1003             :         end subroutine resort_ev_&
    1004           0 :         &PRECISION
    1005             : 
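resort_ev applies the permutation q_new(:,i) = q_old(:,idx_ev(i)) to the locally stored eigenvector columns, using mpi_send/mpi_recv whenever the source and destination columns live in different processor columns. A minimal serial sketch of the same permutation follows (a single processor column is assumed, so no MPI traffic is needed; names and values are illustrative):

    program demo_resort_ev
      implicit none
      real(kind=8) :: q(2,3), qtmp(2,3)
      integer      :: idx_ev(3), i
      q(:,1) = [1.0d0, 1.0d0]
      q(:,2) = [2.0d0, 2.0d0]
      q(:,3) = [3.0d0, 3.0d0]
      idx_ev = [3, 1, 2]              ! new column i takes old column idx_ev(i)
      do i = 1, 3
        qtmp(:,i) = q(:,idx_ev(i))    ! gather into a temporary, as in resort_ev
      end do
      q = qtmp                        ! write the reordered columns back into q
      print '(3f6.1)', q(1,:)         ! expected: 3.0 1.0 2.0
    end program demo_resort_ev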
    1006             :         subroutine transform_columns_&
    1007       45024 :         &PRECISION&
    1008             :         &(obj, col1, col2)
    1009             :           use precision
     1010             :           use elpa_abstract_impl
    1011             :           implicit none
    1012             :           class(elpa_abstract_impl_t), intent(inout) :: obj
    1013             : 
    1014             :           integer(kind=ik)           :: col1, col2
    1015             :           integer(kind=ik)           :: pc1, pc2, lc1, lc2
    1016             : 
    1017       45024 :           if (l_rows==0) return ! My processor column has no work to do
    1018             : 
    1019       45024 :           pc1 = p_col(col1)
    1020       45024 :           lc1 = l_col(col1)
    1021       45024 :           pc2 = p_col(col2)
    1022       45024 :           lc2 = l_col(col2)
    1023             : 
    1024       45024 :           if (pc1==my_pcol) then
    1025       45024 :             if (pc2==my_pcol) then
    1026             :               ! both columns are local
    1027       45024 :               tmp(1:l_rows)      = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1)
    1028       45024 :               q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
    1029       45024 :               q(l_rqs:l_rqe,lc1) = tmp(1:l_rows)
    1030             :             else
    1031             : #ifdef WITH_MPI
    1032           0 :               call obj%timer%start("mpi_communication")
    1033             :               call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, 1, &
    1034             :                                 tmp, l_rows, MPI_REAL_PRECISION, pc2, 1,          &
    1035           0 :                                 mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
    1036           0 :               call obj%timer%stop("mpi_communication")
    1037             : #else /* WITH_MPI */
    1038           0 :               tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)
    1039             : #endif /* WITH_MPI */
    1040           0 :               q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1)
    1041             :             endif
    1042           0 :           else if (pc2==my_pcol) then
    1043             : #ifdef WITH_MPI
    1044           0 :             call obj%timer%start("mpi_communication")
    1045             :             call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL_PRECISION, pc1, 1, &
    1046             :                                tmp, l_rows, MPI_REAL_PRECISION, pc1, 1,         &
    1047           0 :                                mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
    1048           0 :             call obj%timer%stop("mpi_communication")
    1049             : #else /* WITH_MPI */
    1050           0 :             tmp(1:l_rows) = q(l_rqs:l_rqe,lc2)
    1051             : #endif /* WITH_MPI */
    1052             : 
    1053           0 :             q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
    1054             :           endif
    1055             :         end subroutine transform_columns_&
    1056             :         &PRECISION
    1057             : 
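transform_columns applies a 2x2 matrix qtrans from the right to the column pair (col1, col2), i.e. [q(:,col1) q(:,col2)] := [q(:,col1) q(:,col2)] * qtrans; when the two columns live in different processor columns, each side obtains the missing column with mpi_sendrecv and updates only its own part. A serial sketch of the purely local branch (the transform and data are illustrative):

    program demo_transform_columns
      implicit none
      real(kind=8) :: q(4,2), qtrans(2,2), tmp(4)
      q(:,1) = [1.0d0, 0.0d0, 2.0d0, 0.0d0]
      q(:,2) = [0.0d0, 1.0d0, 0.0d0, 2.0d0]
      ! An orthogonal 2x2 example transform (c = 0.6, s = 0.8)
      qtrans = reshape([0.6d0, -0.8d0, 0.8d0, 0.6d0], [2,2])
      ! Same update as the "both columns are local" branch above
      tmp    = q(:,1)*qtrans(1,1) + q(:,2)*qtrans(2,1)
      q(:,2) = q(:,1)*qtrans(1,2) + q(:,2)*qtrans(2,2)
      q(:,1) = tmp
      print *, 'new col1 =', q(:,1)
      print *, 'new col2 =', q(:,2)
    end program demo_transform_columns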
    1058             :         subroutine global_gather_&
    1059      158016 :         &PRECISION&
    1060      158016 :         &(obj, z, n)
    1061             :           ! This routine sums up z over all processors.
    1062             :           ! It should only be used for gathering distributed results,
    1063             :           ! i.e. z(i) should be nonzero on exactly 1 processor column,
     1064             :           ! otherwise the results may differ numerically between processor columns
    1065             :           use precision
    1066             :           use elpa_abstract_impl
    1067             :           implicit none
    1068             :           class(elpa_abstract_impl_t), intent(inout) :: obj
    1069             :           integer(kind=ik)            :: n
    1070             :           real(kind=REAL_DATATYPE)    :: z(n)
    1071      316032 :           real(kind=REAL_DATATYPE)    :: tmp(n)
    1072             : 
    1073      158016 :           if (npc_n==1 .and. np_rows==1) return ! nothing to do
    1074             : 
    1075             :           ! Do an mpi_allreduce over processor rows
    1076             : #ifdef WITH_MPI
    1077      153024 :           call obj%timer%start("mpi_communication")
    1078      153024 :           call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
    1079      153024 :           call obj%timer%stop("mpi_communication")
    1080             : #else /* WITH_MPI */
    1081           0 :           tmp = z
    1082             : #endif /* WITH_MPI */
    1083             :           ! If only 1 processor column, we are done
    1084      153024 :           if (npc_n==1) then
    1085      153024 :             z(:) = tmp(:)
    1086      153024 :             return
    1087             :           endif
    1088             : 
    1089             :           ! If all processor columns are involved, we can use mpi_allreduce
    1090           0 :           if (npc_n==np_cols) then
    1091             : #ifdef WITH_MPI
    1092           0 :             call obj%timer%start("mpi_communication")
    1093           0 :             call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
    1094           0 :             call obj%timer%stop("mpi_communication")
    1095             : #else /* WITH_MPI */
    1096           0 :             tmp = z
    1097             : #endif /* WITH_MPI */
    1098             : 
    1099           0 :             return
    1100             :           endif
    1101             : 
    1102             :           ! Do a ring send over processor columns
    1103           0 :           z(:) = 0
    1104           0 :           do np = 1, npc_n
    1105           0 :             z(:) = z(:) + tmp(:)
    1106             : #ifdef WITH_MPI
    1107           0 :             call obj%timer%start("mpi_communication")
    1108             :             call MPI_Sendrecv_replace(z, n, MPI_REAL_PRECISION, np_next, 1111, np_prev, 1111, &
    1109           0 :                                        mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
    1110           0 :             call obj%timer%stop("mpi_communication")
    1111             : #endif /* WITH_MPI */
    1112             :           enddo
    1113             :         end subroutine global_gather_&
    1114           0 :         &PRECISION
    1115             : 
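global_gather thus reduces in two stages: an mpi_allreduce over processor rows first, then either a second mpi_allreduce over processor columns (when all columns take part) or a ring of MPI_Sendrecv_replace calls (when only npc_n < np_cols columns take part). A minimal sketch of the two-allreduce path on a 2x2 process grid follows; the grid layout, communicator splits and data are illustrative assumptions, not ELPA's own setup:

    ! Run with "mpirun -np 4"; the sketch assumes a 2x2 process grid.
    program demo_global_gather
      use mpi
      implicit none
      integer :: ierr, rank, nprocs, my_prow, my_pcol, comm_rows, comm_cols
      real(kind=8) :: z(2), tmp(2)
      call MPI_Init(ierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
      call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)
      if (nprocs /= 4) call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
      my_prow = mod(rank, 2)            ! row coordinate in the 2x2 grid
      my_pcol = rank / 2                ! column coordinate in the 2x2 grid
      ! comm_rows connects processes sharing a column coordinate,
      ! comm_cols connects processes sharing a row coordinate.
      call MPI_Comm_split(MPI_COMM_WORLD, my_pcol, my_prow, comm_rows, ierr)
      call MPI_Comm_split(MPI_COMM_WORLD, my_prow, my_pcol, comm_cols, ierr)
      z = 0.0d0
      if (rank == 0) z = [1.0d0, 2.0d0] ! nonzero on exactly one rank, as required
      call MPI_Allreduce(z, tmp, 2, MPI_DOUBLE_PRECISION, MPI_SUM, comm_rows, ierr)
      call MPI_Allreduce(tmp, z, 2, MPI_DOUBLE_PRECISION, MPI_SUM, comm_cols, ierr)
      if (rank == 0) print *, 'gathered z =', z
      call MPI_Finalize(ierr)
    end program demo_global_gather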
    1116             :         subroutine global_product_&
    1117       39504 :         &PRECISION&
    1118       39504 :         &(obj, z, n)
    1119             :           ! This routine calculates the global product of z.
    1120             :           use precision
    1121             :           use elpa_abstract_impl
    1122             :           implicit none
    1123             :           class(elpa_abstract_impl_t), intent(inout) :: obj
    1124             : 
    1125             : 
    1126             :           integer(kind=ik)            :: n
    1127             :           real(kind=REAL_DATATYPE)    :: z(n)
    1128             : 
    1129       79008 :           real(kind=REAL_DATATYPE)    :: tmp(n)
    1130             : 
    1131       39504 :           if (npc_n==1 .and. np_rows==1) return ! nothing to do
    1132             : 
    1133             :           ! Do an mpi_allreduce over processor rows
    1134             : #ifdef WITH_MPI
    1135       38256 :           call obj%timer%start("mpi_communication")
    1136       38256 :           call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_rows, mpierr)
    1137       38256 :           call obj%timer%stop("mpi_communication")
    1138             : #else /* WITH_MPI */
    1139           0 :           tmp = z
    1140             : #endif /* WITH_MPI */
    1141             :           ! If only 1 processor column, we are done
    1142       38256 :           if (npc_n==1) then
    1143       38256 :             z(:) = tmp(:)
    1144       38256 :             return
    1145             :           endif
    1146             : 
    1147             :           ! If all processor columns are involved, we can use mpi_allreduce
    1148           0 :           if (npc_n==np_cols) then
    1149             : #ifdef WITH_MPI
    1150           0 :             call obj%timer%start("mpi_communication")
    1151           0 :             call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_cols, mpierr)
    1152           0 :             call obj%timer%stop("mpi_communication")
    1153             : #else /* WITH_MPI */
    1154           0 :             z = tmp
    1155             : #endif /* WITH_MPI */
    1156           0 :             return
    1157             :           endif
    1158             : 
    1159             :           ! We send all vectors to the first proc, do the product there
    1160             :           ! and redistribute the result.
    1161             : 
    1162           0 :           if (my_pcol == npc_0) then
    1163           0 :             z(1:n) = tmp(1:n)
    1164           0 :             do np = npc_0+1, npc_0+npc_n-1
    1165             : #ifdef WITH_MPI
    1166           0 :               call obj%timer%start("mpi_communication")
    1167           0 :               call mpi_recv(tmp, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
    1168           0 :               call obj%timer%stop("mpi_communication")
    1169             : #else  /* WITH_MPI */
    1170           0 :               tmp(1:n) = z(1:n)
    1171             : #endif  /* WITH_MPI */
    1172           0 :               z(1:n) = z(1:n)*tmp(1:n)
    1173             :             enddo
    1174           0 :             do np = npc_0+1, npc_0+npc_n-1
    1175             : #ifdef WITH_MPI
    1176           0 :               call obj%timer%start("mpi_communication")
    1177           0 :               call mpi_send(z, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, mpierr)
    1178           0 :               call obj%timer%stop("mpi_communication")
    1179             : #endif  /* WITH_MPI */
    1180             :             enddo
    1181             :           else
    1182             : #ifdef WITH_MPI
    1183           0 :             call obj%timer%start("mpi_communication")
    1184           0 :             call mpi_send(tmp, n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, mpierr)
    1185           0 :             call mpi_recv(z  ,n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
    1186           0 :             call obj%timer%stop("mpi_communication")
    1187             : #else  /* WITH_MPI */
    1188           0 :             z(1:n) = tmp(1:n)
    1189             : #endif  /* WITH_MPI */
    1190             : 
    1191             :           endif
    1192             :         end subroutine global_product_&
    1193           0 :         &PRECISION
    1194             : 
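For the subset-of-columns case, global_product falls back to a gather-multiply-redistribute pattern: every participating column sends its partial factors to the first column npc_0, which multiplies them elementwise and sends the final product back. A minimal sketch of that pattern on MPI_COMM_WORLD, with rank 0 standing in for npc_0 (names and data are illustrative):

    ! Run with e.g. "mpirun -np 3".
    program demo_global_product
      use mpi
      implicit none
      integer :: ierr, rank, nprocs, np
      real(kind=8) :: z(2), tmp(2)
      call MPI_Init(ierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
      call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)
      tmp = real(rank + 2, kind=8)               ! this rank's partial factors
      if (rank == 0) then
        z = tmp
        do np = 1, nprocs-1                      ! collect and multiply on rank 0
          call MPI_Recv(tmp, 2, MPI_DOUBLE_PRECISION, np, 1111, MPI_COMM_WORLD, &
                        MPI_STATUS_IGNORE, ierr)
          z = z * tmp
        end do
        do np = 1, nprocs-1                      ! redistribute the final product
          call MPI_Send(z, 2, MPI_DOUBLE_PRECISION, np, 1111, MPI_COMM_WORLD, ierr)
        end do
      else
        call MPI_Send(tmp, 2, MPI_DOUBLE_PRECISION, 0, 1111, MPI_COMM_WORLD, ierr)
        call MPI_Recv(z,   2, MPI_DOUBLE_PRECISION, 0, 1111, MPI_COMM_WORLD, &
                      MPI_STATUS_IGNORE, ierr)
      end if
      print *, 'rank', rank, 'product =', z
      call MPI_Finalize(ierr)
    end program demo_global_product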
    1195             :         subroutine check_monotony_&
    1196      197520 :         &PRECISION&
    1197      197520 :         &(obj, n,d,text, wantDebug, success)
     1198             :         ! This is a test routine for checking whether the eigenvalues are monotonically increasing.
     1199             :         ! It is for debug purposes only; an error should never be triggered!
    1200             :           use precision
    1201             :           use elpa_abstract_impl
    1202             :           implicit none
    1203             : 
    1204             :           class(elpa_abstract_impl_t), intent(inout) :: obj
    1205             :           integer(kind=ik)              :: n
    1206             :           real(kind=REAL_DATATYPE)      :: d(n)
    1207             :           character*(*)                 :: text
    1208             : 
    1209             :           integer(kind=ik)              :: i
    1210             :           logical, intent(in)           :: wantDebug
    1211             :           logical, intent(out)          :: success
    1212             : 
    1213      197520 :           success = .true.
    1214    27333552 :           do i=1,n-1
    1215    27136032 :             if (d(i+1)<d(i)) then
    1216           0 :               if (wantDebug) write(error_unit,'(a,a,i8,2g25.17)') 'ELPA1_check_monotony: Monotony error on ',text,i,d(i),d(i+1)
    1217           0 :               success = .false.
    1218           0 :               return
    1219             :             endif
    1220             :           enddo
    1221             :         end subroutine check_monotony_&
    1222      197520 :         &PRECISION
    1223             : 
    1224             :     end subroutine merge_systems_&
    1225             :     &PRECISION

Generated by: LCOV version 1.12