LCOV - code coverage report
Current view: top level - src/elpa1 - elpa1_solve_tridi_real_template.F90 (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53

             Hit    Total    Coverage
Lines:       137    286      47.9 %
Functions:    12     60      20.0 %

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
       12             : !      Schwerpunkt Wissenschaftliches Rechnen,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
       14             : !    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
       15             : !      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !    This particular source code file contains additions, changes and
      20             : !    enhancements authored by Intel Corporation which is not part of
      21             : !    the ELPA consortium.
      22             : !
      23             : !    More information can be found here:
      24             : !    http://elpa.mpcdf.mpg.de/
      25             : !
      26             : !    ELPA is free software: you can redistribute it and/or modify
      27             : !    it under the terms of the version 3 of the license of the
      28             : !    GNU Lesser General Public License as published by the Free
      29             : !    Software Foundation.
      30             : !
      31             : !    ELPA is distributed in the hope that it will be useful,
      32             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      33             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      34             : !    GNU Lesser General Public License for more details.
      35             : !
      36             : !    You should have received a copy of the GNU Lesser General Public License
      37             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      38             : !
      39             : !    ELPA reflects a substantial effort on the part of the original
      40             : !    ELPA consortium, and we ask you to respect the spirit of the
      41             : !    license that we chose: i.e., please contribute any changes you
      42             : !    may have back to the original ELPA library distribution, and keep
      43             : !    any derivatives of ELPA under the same license that we chose for
      44             : !    the original distribution, the GNU Lesser General Public License.
      45             : !
      46             : !
      47             : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
      48             : !
      49             : ! Copyright of the original code rests with the authors inside the ELPA
      50             : ! consortium. The copyright of any additional modifications shall rest
      51             : ! with their original authors, but shall adhere to the licensing terms
      52             : ! distributed along with the original code in the file "COPYING".
      53             : #endif
      54             : 
      55             : #include "../general/sanity.F90"
      56             : 
      57             : subroutine solve_tridi_&
      58       57384 : &PRECISION_AND_SUFFIX &
      59       57384 :     ( obj, na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
      60             :                                            mpi_comm_cols, useGPU, wantDebug, success )
      61             : 
      62             :       use precision
      63             :       use elpa_abstract_impl
      64             :       implicit none
      65             : #include "../../src/general/precision_kinds.F90"
      66             :       class(elpa_abstract_impl_t), intent(inout) :: obj
      67             :       integer(kind=ik), intent(in)              :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      68             :       real(kind=REAL_DATATYPE), intent(inout)   :: d(na), e(na)
      69             : #ifdef USE_ASSUMED_SIZE
      70             :       real(kind=REAL_DATATYPE), intent(inout)   :: q(ldq,*)
      71             : #else
      72             :       real(kind=REAL_DATATYPE), intent(inout)   :: q(ldq,matrixCols)
      73             : #endif
      74             :       logical, intent(in)           :: useGPU, wantDebug
      75             :       logical, intent(out)          :: success
      76             : 
      77             :       integer(kind=ik)              :: i, j, n, np, nc, nev1, l_cols, l_rows
      78             :       integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
      79             : 
      80       57384 :       integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
      81             : 
      82             :       integer(kind=ik)              :: istat
      83             :       character(200)                :: errorMessage
      84             : 
      85       57384 :       call obj%timer%start("solve_tridi" // PRECISION_SUFFIX)
      86             : 
      87       57384 :       call obj%timer%start("mpi_communication")
      88       57384 :       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      89       57384 :       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      90       57384 :       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      91       57384 :       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      92       57384 :       call obj%timer%stop("mpi_communication")
      93             : 
      94       57384 :       success = .true.
      95             : 
      96       57384 :       l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
      97       57384 :       l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
      98             : 
      99             :       ! Set Q to 0
     100       57384 :       q(1:l_rows, 1:l_cols) = 0.0_rk
     101             : 
      102             :       ! Get the limits of the subdivisions, each subdivision has as many cols
     103             :       ! as fit on the respective processor column
     104             : 
     105       57384 :       allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage)
     106       57384 :       if (istat .ne. 0) then
     107           0 :         print *,"solve_tridi: error when allocating limits "//errorMessage
     108           0 :         stop 1
     109             :       endif
     110             : 
     111       57384 :       limits(0) = 0
     112      114768 :       do np=0,np_cols-1
     113       57384 :         nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np
     114             : 
      115             :         ! Check for the case that a column has zero width.
     116             :         ! This is not supported!
     117             :         ! Scalapack supports it but delivers no results for these columns,
     118             :         ! which is rather annoying
     119       57384 :         if (nc==0) then
     120           0 :           call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
     121           0 :           if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
     122           0 :           success = .false.
     123           0 :           return
     124             :         endif
     125       57384 :         limits(np+1) = limits(np) + nc
     126             :       enddo
     127             : 
     128             :       ! Subdivide matrix by subtracting rank 1 modifications
     129             : 
     130       57384 :       do i=1,np_cols-1
     131           0 :         n = limits(i)
     132           0 :         d(n) = d(n)-abs(e(n))
     133           0 :         d(n+1) = d(n+1)-abs(e(n))
     134             :       enddo
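
The loop above performs the classic divide-and-conquer tearing: subtracting |e(n)| from d(n) and d(n+1) leaves the tridiagonal matrix block diagonal up to an exact rank-1 correction. As a sketch of the underlying identity in standard (non-ELPA) notation:

    T = \begin{pmatrix} \tilde T_1 & 0 \\ 0 & \tilde T_2 \end{pmatrix}
        + |e_n|\, v v^{\mathsf T},
    \qquad v = \hat e_n + \operatorname{sign}(e_n)\, \hat e_{n+1},

where \tilde T_1 and \tilde T_2 are the two diagonal blocks after the subtraction: the correction restores |e_n| on the two modified diagonal entries and e_n on the torn off-diagonal, so the identity is exact.
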
     135             : 
      136             :       ! Solve sub problems on processor columns
     137             : 
     138       57384 :       nc = limits(my_pcol) ! column after which my problem starts
     139             : 
     140       57384 :       if (np_cols>1) then
     141           0 :         nev1 = l_cols ! all eigenvectors are needed
     142             :       else
     143       57384 :         nev1 = MIN(nev,l_cols)
     144             :       endif
     145             :       call solve_tridi_col_&
     146             :            &PRECISION_AND_SUFFIX &
     147             :              (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk,  &
     148       57384 :                         matrixCols, mpi_comm_rows, useGPU, wantDebug, success)
     149       57384 :       if (.not.(success)) then
     150           0 :         call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
     151           0 :         return
     152             :       endif
     153             :       ! If there is only 1 processor column, we are done
     154             : 
     155       57384 :       if (np_cols==1) then
     156       57384 :         deallocate(limits, stat=istat, errmsg=errorMessage)
     157       57384 :         if (istat .ne. 0) then
     158           0 :           print *,"solve_tridi: error when deallocating limits "//errorMessage
     159           0 :           stop 1
     160             :         endif
     161             : 
     162       57384 :         call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
     163       57384 :         return
     164             :       endif
     165             : 
     166             :       ! Set index arrays for Q columns
     167             : 
     168             :       ! Dense distribution scheme:
     169             : 
     170           0 :       allocate(l_col(na), stat=istat, errmsg=errorMessage)
     171           0 :       if (istat .ne. 0) then
     172           0 :         print *,"solve_tridi: error when allocating l_col "//errorMessage
     173           0 :         stop 1
     174             :       endif
     175             : 
     176           0 :       allocate(p_col(na), stat=istat, errmsg=errorMessage)
     177           0 :       if (istat .ne. 0) then
     178           0 :         print *,"solve_tridi: error when allocating p_col "//errorMessage
     179           0 :         stop 1
     180             :       endif
     181             : 
     182           0 :       n = 0
     183           0 :       do np=0,np_cols-1
     184           0 :         nc = local_index(na, np, np_cols, nblk, -1)
     185           0 :         do i=1,nc
     186           0 :           n = n+1
     187           0 :           l_col(n) = i
     188           0 :           p_col(n) = np
     189             :         enddo
     190             :       enddo
     191             : 
     192             :       ! Block cyclic distribution scheme, only nev columns are set:
     193             : 
     194           0 :       allocate(l_col_bc(na), stat=istat, errmsg=errorMessage)
     195           0 :       if (istat .ne. 0) then
     196           0 :         print *,"solve_tridi: error when allocating l_col_bc "//errorMessage
     197           0 :         stop 1
     198             :       endif
     199             : 
     200           0 :       allocate(p_col_bc(na), stat=istat, errmsg=errorMessage)
     201           0 :       if (istat .ne. 0) then
     202           0 :         print *,"solve_tridi: error when allocating p_col_bc "//errorMessage
     203           0 :         stop 1
     204             :       endif
     205             : 
     206           0 :       p_col_bc(:) = -1
     207           0 :       l_col_bc(:) = -1
     208             : 
     209           0 :       do i = 0, na-1, nblk*np_cols
     210           0 :         do j = 0, np_cols-1
     211           0 :           do n = 1, nblk
     212           0 :             if (i+j*nblk+n <= MIN(nev,na)) then
     213           0 :               p_col_bc(i+j*nblk+n) = j
     214           0 :               l_col_bc(i+j*nblk+n) = i/np_cols + n
      215             :             endif
      216             :           enddo
      217             :         enddo
     218             :       enddo
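
The triple loop above realizes a 1-D block-cyclic column layout: global column i+j*nblk+n (1-based) belongs to process column j, at local column i/np_cols+n. A minimal standalone demo of the same mapping (hypothetical program, not part of ELPA):

    ! Hypothetical demo, not ELPA code: block-cyclic column ownership.
    program demo_block_cyclic
      implicit none
      integer, parameter :: na = 8, nblk = 2, np_cols = 2, nev = 8
      integer :: i, j, n, p_col_bc(na), l_col_bc(na)
      p_col_bc(:) = -1
      l_col_bc(:) = -1
      do i = 0, na-1, nblk*np_cols         ! start of each cycle of blocks
        do j = 0, np_cols-1                ! owning process column
          do n = 1, nblk                   ! column within the block
            if (i+j*nblk+n <= min(nev,na)) then
              p_col_bc(i+j*nblk+n) = j
              l_col_bc(i+j*nblk+n) = i/np_cols + n
            endif
          enddo
        enddo
      enddo
      ! Prints: columns 1,2 -> proc 0, local 1,2; columns 3,4 -> proc 1,
      ! local 1,2; columns 5,6 -> proc 0, local 3,4; and so on.
      do n = 1, na
        print '(3(a,i2))', 'global col', n, ' -> proc col', p_col_bc(n), ', local col', l_col_bc(n)
      enddo
    end program demo_block_cyclic
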
     219             : 
     220             :       ! Recursively merge sub problems
     221             :       call merge_recursive_&
     222             :            &PRECISION &
     223           0 :            (obj, 0, np_cols, useGPU, wantDebug, success)
     224           0 :       if (.not.(success)) then
     225           0 :         call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
     226           0 :         return
     227             :       endif
     228             : 
     229           0 :       deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage)
     230           0 :       if (istat .ne. 0) then
     231           0 :         print *,"solve_tridi: error when deallocating l_col "//errorMessage
     232           0 :         stop 1
     233             :       endif
     234             : 
     235           0 :       call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
     236           0 :       return
     237             : 
     238             :       contains
     239             :         recursive subroutine merge_recursive_&
     240           0 :                   &PRECISION &
     241             :            (obj, np_off, nprocs, useGPU, wantDebug, success)
     242             :            use precision
     243             :            use elpa_abstract_impl
     244             :            implicit none
     245             : 
     246             :            ! noff is always a multiple of nblk_ev
     247             :            ! nlen-noff is always > nblk_ev
     248             : 
     249             :            class(elpa_abstract_impl_t), intent(inout) :: obj
     250             :            integer(kind=ik)     :: np_off, nprocs
     251             :            integer(kind=ik)     :: np1, np2, noff, nlen, nmid, n
     252             : #ifdef WITH_MPI
     253             : !           integer(kind=ik)     :: my_mpi_status(mpi_status_size)
     254             : #endif
     255             :            logical, intent(in)  :: useGPU, wantDebug
     256             :            logical, intent(out) :: success
     257             : 
     258           0 :            success = .true.
     259             : 
     260           0 :            if (nprocs<=1) then
     261             :              ! Safety check only
     262           0 :              if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
     263           0 :              success = .false.
     264           0 :              return
     265             :            endif
     266             :            ! Split problem into 2 subproblems of size np1 / np2
     267             : 
     268           0 :            np1 = nprocs/2
     269           0 :            np2 = nprocs-np1
     270             : 
     271           0 :            if (np1 > 1) call merge_recursive_&
     272             :                         &PRECISION &
     273           0 :            (obj, np_off, np1, useGPU, wantDebug, success)
     274           0 :            if (.not.(success)) return
     275           0 :            if (np2 > 1) call merge_recursive_&
     276             :                         &PRECISION &
     277           0 :            (obj, np_off+np1, np2, useGPU, wantDebug, success)
     278           0 :            if (.not.(success)) return
     279             : 
     280           0 :            noff = limits(np_off)
     281           0 :            nmid = limits(np_off+np1) - noff
     282           0 :            nlen = limits(np_off+nprocs) - noff
     283             : 
     284             : #ifdef WITH_MPI
     285           0 :            call obj%timer%start("mpi_communication")
     286           0 :            if (my_pcol==np_off) then
     287           0 :              do n=np_off+np1,np_off+nprocs-1
     288           0 :                call mpi_send(d(noff+1), nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
     289             :              enddo
     290             :            endif
     291           0 :            call obj%timer%stop("mpi_communication")
     292             : #endif /* WITH_MPI */
     293             : 
     294           0 :            if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
     295             : #ifdef WITH_MPI
     296           0 :              call obj%timer%start("mpi_communication")
     297           0 :              call mpi_recv(d(noff+1), nmid, MPI_REAL_PRECISION, np_off, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
     298           0 :              call obj%timer%stop("mpi_communication")
     299             : #else /* WITH_MPI */
     300             : !             d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
     301             : #endif /* WITH_MPI */
     302             :            endif
     303             : 
     304           0 :            if (my_pcol==np_off+np1) then
     305           0 :              do n=np_off,np_off+np1-1
     306             : #ifdef WITH_MPI
     307           0 :                call obj%timer%start("mpi_communication")
     308           0 :                call mpi_send(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
     309           0 :                call obj%timer%stop("mpi_communication")
     310             : #endif /* WITH_MPI */
     311             : 
     312             :              enddo
     313             :            endif
     314           0 :            if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
     315             : #ifdef WITH_MPI
     316           0 :              call obj%timer%start("mpi_communication")
     317           0 :              call mpi_recv(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, np_off+np1, 1,mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
     318           0 :              call obj%timer%stop("mpi_communication")
     319             : #else /* WITH_MPI */
     320             : !             d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
     321             : #endif /* WITH_MPI */
     322             :            endif
     323           0 :            if (nprocs == np_cols) then
     324             : 
     325             :              ! Last merge, result distribution must be block cyclic, noff==0,
     326             :              ! p_col_bc is set so that only nev eigenvalues are calculated
     327             :              call merge_systems_&
     328             :                   &PRECISION &
     329             :                                  (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
     330             :                                  nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
     331           0 :                                  l_col_bc, p_col_bc, np_off, nprocs, useGPU, wantDebug, success )
     332           0 :              if (.not.(success)) return
     333             :            else
     334             :              ! Not last merge, leave dense column distribution
     335             :              call merge_systems_&
     336             :                   &PRECISION &
     337             :                                 (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
     338             :                                  nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
     339           0 :                                  l_col(noff+1), p_col(noff+1), np_off, nprocs, useGPU, wantDebug, success )
     340           0 :              if (.not.(success)) return
     341             :            endif
     342             :        end subroutine merge_recursive_&
     343             :            &PRECISION
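
merge_recursive splits the process-column range in half, recurses into both halves, and merges afterwards, so the merges happen bottom-up in post-order. A self-contained sketch of just that traversal (hypothetical demo, not ELPA code):

    ! Hypothetical demo, not ELPA code: merge order for 4 process columns.
    program demo_merge_tree
      implicit none
      call merge_tree(0, 4)   ! prints: 0..1, then 2..3, then 0..3
    contains
      recursive subroutine merge_tree(np_off, nprocs)
        integer, intent(in) :: np_off, nprocs
        integer :: np1
        if (nprocs <= 1) return
        np1 = nprocs/2                          ! same split rule as above
        call merge_tree(np_off,     np1)        ! left half
        call merge_tree(np_off+np1, nprocs-np1) ! right half
        print '(a,i2,a,i2)', 'merge process columns', np_off, ' ..', np_off+nprocs-1
      end subroutine merge_tree
    end program demo_merge_tree
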
     344             : 
     345             :     end subroutine solve_tridi_&
     346             :         &PRECISION_AND_SUFFIX
     347             : 
     348             :     subroutine solve_tridi_col_&
     349       57384 :     &PRECISION_AND_SUFFIX &
     350       57384 :       ( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success )
     351             : 
     352             :    ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
     353             :    ! with the divide and conquer method.
     354             :    ! Works best if the number of processor rows is a power of 2!
     355             :       use precision
     356             :       use elpa_abstract_impl
     357             :       implicit none
     358             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     359             : 
     360             :       integer(kind=ik)              :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
     361             :       real(kind=REAL_DATATYPE)      :: d(na), e(na)
     362             : #ifdef USE_ASSUMED_SIZE
     363             :       real(kind=REAL_DATATYPE)      :: q(ldq,*)
     364             : #else
     365             :       real(kind=REAL_DATATYPE)      :: q(ldq,matrixCols)
     366             : #endif
     367             : 
     368             :       integer(kind=ik), parameter   :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used
     369             : 
     370       57384 :       real(kind=REAL_DATATYPE), allocatable    :: qmat1(:,:), qmat2(:,:)
     371             :       integer(kind=ik)              :: i, n, np
     372             :       integer(kind=ik)              :: ndiv, noff, nmid, nlen, max_size
     373             :       integer(kind=ik)              :: my_prow, np_rows, mpierr
     374             : 
     375       57384 :       integer(kind=ik), allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
     376             :       logical, intent(in)           :: useGPU, wantDebug
     377             :       logical, intent(out)          :: success
     378             :       integer(kind=ik)              :: istat
     379             :       character(200)                :: errorMessage
     380             : 
     381       57384 :       call obj%timer%start("solve_tridi_col" // PRECISION_SUFFIX)
     382       57384 :       call obj%timer%start("mpi_communication")
     383       57384 :       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
     384       57384 :       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
     385       57384 :       call obj%timer%stop("mpi_communication")
     386       57384 :       success = .true.
     387             :       ! Calculate the number of subdivisions needed.
     388             : 
     389       57384 :       n = na
     390       57384 :       ndiv = 1
     391      133896 :       do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
     392       38256 :         n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries
     393       38256 :         ndiv = ndiv*2
     394             :       enddo
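
The update n = ((n+3)/4)*2 uses integer division and equals

    \left\lfloor \frac{n+3}{4} \right\rfloor \cdot 2 \;=\; 2 \left\lceil \frac{n}{4} \right\rceil ,

the larger of the two halves of n rounded up to an even boundary, as the inline comment says. For example, n = 13 yields ((13+3)/4)*2 = 8 (an 8/5 split), and n = 100 yields 50 (an even 50/50 split).
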
     395             : 
     396             :       ! If there is only 1 processor row and not all eigenvectors are needed
     397             :       ! and the matrix size is big enough, then use 2 subdivisions
     398             :       ! so that merge_systems is called once and only the needed
     399             :       ! eigenvectors are calculated for the final problem.
     400             : 
     401       57384 :       if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2
     402             : 
     403       57384 :       allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage)
     404       57384 :       if (istat .ne. 0) then
     405           0 :         print *,"solve_tridi_col: error when allocating limits "//errorMessage
     406           0 :         stop 1
     407             :       endif
     408             : 
     409       57384 :       limits(0) = 0
     410       57384 :       limits(ndiv) = na
     411             : 
     412       57384 :       n = ndiv
     413      136392 :       do while(n>1)
     414       39504 :         n = n/2 ! n is always a power of 2
     415       79008 :         do i=0,ndiv-1,2*n
      416             :           ! We want to have even boundaries (for cache line alignment)
     417       39504 :           limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
     418             :         enddo
     419             :       enddo
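
A worked instance of the bisection above, assuming na = 100 and ndiv = 4:

    limits(0) = 0,  limits(4) = 100
    n = 2:  limits(2) = 0  + ((100+3)/4)*2 = 50
    n = 1:  limits(1) = 0  + ((50+3)/4)*2  = 26
            limits(3) = 50 + ((50+3)/4)*2  = 76

giving subproblem sizes 26, 24, 26 and 24, with every internal boundary even.
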
     420             : 
     421             :       ! Calculate the maximum size of a subproblem
     422             : 
     423       57384 :       max_size = 0
     424      154272 :       do i=1,ndiv
     425       96888 :         max_size = MAX(max_size,limits(i)-limits(i-1))
     426             :       enddo
     427             : 
     428             :       ! Subdivide matrix by subtracting rank 1 modifications
     429             : 
     430       96888 :       do i=1,ndiv-1
     431       39504 :         n = limits(i)
     432       39504 :         d(n) = d(n)-abs(e(n))
     433       39504 :         d(n+1) = d(n+1)-abs(e(n))
     434             :       enddo
     435             : 
     436       57384 :       if (np_rows==1)    then
     437             : 
     438             :         ! For 1 processor row there may be 1 or 2 subdivisions
     439       39504 :         do n=0,ndiv-1
     440       20376 :           noff = limits(n)        ! Start of subproblem
     441       20376 :           nlen = limits(n+1)-noff ! Size of subproblem
     442             : 
     443             :           call solve_tridi_single_problem_&
     444             :           &PRECISION_AND_SUFFIX &
     445             :                                   (obj, nlen,d(noff+1),e(noff+1), &
     446       20376 :                                     q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)
     447             : 
     448       20376 :           if (.not.(success)) return
     449             :         enddo
     450             : 
     451             :       else
     452             : 
     453             :         ! Solve sub problems in parallel with solve_tridi_single
      454             :         ! There is at most one subproblem per processor
     455             : 
     456       38256 :         allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage)
     457       38256 :         if (istat .ne. 0) then
     458           0 :           print *,"solve_tridi_col: error when allocating qmat1 "//errorMessage
     459           0 :           stop 1
     460             :         endif
     461             : 
     462       38256 :         allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage)
     463       38256 :         if (istat .ne. 0) then
     464           0 :           print *,"solve_tridi_col: error when allocating qmat2 "//errorMessage
     465           0 :           stop 1
     466             :         endif
     467             : 
     468       38256 :         qmat1 = 0 ! Make sure that all elements are defined
     469             : 
     470       38256 :         if (my_prow < ndiv) then
     471             : 
     472       38256 :           noff = limits(my_prow)        ! Start of subproblem
     473       38256 :           nlen = limits(my_prow+1)-noff ! Size of subproblem
     474             :           call solve_tridi_single_problem_&
     475             :           &PRECISION_AND_SUFFIX &
     476             :                                     (obj, nlen,d(noff+1),e(noff+1),qmat1, &
     477       38256 :                                     ubound(qmat1,dim=1), wantDebug, success)
     478             : 
     479       38256 :           if (.not.(success)) return
     480             :         endif
     481             : 
      482             :         ! Fill eigenvectors from qmat1 into the global matrix q
     483             : 
     484      114768 :         do np = 0, ndiv-1
     485             : 
     486       76512 :           noff = limits(np)
     487       76512 :           nlen = limits(np+1)-noff
     488             : #ifdef WITH_MPI
     489       76512 :           call obj%timer%start("mpi_communication")
     490       76512 :           call MPI_Bcast(d(noff+1), nlen, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
     491       76512 :           qmat2 = qmat1
     492       76512 :           call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
     493       76512 :           call obj%timer%stop("mpi_communication")
     494             : #else /* WITH_MPI */
     495             : !          qmat2 = qmat1 ! is this correct
     496             : #endif /* WITH_MPI */
     497     7936512 :           do i=1,nlen
     498             : 
     499             : #ifdef WITH_MPI
     500             :             call distribute_global_column_&
     501             :             &PRECISION &
     502     7860000 :                      (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
     503             : #else /* WITH_MPI */
     504             :             call distribute_global_column_&
     505             :             &PRECISION &
     506           0 :                      (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
     507             : #endif /* WITH_MPI */
     508             :           enddo
     509             : 
     510             :         enddo
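
distribute_global_column is defined elsewhere in ELPA; the loop above relies on it to scatter the rows of each broadcast eigenvector column into this process row's part of q. A rough sketch of such a 1-D block-cyclic row scatter (hypothetical code; the name, signature and offset convention of the real routine are assumptions here):

    ! Hypothetical sketch only; the actual ELPA routine may differ.
    subroutine sketch_distribute_column(g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
      implicit none
      integer, intent(in)             :: noff, nlen, my_prow, np_rows, nblk
      double precision, intent(in)    :: g_col(nlen)  ! subproblem column
      double precision, intent(inout) :: l_col(*)     ! my local rows of q(:,col)
      integer :: g, blk, loc
      do g = noff+1, noff+nlen                 ! global rows covered by the column
        blk = (g-1)/nblk                       ! 0-based global block index
        if (mod(blk, np_rows) == my_prow) then ! block lives on my process row
          loc = (blk/np_rows)*nblk + mod(g-1, nblk) + 1
          l_col(loc) = g_col(g-noff)
        endif
      enddo
    end subroutine sketch_distribute_column
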
     511             : 
     512       38256 :         deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage)
     513       38256 :         if (istat .ne. 0) then
     514           0 :           print *,"solve_tridi_col: error when deallocating qmat2 "//errorMessage
     515           0 :           stop 1
     516             :         endif
     517             : 
     518             :       endif
     519             : 
      520             :       ! Allocate and set index arrays l_col, p_col_i and p_col_o
     521             : 
     522       57384 :       allocate(l_col(na), p_col_i(na),  p_col_o(na), stat=istat, errmsg=errorMessage)
     523       57384 :       if (istat .ne. 0) then
     524           0 :         print *,"solve_tridi_col: error when allocating l_col "//errorMessage
     525           0 :         stop 1
     526             :       endif
     527             : 
     528    11847384 :       do i=1,na
     529    11790000 :         l_col(i) = i
     530    11790000 :         p_col_i(i) = 0
     531    11790000 :         p_col_o(i) = 0
     532             :       enddo
     533             : 
     534             :       ! Merge subproblems
     535             : 
     536       57384 :       n = 1
      537      136392 :       do while(n<ndiv) ! if ndiv==1, the problem was solved by a single call to solve_tridi_single
     538             : 
     539       79008 :         do i=0,ndiv-1,2*n
     540             : 
     541       39504 :           noff = limits(i)
     542       39504 :           nmid = limits(i+n) - noff
     543       39504 :           nlen = limits(i+2*n) - noff
     544             : 
     545       39504 :           if (nlen == na) then
     546             :             ! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
     547       39504 :             p_col_o(nev+1:na) = -1
     548             :           endif
     549             :           call merge_systems_&
     550             :           &PRECISION &
     551             :                               (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
     552             :                                matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
     553       39504 :                                l_col(noff+1), p_col_o(noff+1), 0, 1, useGPU, wantDebug, success)
     554       39504 :           if (.not.(success)) return
     555             : 
     556             :         enddo
     557             : 
     558       39504 :         n = 2*n
     559             : 
     560             :       enddo
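
With ndiv = 4 and the limits from the worked example above (0, 26, 50, 76, 100), the while loop merges pairwise, bottom-up:

    n = 1:  merge [0,26) with [26,50),  and  [50,76) with [76,100)
    n = 2:  merge [0,50) with [50,100)   (nlen == na, the final merge)
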
     561             : 
     562       57384 :       deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errorMessage)
     563       57384 :       if (istat .ne. 0) then
     564           0 :         print *,"solve_tridi_col: error when deallocating l_col "//errorMessage
     565           0 :         stop 1
     566             :       endif
     567             : 
     568       57384 :       call obj%timer%stop("solve_tridi_col" // PRECISION_SUFFIX)
     569             : 
     570             :     end subroutine solve_tridi_col_&
     571       57384 :     &PRECISION_AND_SUFFIX
     572             : 
     573             :     recursive subroutine solve_tridi_single_problem_&
     574       58632 :     &PRECISION_AND_SUFFIX &
     575       58632 :     (obj, nlen, d, e, q, ldq, wantDebug, success)
     576             : 
     577             :    ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
     578             :    ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
     579             :      use precision
     580             :      use elpa_abstract_impl
     581             :      implicit none
     582             :      class(elpa_abstract_impl_t), intent(inout) :: obj
     583             :      integer(kind=ik)                         :: nlen, ldq
     584             :      real(kind=REAL_DATATYPE)                 :: d(nlen), e(nlen), q(ldq,nlen)
     585             : 
     586      117264 :      real(kind=REAL_DATATYPE), allocatable    :: work(:), qtmp(:), ds(:), es(:)
     587             :      real(kind=REAL_DATATYPE)                 :: dtmp
     588             : 
     589             :      integer(kind=ik)              :: i, j, lwork, liwork, info
     590       58632 :      integer(kind=ik), allocatable :: iwork(:)
     591             : 
     592             :      logical, intent(in)           :: wantDebug
     593             :      logical, intent(out)          :: success
      594             :      integer(kind=ik)              :: istat
      595             :      character(200)                :: errorMessage
     596             : 
     597       58632 :      call obj%timer%start("solve_tridi_single" // PRECISION_SUFFIX)
     598             : 
     599       58632 :      success = .true.
     600       58632 :      allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
     601       58632 :      if (istat .ne. 0) then
     602           0 :        print *,"solve_tridi_single: error when allocating ds "//errorMessage
     603           0 :        stop 1
     604             :      endif
     605             : 
     606             :      ! Save d and e for the case that dstedc fails
     607             : 
     608       58632 :      ds(:) = d(:)
     609       58632 :      es(:) = e(:)
     610             : 
      611             :      ! First try dstedc; this is normally faster, but it may fail sometimes (why???)
     612             : 
     613       58632 :      lwork = 1 + 4*nlen + nlen**2
     614       58632 :      liwork =  3 + 5*nlen
     615       58632 :      allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errorMessage)
     616       58632 :      if (istat .ne. 0) then
     617           0 :        print *,"solve_tridi_single: error when allocating work "//errorMessage
     618           0 :        stop 1
     619             :      endif
     620       58632 :      call obj%timer%start("blas")
     621       58632 :      call PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
     622       58632 :      call obj%timer%stop("blas")
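
The workspace sizes allocated above are LAPACK's documented minima for ?STEDC with COMPZ = 'I' and n > 1: lwork >= 1 + 4n + n**2 and liwork >= 3 + 5n. A minimal standalone call following the same pattern (hypothetical demo assuming a linked LAPACK; double precision for concreteness):

    ! Hypothetical demo, not ELPA code: plain DSTEDC on a 4x4 test matrix.
    program demo_dstedc
      implicit none
      integer, parameter :: n = 4
      double precision :: d(n), e(n-1), z(n,n)
      double precision, allocatable :: work(:)
      integer, allocatable :: iwork(:)
      integer :: lwork, liwork, info
      d = 2.0d0                    ! diagonal
      e = -1.0d0                   ! off-diagonal
      lwork  = 1 + 4*n + n**2      ! minimum for COMPZ='I', n > 1
      liwork = 3 + 5*n
      allocate(work(lwork), iwork(liwork))
      call dstedc('I', n, d, e, z, n, work, lwork, iwork, liwork, info)
      print '(a,i0)', 'info = ', info          ! 0 on success
      print '(a,4f10.6)', 'eigenvalues:', d    ! returned in ascending order
    end program demo_dstedc
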
     623             : 
     624       58632 :      if (info /= 0) then
     625             : 
     626             :        ! DSTEDC failed, try DSTEQR. The workspace is enough for DSTEQR.
     627             : 
     628           0 :        write(error_unit,'(a,i8,a)') 'Warning: Lapack routine DSTEDC failed, info= ',info,', Trying DSTEQR!'
     629             : 
     630           0 :        d(:) = ds(:)
     631           0 :        e(:) = es(:)
     632           0 :        call obj%timer%start("blas")
     633           0 :        call PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info)
     634           0 :        call obj%timer%stop("blas")
     635             : 
     636             :        ! If DSTEQR fails also, we don't know what to do further ...
     637             : 
     638           0 :        if (info /= 0) then
     639           0 :          if (wantDebug) &
     640           0 :            write(error_unit,'(a,i8,a)') 'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,', Aborting!'
      641           0 :          success = .false.
      642           0 :          return
      643             :        endif
      644             :      end if
     645             : 
     646       58632 :        deallocate(work,iwork,ds,es, stat=istat, errmsg=errorMessage)
     647       58632 :        if (istat .ne. 0) then
     648           0 :          print *,"solve_tridi_single: error when deallocating ds "//errorMessage
     649           0 :          stop 1
     650             :        endif
     651             : 
     652             :       ! Check if eigenvalues are monotonically increasing
      653             :       ! This seems not always to be the case (in the IBM implementation of dstedc ???)
     654             : 
     655     7860000 :       do i=1,nlen-1
     656     7801368 :         if (d(i+1)<d(i)) then
     657             : #ifdef DOUBLE_PRECISION_REAL
     658           0 :           if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then
     659             : #else
     660           0 :           if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then
     661             : #endif
     662           0 :             write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
     663             :           else
     664           0 :             write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
      665           0 :             write(error_unit,'(a)') 'The eigenvalues from a LAPACK call are not sorted to machine precision.'
      666           0 :             write(error_unit,'(a)') 'To this extent, this is completely harmless.'
     667           0 :             write(error_unit,'(a)') 'Still, we keep this info message just in case.'
     668             :           end if
     669           0 :           allocate(qtmp(nlen), stat=istat, errmsg=errorMessage)
     670           0 :           if (istat .ne. 0) then
     671           0 :             print *,"solve_tridi_single: error when allocating qtmp "//errorMessage
     672           0 :             stop 1
     673             :           endif
     674             : 
     675           0 :           dtmp = d(i+1)
     676           0 :           qtmp(1:nlen) = q(1:nlen,i+1)
     677           0 :           do j=i,1,-1
     678           0 :             if (dtmp<d(j)) then
     679           0 :               d(j+1)        = d(j)
     680           0 :               q(1:nlen,j+1) = q(1:nlen,j)
     681             :             else
     682           0 :               exit ! Loop
     683             :             endif
     684             :           enddo
     685           0 :           d(j+1)        = dtmp
     686           0 :           q(1:nlen,j+1) = qtmp(1:nlen)
     687           0 :           deallocate(qtmp, stat=istat, errmsg=errorMessage)
     688           0 :           if (istat .ne. 0) then
     689           0 :             print *,"solve_tridi_single: error when deallocating qtmp "//errorMessage
     690           0 :             stop 1
     691             :           endif
     692             : 
     693             :        endif
     694             :      enddo
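
The relative test above separates genuine monotonicity failures from harmless last-bit disorder. For adjacent double-precision values such as d(i) = 1 + 2^{-52} and d(i+1) = 1,

    \frac{|d_{i+1} - d_i|}{|d_{i+1} + d_i|} \approx \frac{2^{-52}}{2} \approx 1.1 \times 10^{-16} < 10^{-14},

so only the info message is printed. The fix-up that follows is a single insertion-sort step: d(i+1) and its eigenvector column are shifted left until the sequence is non-decreasing again.
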
     695       58632 :      call obj%timer%stop("solve_tridi_single" // PRECISION_SUFFIX)
     696             : 
     697             :     end subroutine solve_tridi_single_problem_&
     698       58632 :     &PRECISION_AND_SUFFIX
     699             : 

Generated by: LCOV version 1.12