LCOV - code coverage report
Current view: top level - src/elpa2 - elpa2_template.F90 (source / functions) Hit Total Coverage
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info Lines: 154 283 54.4 %
Date: 2018-01-10 09:29:53 Functions: 4 4 100.0 %

          Line data    Source code
       1             : !    This file is part of ELPA.
       2             : !
       3             : !    The ELPA library was originally created by the ELPA consortium,
       4             : !    consisting of the following organizations:
       5             : !
       6             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : !      Informatik,
      10             : !    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : !      Schwerpunkt Wissenschaftliches Rechnen ,
      12             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : !    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
      14             : !      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
      15             : !      and
      16             : !    - IBM Deutschland GmbH
      17             : !
      18             : !    This particular source code file contains additions, changes and
      19             : !    enhancements authored by Intel Corporation which is not part of
      20             : !    the ELPA consortium.
      21             : !
      22             : !    More information can be found here:
      23             : !    http://elpa.mpcdf.mpg.de/
      24             : !
      25             : !    ELPA is free software: you can redistribute it and/or modify
      26             : !    it under the terms of the version 3 of the license of the
      27             : !    GNU Lesser General Public License as published by the Free
      28             : !    Software Foundation.
      29             : !
      30             : !    ELPA is distributed in the hope that it will be useful,
      31             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : !    GNU Lesser General Public License for more details.
      34             : !
      35             : !    You should have received a copy of the GNU Lesser General Public License
      36             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : !
      38             : !    ELPA reflects a substantial effort on the part of the original
      39             : !    ELPA consortium, and we ask you to respect the spirit of the
      40             : !    license that we chose: i.e., please contribute any changes you
      41             : !    may have back to the original ELPA library distribution, and keep
      42             : !    any derivatives of ELPA under the same license that we chose for
      43             : !    the original distribution, the GNU Lesser General Public License.
      44             : !
      45             : !
      46             : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
      47             : !
      48             : ! Copyright of the original code rests with the authors inside the ELPA
      49             : ! consortium. The copyright of any additional modifications shall rest
      50             : ! with their original authors, but shall adhere to the licensing terms
      51             : ! distributed along with the original code in the file "COPYING".
      52       47448 :  function elpa_solve_evp_&
      53             :   &MATH_DATATYPE&
      54             :   &_&
      55             :   &2stage_&
      56             :   &PRECISION&
      57       46296 :   &_impl (obj, a, ev, q) result(success)
      58             : 
      59             :    use elpa_abstract_impl
      60             :    use elpa_utilities
      61             :    use elpa1_compute
      62             :    use elpa2_compute
      63             :    use elpa_mpi
      64             :    use cuda_functions
      65             :    use mod_check_for_gpu
      66             :    use iso_c_binding
      67             :    implicit none
      68             : #include "../general/precision_kinds.F90"
      69             :    class(elpa_abstract_impl_t), intent(inout)                         :: obj
      70             :    logical                                                            :: useGPU
      71             : #if REALCASE == 1
      72             :    logical                                                            :: useQR
      73             :    logical                                                            :: useQRActual
      74             : #endif
      75             :    integer(kind=c_int)                                                :: kernel
      76             : 
      77             : #ifdef USE_ASSUMED_SIZE
      78             :    MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout)                 :: a(obj%local_nrows,*)
      79             :    MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,*)
      80             : #else
      81             :    MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout)                 :: a(obj%local_nrows,obj%local_ncols)
      82             :    MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
      83             : #endif
      84             :    real(kind=C_DATATYPE_KIND), intent(inout)                          :: ev(obj%na)
      85       47448 :    MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable                   :: hh_trans(:,:)
      86             : 
      87             :    integer(kind=c_int)                                                :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
      88             :    integer(kind=c_int)                                                :: nbw, num_blocks
      89             : #if COMPLEXCASE == 1
      90             :    integer(kind=c_int)                                                :: l_cols_nev, l_rows, l_cols
      91             : #endif
      92       47448 :    MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable                   :: tmat(:,:,:)
      93       47448 :    real(kind=C_DATATYPE_KIND), allocatable                            :: e(:)
      94             : #if COMPLEXCASE == 1
      95       18864 :    real(kind=C_DATATYPE_KIND), allocatable                            :: q_real(:,:)
      96             : #endif
      97       47448 :    MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target           :: q_dummy(:,:)
      98             :    MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer                       :: q_actual(:,:)
      99             : 
     100             : 
     101             :    integer(kind=c_intptr_t)                                           :: tmat_dev, q_dev, a_dev
     102             : 
     103             :    integer(kind=c_int)                                                :: i
     104             :    logical                                                            :: success, successCUDA
     105             :    logical                                                            :: wantDebug
     106             :    integer(kind=c_int)                                                :: istat, gpu, debug, qr
     107             :    character(200)                                                     :: errorMessage
     108             :    logical                                                            :: do_useGPU, do_useGPU_trans_ev_tridi
     109             :    integer(kind=c_int)                                                :: numberOfGPUDevices
     110             :    integer(kind=c_intptr_t), parameter                                :: size_of_datatype = size_of_&
     111             :                                                                                             &PRECISION&
     112             :                                                                                             &_&
     113             :                                                                                             &MATH_DATATYPE
     114             :     integer(kind=ik)                                                  :: na, nev, lda, ldq, nblk, matrixCols, &
     115             :                                                                          mpi_comm_rows, mpi_comm_cols,        &
     116             :                                                                          mpi_comm_all, check_pd, error
     117             : 
     118             :     logical                                                           :: do_bandred, do_tridiag, do_solve_tridi,  &
     119             :                                                                          do_trans_to_band, do_trans_to_full
     120             : 
     121             :     call obj%timer%start("elpa_solve_evp_&
     122             :     &MATH_DATATYPE&
     123             :     &_2stage_&
     124             :     &PRECISION&
     125       47448 :     &")
     126             : 
     127       47448 :     success = .true.
     128             : 
     129       47448 :     if (present(q)) then
     130       46296 :       obj%eigenvalues_only = .false.
     131             :     else
     132        1152 :       obj%eigenvalues_only = .true.
     133             :     endif
     134             : 
     135       47448 :     na         = obj%na
     136       47448 :     nev        = obj%nev
     137       47448 :     lda        = obj%local_nrows
     138       47448 :     ldq        = obj%local_nrows
     139       47448 :     nblk       = obj%nblk
     140       47448 :     matrixCols = obj%local_ncols
     141             : 
     142             :    ! special case na = 1
     143       47448 :    if (na .eq. 1) then
     144             : #if REALCASE == 1
     145           0 :      ev(1) = a(1,1)
     146             : #endif
     147             : #if COMPLEXCASE == 1
     148           0 :      ev(1) = real(a(1,1))
     149             : #endif
     150           0 :      if (.not.(obj%eigenvalues_only)) then
     151           0 :        q(1,1) = ONE
     152             :      endif
     153             :      call obj%timer%stop("elpa_solve_evp_&
     154             :      &MATH_DATATYPE&
     155             :      &_2stage_&
     156             :      &PRECISION&
     157           0 :      &")
     158           0 :      return
     159             :    endif
     160             : 
     161       47448 :    if (nev == 0) then
     162           0 :      nev = 1
     163           0 :      obj%eigenvalues_only = .true.
     164             :    endif
     165             : 
     166             : #if REALCASE == 1
     167       28584 :     call obj%get("real_kernel",kernel,error)
     168       28584 :     if (error .ne. ELPA_OK) then
     169           0 :       print *,"Problem getting option. Aborting..."
     170           0 :       stop
     171             :     endif
     172             :     ! check consistency between request for GPUs and defined kernel
     173       28584 :     call obj%get("gpu", gpu,error)
     174       28584 :     if (error .ne. ELPA_OK) then
     175           0 :       print *,"Problem getting option. Aborting..."
     176           0 :       stop
     177             :     endif
     178       28584 :     if (gpu == 1) then
     179           0 :       if (kernel .ne. ELPA_2STAGE_REAL_GPU) then
     180           0 :         write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
     181           0 :         write(error_unit,*) "The compute kernel will be executed on CPUs!"
     182           0 :       else if (nblk .ne. 128) then
     183           0 :         kernel = ELPA_2STAGE_REAL_GENERIC
     184             :       endif
     185             :     endif
     186       28584 :     if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
     187           0 :       if (gpu .ne. 1) then
     188           0 :         write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
     189             :       endif
     190             :     endif
     191             : 
     192             : #ifdef SINGLE_PRECISION_REAL
     193             :     ! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
     194             :     if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
     195        7848 :         kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4 .or. &
     196             :         kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6        ) then
     197           0 :         write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for POWER8"
     198           0 :         write(error_unit,*) "The GENERIC kernel will be used at the moment"
     199           0 :         kernel = ELPA_2STAGE_REAL_GENERIC
     200             :     endif
     201             :     ! special case at the moment NO single precision kernels on SPARC64 -> set GENERIC for now
     202             :     if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
     203        7848 :         kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. &
     204             :         kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6        ) then
     205           0 :         write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
     206           0 :         write(error_unit,*) "The GENERIC kernel will be used at the moment"
     207           0 :         kernel = ELPA_2STAGE_REAL_GENERIC
     208             :     endif
     209             : #endif
     210             : 
     211             : #endif
     212             : 
     213             : #if COMPLEXCASE == 1
     214       18864 :     call obj%get("complex_kernel",kernel,error)
     215       18864 :     if (error .ne. ELPA_OK) then
     216           0 :       print *,"Problem getting option. Aborting..."
     217           0 :       stop
     218             :     endif
     219             :     ! check consistency between request for GPUs and defined kernel
     220       18864 :     call obj%get("gpu", gpu,error)
     221       18864 :     if (error .ne. ELPA_OK) then
     222           0 :       print *,"Problem getting option. Aborting..."
     223           0 :       stop
     224             :     endif
     225       18864 :     if (gpu == 1) then
     226           0 :       if (kernel .ne. ELPA_2STAGE_COMPLEX_GPU) then
     227           0 :         write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
     228           0 :         write(error_unit,*) "The compute kernel will be executed on CPUs!"
     229           0 :       else if (nblk .ne. 128) then
     230           0 :         kernel = ELPA_2STAGE_COMPLEX_GENERIC
     231             :       endif
     232             :     endif
     233       18864 :     if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
     234           0 :       if (gpu .ne. 1) then
     235           0 :         write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
     236             :       endif
     237             :     endif
     238             : 
     239             : #endif
     240       47448 :     call obj%get("mpi_comm_rows",mpi_comm_rows,error)
     241       47448 :     if (error .ne. ELPA_OK) then
     242           0 :       print *,"Problem getting option. Aborting..."
     243           0 :       stop
     244             :     endif
     245       47448 :     call obj%get("mpi_comm_cols",mpi_comm_cols,error)
     246       47448 :     if (error .ne. ELPA_OK) then
     247           0 :       print *,"Problem getting option. Aborting..."
     248           0 :       stop
     249             :     endif
     250       47448 :     call obj%get("mpi_comm_parent",mpi_comm_all,error)
     251       47448 :     if (error .ne. ELPA_OK) then
     252           0 :       print *,"Problem getting option. Aborting..."
     253           0 :       stop
     254             :     endif
     255             : 
     256       47448 :     if (gpu .eq. 1) then
     257           0 :       useGPU = .true.
     258             :     else
     259       47448 :       useGPU = .false.
     260             :     endif
     261             : 
     262             : #if REALCASE == 1
     263       28584 :     call obj%get("qr",qr,error)
     264       28584 :     if (error .ne. ELPA_OK) then
     265           0 :       print *,"Problem getting option. Aborting..."
     266           0 :       stop
     267             :     endif
     268       28584 :     if (qr .eq. 1) then
     269           0 :       useQR = .true.
     270             :     else
     271       28584 :       useQR = .false.
     272             :     endif
     273             : 
     274             : #endif
     275       47448 :     call obj%timer%start("mpi_communication")
     276       47448 :     call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
     277       47448 :     call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
     278             : 
     279       47448 :     call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
     280       47448 :     call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
     281       47448 :     call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
     282       47448 :     call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
     283       47448 :     call obj%timer%stop("mpi_communication")
     284             : 
     285       47448 :     call obj%get("debug",debug,error)
     286       47448 :     if (error .ne. ELPA_OK) then
     287           0 :       print *,"Problem getting option. Aborting..."
     288           0 :       stop
     289             :     endif
     290       47448 :     wantDebug = debug == 1
     291             : 
     292       47448 :     do_useGPU      = .false.
     293       47448 :     do_useGPU_trans_ev_tridi =.false.
     294             : 
     295             : 
     296             : #if REALCASE == 1
     297       28584 :     useQRActual = .false.
     298             :     ! set usage of qr decomposition via API call
     299       28584 :     if (useQR) useQRActual = .true.
     300       28584 :     if (.not.(useQR)) useQRACtual = .false.
     301             : 
     302       28584 :     if (useQRActual) then
     303           0 :       if (mod(na,2) .ne. 0) then
     304           0 :         if (wantDebug) then
     305           0 :           write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
     306             :         endif
     307           0 :         print *, "Do not use QR-decomposition for this matrix and blocksize."
     308           0 :         success = .false.
     309           0 :         return
     310             :       endif
     311             :     endif
     312             : #endif /* REALCASE */
     313             : 
     314       47448 :     if (useGPU) then
     315           0 :       if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
     316             : 
     317           0 :          do_useGPU = .true.
     318             : 
     319             :          ! set the neccessary parameters
     320           0 :          cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
     321           0 :          cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
     322           0 :          cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
     323           0 :          cudaHostRegisterPortable = cuda_hostRegisterPortable()
     324           0 :          cudaHostRegisterMapped   = cuda_hostRegisterMapped()
     325             :       else
     326           0 :         print *,"GPUs are requested but not detected! Aborting..."
     327           0 :         success = .false.
     328           0 :         return
     329             :       endif
     330             :     else
     331             :       ! check whether set by environment variable
     332       47448 :       call obj%get("gpu",gpu,error)
     333       47448 :       if (error .ne. ELPA_OK) then
     334           0 :         print *,"Problem getting option. Aborting..."
     335           0 :         stop
     336             :       endif
     337       47448 :       do_useGPU = gpu == 1
     338       47448 :       if (do_useGPU) then
     339           0 :         if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
     340             : 
     341             :            ! set the neccessary parameters
     342           0 :            cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
     343           0 :            cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
     344           0 :            cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
     345           0 :            cudaHostRegisterPortable = cuda_hostRegisterPortable()
     346           0 :            cudaHostRegisterMapped   = cuda_hostRegisterMapped()
     347             :         else
     348           0 :           print *,"GPUs are requested but not detected! Aborting..."
     349           0 :           success = .false.
     350           0 :           return
     351             :         endif
     352             :       endif
     353             :     endif
     354             : 
     355             :     ! check consistency between request for GPUs and defined kernel
     356       47448 :     if (do_useGPU) then
     357           0 :       if (nblk .ne. 128) then
     358             :         ! cannot run on GPU with this blocksize
     359             :         ! disable GPU usage for trans_ev_tridi
     360           0 :         do_useGPU_trans_ev_tridi = .false.
     361             :       else
     362             : #if REALCASE == 1
     363           0 :         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
     364             : #endif
     365             : #if COMPLEXCASE == 1
     366           0 :         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
     367             : #endif
     368           0 :           do_useGPU_trans_ev_tridi = .true.
     369             :         else
     370           0 :           do_useGPU_trans_ev_tridi = .false.
     371             :         endif
     372             :       endif
     373             :     endif
     374             : 
     375             : 
     376             : 
     377       47448 :     if (.not. obj%eigenvalues_only) then
     378       46296 :       q_actual => q(1:obj%local_nrows,1:obj%local_ncols)
     379             :     else
     380        1152 :      allocate(q_dummy(1:obj%local_nrows,1:obj%local_ncols))
     381        1152 :      q_actual => q_dummy(1:obj%local_nrows,1:obj%local_ncols)
     382             :     endif
     383             : 
     384             : 
     385             :     ! set the default values for each of the 5 compute steps
     386       47448 :     do_bandred        = .true.
     387       47448 :     do_tridiag        = .true.
     388       47448 :     do_solve_tridi    = .true.
     389       47448 :     do_trans_to_band  = .true.
     390       47448 :     do_trans_to_full  = .true.
     391             : 
     392       47448 :     if (obj%eigenvalues_only) then
     393        1152 :       do_trans_to_band  = .false.
     394        1152 :       do_trans_to_full  = .false.
     395             :     endif
     396             : 
     397       47448 :     if (obj%is_set("bandwidth") == 1) then
     398         864 :       call obj%get("bandwidth",nbw,error)
     399         864 :       if (nbw == 0) then
     400           0 :         if (wantDebug) then
     401           0 :           write(error_unit,*) "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
     402           0 :                               "for a diagonal matrix! This is too simple"
     403             :           endif
     404           0 :         print *, "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
     405           0 :                  "for a diagonal matrix! This is too simple"
     406           0 :         success = .false.
     407           0 :         return
     408             :       endif
     409         864 :       if (mod(nbw, nblk) .ne. 0) then
     410             :         ! treat matrix with an effective bandwidth slightly bigger than specified bandwidth
     411             :         ! such that effective bandwidth is a multiply of nblk. which is a prerequiste for ELPA
     412           0 :         nbw = nblk * ceiling(real(nbw,kind=c_double)/real(nblk,kind=c_double))
     413             : 
     414             :         ! just check that effective bandwidth is NOT larger than matrix size
     415           0 :         if (nbw .gt. na) then
     416           0 :           if (wantDebug) then
     417           0 :             write(error_unit,*) "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
     418           0 :                                 "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
     419           0 :                                 "solve your problem by not specifing a bandwidth"
     420             :           endif
     421           0 :           print *, "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
     422           0 :                                 "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
     423           0 :                                 "solve your problem by not specifing a bandwidth"
     424           0 :           success = .false.
     425           0 :           return
     426             :         endif
     427             :       endif
     428         864 :       do_bandred       = .false. ! we already have a banded matrix
     429         864 :       do_solve_tridi   = .true.  ! we also have to solve something :-)
     430         864 :       do_trans_to_band = .true.  ! and still we have to backsub to banded
     431         864 :       do_trans_to_full = .false. ! but not to full since we have a banded matrix
     432             :     else ! bandwidth is not set
     433             : 
     434             :       ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
     435             :       ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
     436             :       ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
     437             :       ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
     438             :       ! on this and maybe allow a run-time optimization here
     439       46584 :       if (do_useGPU) then
     440           0 :         nbw = nblk
     441             :       else
     442             : #if REALCASE == 1
     443       28152 :         nbw = (63/nblk+1)*nblk
     444             : #elif COMPLEXCASE == 1
     445       18432 :         nbw = (31/nblk+1)*nblk
     446             : #endif
     447             :       endif
     448             : 
     449       46584 :       num_blocks = (na-1)/nbw + 1
     450             : 
     451       46584 :       allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
     452       46584 :       if (istat .ne. 0) then
     453             :         print *,"solve_evp_&
     454             :         &MATH_DATATYPE&
     455             :         &_2stage_&
     456             :         &PRECISION&
     457           0 :         &" // ": error when allocating tmat "//errorMessage
     458           0 :         stop 1
     459             :       endif
     460             : 
     461       46584 :       do_bandred       = .true.
     462       46584 :       do_solve_tridi   = .true.
     463       46584 :       do_trans_to_band = .true.
     464       46584 :       do_trans_to_full = .true.
     465             :     end if  ! matrix not already banded on input
     466             : 
     467             :     ! start the computations in 5 steps
     468             : 
     469       47448 :     if (do_bandred) then
     470       46584 :       call obj%timer%start("bandred")
     471             :       ! Reduction full -> band
     472             :       call bandred_&
     473             :       &MATH_DATATYPE&
     474             :       &_&
     475             :       &PRECISION &
     476             :       (obj, na, a, &
     477             :       a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
     478             :       tmat_dev,  wantDebug, do_useGPU, success &
     479             : #if REALCASE == 1
     480             :       , useQRActual &
     481             : #endif
     482       46584 :       )
     483       46584 :       call obj%timer%stop("bandred")
     484       46584 :       if (.not.(success)) return
     485             :     endif
     486             : 
     487             : 
     488             :      ! Reduction band -> tridiagonal
     489       47448 :      if (do_tridiag) then
     490       47448 :        allocate(e(na), stat=istat, errmsg=errorMessage)
     491       47448 :        if (istat .ne. 0) then
     492             :          print *,"solve_evp_&
     493             :          &MATH_DATATYPE&
     494             :          &_2stage_&
     495           0 :          &PRECISION " // ": error when allocating e "//errorMessage
     496           0 :          stop 1
     497             :        endif
     498             : 
     499       47448 :        call obj%timer%start("tridiag")
     500             :        call tridiag_band_&
     501             :        &MATH_DATATYPE&
     502             :        &_&
     503             :        &PRECISION&
     504             :        (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
     505       47448 :         do_useGPU, wantDebug)
     506             : 
     507             : #ifdef WITH_MPI
     508       31632 :        call obj%timer%start("mpi_communication")
     509       31632 :        call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
     510       31632 :        call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
     511       31632 :        call obj%timer%stop("mpi_communication")
     512             : #endif /* WITH_MPI */
     513       79080 :        call obj%timer%stop("tridiag")
     514             :      endif ! do_tridiag
     515             : 
     516             : #if COMPLEXCASE == 1
     517       18864 :      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
     518       18864 :      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
     519       18864 :      l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
     520             : 
     521       18864 :      allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
     522       18864 :      if (istat .ne. 0) then
     523             :        print *,"solve_evp_&
     524             :        &MATH_DATATYPE&
     525           0 :        &_2stage: error when allocating q_real"//errorMessage
     526           0 :        stop 1
     527             :      endif
     528             : #endif
     529             : 
     530             :      ! Solve tridiagonal system
     531       47448 :      if (do_solve_tridi) then
     532       47448 :        call obj%timer%start("solve")
     533             :        call solve_tridi_&
     534             :        &PRECISION &
     535             :        (obj, na, nev, ev, e, &
     536             : #if REALCASE == 1
     537             :        q_actual, ldq,   &
     538             : #endif
     539             : #if COMPLEXCASE == 1
     540             :        q_real, ubound(q_real,dim=1), &
     541             : #endif
     542       47448 :        nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU, wantDebug, success)
     543       47448 :        call obj%timer%stop("solve")
     544       47448 :        if (.not.(success)) return
     545             :      endif ! do_solve_tridi
     546             : 
     547       47448 :      deallocate(e, stat=istat, errmsg=errorMessage)
     548       47448 :      if (istat .ne. 0) then
     549             :        print *,"solve_evp_&
     550             :        &MATH_DATATYPE&
     551           0 :        &_2stage: error when deallocating e "//errorMessage
     552           0 :        stop 1
     553             :      endif
     554             : 
     555       47448 :      if (obj%eigenvalues_only) then
     556        1152 :        do_trans_to_band = .false.
     557        1152 :        do_trans_to_full = .false.
     558             :      else
     559             : 
     560       46296 :        call obj%get("check_pd",check_pd,error)
     561       46296 :        if (error .ne. ELPA_OK) then
     562           0 :          print *,"Problem getting option. Aborting..."
     563           0 :          stop
     564             :        endif
     565       46296 :        if (check_pd .eq. 1) then
     566           0 :          check_pd = 0
     567           0 :          do i = 1, na
     568           0 :            if (ev(i) .gt. THRESHOLD) then
     569           0 :              check_pd = check_pd + 1
     570             :            endif
     571             :          enddo
     572           0 :          if (check_pd .lt. na) then
     573             :            ! not positiv definite => eigenvectors needed
     574           0 :            do_trans_to_band = .true.
     575           0 :            do_trans_to_full = .true.
     576             :          else
     577           0 :            do_trans_to_band = .false.
     578           0 :            do_trans_to_full = .false.
     579             :          endif
     580             :        endif
     581             :      endif ! eigenvalues only
     582             : 
     583       47448 :      if (do_trans_to_band) then
     584             : #if COMPLEXCASE == 1
     585             :        ! q must be given thats why from here on we can use q and not q_actual
     586             : 
     587       18432 :        q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
     588             : 
     589       18432 :        deallocate(q_real, stat=istat, errmsg=errorMessage)
     590       18432 :        if (istat .ne. 0) then
     591             :          print *,"solve_evp_&
     592             :          &MATH_DATATYPE&
     593           0 :          &_2stage: error when deallocating q_real"//errorMessage
     594           0 :          stop 1
     595             :        endif
     596             : #endif
     597             : 
     598             :        ! Backtransform stage 1
     599       46296 :        call obj%timer%start("trans_ev_to_band")
     600             : 
     601             :        call trans_ev_tridi_to_band_&
     602             :        &MATH_DATATYPE&
     603             :        &_&
     604             :        &PRECISION &
     605             :        (obj, na, nev, nblk, nbw, q, &
     606             :        q_dev, &
     607             :        ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi, &
     608       46296 :        success=success, kernel=kernel)
     609       46296 :        call obj%timer%stop("trans_ev_to_band")
     610             : 
     611       46296 :        if (.not.(success)) return
     612             : 
     613             :        ! We can now deallocate the stored householder vectors
     614       46296 :        deallocate(hh_trans, stat=istat, errmsg=errorMessage)
     615       46296 :        if (istat .ne. 0) then
     616             :          print *, "solve_evp_&
     617             :          &MATH_DATATYPE&
     618             :          &_2stage_&
     619           0 :          &PRECISION " // ": error when deallocating hh_trans "//errorMessage
     620           0 :          stop 1
     621             :        endif
     622             :      endif ! do_trans_to_band
     623             : 
     624       47448 :      if (do_trans_to_full) then
     625       45432 :        call obj%timer%start("trans_ev_to_full")
     626       45432 :        if ( (do_useGPU) .and. .not.(do_useGPU_trans_ev_tridi) ) then
     627             :          ! copy to device if we want to continue on GPU
     628           0 :          successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
     629             : 
     630           0 :          successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
     631             :        endif
     632             : 
     633             :        ! Backtransform stage 2
     634             : 
     635             :        call trans_ev_band_to_full_&
     636             :        &MATH_DATATYPE&
     637             :        &_&
     638             :        &PRECISION &
     639             :        (obj, na, nev, nblk, nbw, a, &
     640             :        a_dev, lda, tmat, tmat_dev,  q,  &
     641             :        q_dev, &
     642             :        ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU &
     643             : #if REALCASE == 1
     644             :        , useQRActual  &
     645             : #endif
     646       45432 :        )
     647             : 
     648             : 
     649       45432 :        deallocate(tmat, stat=istat, errmsg=errorMessage)
     650       45432 :        if (istat .ne. 0) then
     651             :          print *,"solve_evp_&
     652             :          &MATH_DATATYPE&
     653             :          &_2stage_&
     654           0 :          &PRECISION " // ": error when deallocating tmat"//errorMessage
     655           0 :          stop 1
     656             :        endif
     657       45432 :        call obj%timer%stop("trans_ev_to_full")
     658             :      endif ! do_trans_to_full
     659             : 
     660       47448 :      if (obj%eigenvalues_only) then
     661        1152 :        deallocate(q_dummy, stat=istat, errmsg=errorMessage)
     662        1152 :        if (istat .ne. 0) then
     663             :          print *,"solve_evp_&
     664             :          &MATH_DATATYPE&
     665             :          &_1stage_&
     666             :          &PRECISION&
     667           0 :          &" // ": error when deallocating q_dummy "//errorMessage
     668           0 :          stop 1
     669             :        endif
     670             :      endif
     671             : 
     672             :      call obj%timer%stop("elpa_solve_evp_&
     673             :      &MATH_DATATYPE&
     674             :      &_2stage_&
     675             :     &PRECISION&
     676       47448 :     &")
     677             : 1    format(a,f10.3)
     678             : 
     679             :    end function elpa_solve_evp_&
     680             :    &MATH_DATATYPE&
     681             :    &_2stage_&
     682             :    &PRECISION&
     683       94896 :    &_impl
     684             : 
     685             : ! vim: syntax=fortran

Generated by: LCOV version 1.12