LCOV - coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info - src/elpa2/elpa2_compute_real

LCOV - code coverage report

Current view:	top level - src/elpa2 - elpa2_compute_real_template.F90 (source / functions)		Hit	Total	Coverage
Test:	coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info	Lines:	0	199	0.0 %
Date:	2018-01-10 09:29:53	Functions:	0	10	0.0 %

          Line data    Source code

       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
      12             : !      Schwerpunkt Wissenschaftliches Rechnen ,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      14             : !    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
      15             : !      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !    This particular source code file contains additions, changes and
      20             : !    enhancements authored by Intel Corporation which is not part of
      21             : !    the ELPA consortium.
      22             : !
      23             : !    More information can be found here:
      24             : !    http://elpa.mpcdf.mpg.de/
      25             : !
      26             : !    ELPA is free software: you can redistribute it and/or modify
      27             : !    it under the terms of the version 3 of the license of the
      28             : !    GNU Lesser General Public License as published by the Free
      29             : !    Software Foundation.
      30             : !
      31             : !    ELPA is distributed in the hope that it will be useful,
      32             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      33             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      34             : !    GNU Lesser General Public License for more details.
      35             : !
      36             : !    You should have received a copy of the GNU Lesser General Public License
      37             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      38             : !
      39             : !    ELPA reflects a substantial effort on the part of the original
      40             : !    ELPA consortium, and we ask you to respect the spirit of the
      41             : !    license that we chose: i.e., please contribute any changes you
      42             : !    may have back to the original ELPA library distribution, and keep
      43             : !    any derivatives of ELPA under the same license that we chose for
      44             : !    the original distribution, the GNU Lesser General Public License.
      45             : !
      46             : ! ELPA2 -- 2-stage solver for ELPA
      47             : !
      48             : ! Copyright of the original code rests with the authors inside the ELPA
      49             : ! consortium. The copyright of any additional modifications shall rest
      50             : ! with their original authors, but shall adhere to the licensing terms
      51             : ! distributed along with the original code in the file "COPYING".
      52             : !
      53             : ! Author: Andreas Marek, MPCDF
      54             : #endif
      55             : 
      56             : #include "../general/sanity.F90"
      57             : 
      58             : #define REALCASE 1
      59             : #undef COMPLEXCASE
      60             : #include "elpa2_bandred_template.F90"
      61             : #define REALCASE 1
      62             : #include "elpa2_symm_matrix_allreduce_real_template.F90"
      63             : #undef REALCASE
      64             : #define REALCASE 1
      65             : #include "elpa2_trans_ev_band_to_full_template.F90"
      66             : #include "elpa2_tridiag_band_template.F90"
      67             : #include "elpa2_trans_ev_tridi_to_band_template.F90"
      68             : 
      69             : 
      70             : 
      71             :     subroutine band_band_real_&
      72           0 : &PRECISION &
      73           0 :                   (obj, na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, communicator)
      74             :     !-------------------------------------------------------------------------------
      75             :     ! band_band_real:
      76             :     ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored.
      77             :     ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with
      78             :     ! zero entries, if this
      79             :     ! requirement doesn't hold)
      80             :     !
      81             :     !  na          Order of matrix
      82             :     !
      83             :     !  nb          Semi bandwidth of original matrix
      84             :     !
      85             :     !  nb2         Semi bandwidth of target matrix
      86             :     !
      87             :     !  ab          Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout
      88             :     !              has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb)
      89             :     !              are located on rank n.
      90             :     !
      91             :     !  ab2         Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is
      92             :     !              accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located
      93             :     !              on rank n.
      94             :     !
      95             :     !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
      96             :     !
      97             :     !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
      98             :     !
      99             :     !  communicator
     100             :     !              MPI-Communicator for the total processor set
     101             :     !-------------------------------------------------------------------------------
     102             :       use elpa_abstract_impl
     103             :       use elpa2_workload
     104             :       use precision
     105             :       implicit none
     106             : #include "../general/precision_kinds.F90"
     107             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     108             :       integer(kind=ik), intent(in)               :: na, nb, nbCol, nb2, nb2Col, communicator
     109             :       real(kind=rk), intent(inout)               :: ab(2*nb,nbCol) ! removed assumed size
     110             :       real(kind=rk), intent(inout)               :: ab2(2*nb2,nb2Col) ! removed assumed size
     111             :       real(kind=rk), intent(out)                 :: d(na), e(na) ! set only on PE 0
     112             : 
     113           0 :       real(kind=rk)                              :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), &
     114           0 :                                                   tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2)
     115             : 
     116           0 :       real(kind=rk)                              :: work(nb*nb2), work2(nb2*nb2)
     117             :       integer(kind=ik)                         :: lwork, info
     118             : 
     119             :       integer(kind=ik)                         :: istep, i, n, dest
     120             :       integer(kind=ik)                         :: n_off, na_s
     121             :       integer(kind=ik)                         :: my_pe, n_pes, mpierr
     122             :       integer(kind=ik)                         :: nblocks_total, nblocks
     123             :       integer(kind=ik)                         :: nblocks_total2, nblocks2
     124             :       integer(kind=ik)                         :: ireq_ab, ireq_hv
     125             : #ifdef WITH_MPI
     126             : !      integer(kind=ik)                         :: MPI_STATUS_IGNORE(MPI_STATUS_SIZE)
     127             : #endif
     128             : !      integer(kind=ik), allocatable            :: mpi_statuses(:,:)
     129           0 :       integer(kind=ik), allocatable            :: block_limits(:), block_limits2(:), ireq_ab2(:)
     130             : 
     131             :       integer(kind=ik)                         :: j, nc, nr, ns, ne, iblk
     132             :       integer(kind=ik)                         :: istat
     133             :       character(200)                           :: errorMessage
     134             : 
     135           0 :       call obj%timer%start("band_band_real" // PRECISION_SUFFIX)
     136             : 
     137           0 :       call obj%timer%start("mpi_communication")
     138           0 :       call mpi_comm_rank(communicator,my_pe,mpierr)
     139           0 :       call mpi_comm_size(communicator,n_pes,mpierr)
     140           0 :       call obj%timer%stop("mpi_communication")
     141             : 
     142             :       ! Total number of blocks in the band:
     143           0 :       nblocks_total = (na-1)/nb + 1
     144           0 :       nblocks_total2 = (na-1)/nb2 + 1
     145             : 
     146             :       ! Set work distribution
     147           0 :       allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
     148           0 :       if (istat .ne. 0) then
     149           0 :         print *,"error allocating block_limits "//errorMessage
     150           0 :         stop 1
     151             :       endif
     152           0 :       call divide_band(obj, nblocks_total, n_pes, block_limits)
     153             : 
     154           0 :       allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage)
     155           0 :       if (istat .ne. 0) then
     156           0 :         print *,"error allocating block_limits2 "//errorMessage
     157           0 :         stop 1
     158             :       endif
     159             : 
     160           0 :       call divide_band(obj, nblocks_total2, n_pes, block_limits2)
     161             : 
     162             :       ! nblocks: the number of blocks for my task
     163           0 :       nblocks = block_limits(my_pe+1) - block_limits(my_pe)
     164           0 :       nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe)
     165             : 
     166           0 :       allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage)
     167           0 :       if (istat .ne. 0) then
     168           0 :         print *,"error allocating ireq_ab2 "//errorMessage
     169           0 :         stop 1
     170             :       endif
     171             : 
     172             : #ifdef WITH_MPI
     173           0 :       call obj%timer%start("mpi_communication")
     174             : 
     175           0 :       ireq_ab2 = MPI_REQUEST_NULL
     176             : 
     177           0 :       if (nb2>1) then
     178           0 :         do i=0,nblocks2-1
     179             : 
     180           0 :           call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, MPI_REAL_PRECISION, 0, 3, communicator, ireq_ab2(i+1), mpierr)
     181             :         enddo
     182             :       endif
     183           0 :       call obj%timer%stop("mpi_communication")
     184             : 
     185             : #else /* WITH_MPI */
     186             :       ! carefull the "recieve" has to be done at the corresponding send or wait
     187             : !      if (nb2>1) then
     188             : !        do i=0,nblocks2-1
     189             : !          ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2)
     190             : !        enddo
     191             : !      endif
     192             : 
     193             : #endif /* WITH_MPI */
     194             :       ! n_off: Offset of ab within band
     195           0 :       n_off = block_limits(my_pe)*nb
     196           0 :       lwork = nb*nb2
     197           0 :       dest = 0
     198             : #ifdef WITH_MPI
     199           0 :       ireq_ab = MPI_REQUEST_NULL
     200           0 :       ireq_hv = MPI_REQUEST_NULL
     201             : #endif
     202             :       ! ---------------------------------------------------------------------------
     203             :       ! Start of calculations
     204             : 
     205           0 :       na_s = block_limits(my_pe)*nb + 1
     206             : 
     207           0 :       if (my_pe>0 .and. na_s<=na) then
     208             :         ! send first nb2 columns to previous PE
     209             :         ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
     210           0 :         do i=1,nb2
     211           0 :           ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
     212             :         enddo
     213             : #ifdef WITH_MPI
     214           0 :         call obj%timer%start("mpi_communication")
     215             : 
     216           0 :         call mpi_isend(ab_s, (nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, communicator, ireq_ab, mpierr)
     217           0 :         call obj%timer%stop("mpi_communication")
     218             : #endif /* WITH_MPI */
     219             :       endif
     220             : 
     221           0 :       do istep=1,na/nb2
     222             : 
     223           0 :         if (my_pe==0) then
     224             : 
     225           0 :           n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced
     226           0 :           hv(:,:) = 0.0_rk
     227           0 :           tau(:) = 0.0_rk
     228             : 
     229             :           ! The last step (istep=na-1) is only needed for sending the last HH vectors.
     230             :           ! We don't want the sign of the last element flipped (analogous to the other sweeps)
     231           0 :           if (istep < na/nb2) then
     232             : 
     233             :             ! Transform first block column of remaining matrix
     234           0 :       call obj%timer%start("blas")
     235           0 :             call PRECISION_GEQRF(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
     236           0 :       call obj%timer%stop("blas")
     237             : 
     238           0 :             do i=1,nb2
     239           0 :               hv(i,i) = 1.0_rk
     240           0 :               hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
     241           0 :               ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0.0_rk
     242             :             enddo
     243             : 
     244             :           endif
     245             : 
     246           0 :           if (nb2==1) then
     247           0 :             d(istep) = ab(1,na_s-n_off)
     248           0 :             e(istep) = ab(2,na_s-n_off)
     249           0 :             if (istep == na) then
     250           0 :               e(na) = 0.0_rk
     251             :             endif
     252             :           else
     253           0 :             ab_s2 = 0.0_rk
     254           0 :             ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
     255           0 :             if (block_limits2(dest+1)<istep) then
     256           0 :               dest = dest+1
     257             :             endif
     258             : #ifdef WITH_MPI
     259           0 :             call obj%timer%start("mpi_communication")
     260           0 :             call mpi_send(ab_s2, 2*nb2*nb2, MPI_REAL_PRECISION, dest, 3, communicator, mpierr)
     261           0 :             call obj%timer%stop("mpi_communication")
     262             : 
     263             : #else /* WITH_MPI */
     264             :             ! do irecv here
     265           0 :             if (nb2>1) then
     266           0 :               do i= 0,nblocks2-1
     267           0 :                 ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2)
     268             :               enddo
     269             :             endif
     270             : #endif /* WITH_MPI */
     271             : 
     272             :           endif
     273             : 
     274             :         else
     275           0 :           if (na>na_s+nb2-1) then
     276             :             ! Receive Householder vectors from previous task, from PE owning subdiagonal
     277             : #ifdef WITH_MPI
     278           0 :             call obj%timer%start("mpi_communication")
     279           0 :             call mpi_recv(hv, nb*nb2, MPI_REAL_PRECISION, my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
     280           0 :             call obj%timer%stop("mpi_communication")
     281             : 
     282             : #else /* WITH_MPI */
     283           0 :            hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2)
     284             : #endif /* WITH_MPI */
     285             : 
     286           0 :             do i=1,nb2
     287           0 :               tau(i) = hv(i,i)
     288           0 :               hv(i,i) = 1.0_rk
     289             :             enddo
     290             :           endif
     291             :         endif
     292             : 
     293           0 :         na_s = na_s+nb2
     294           0 :         if (na_s-n_off > nb) then
     295           0 :           ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
     296           0 :           ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rk
     297           0 :           n_off = n_off + nb
     298             :         endif
     299             : 
     300           0 :         do iblk=1,nblocks
     301           0 :           ns = na_s + (iblk-1)*nb - n_off ! first column in block
     302           0 :           ne = ns+nb-nb2                    ! last column in block
     303             : 
     304           0 :           if (ns+n_off>na) exit
     305             : 
     306           0 :             nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
     307           0 :             nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
     308             :                                           ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
     309             :             call wy_gen_&
     310             :       &PRECISION&
     311           0 :       &(obj,nc,nb2,w,hv,tau,work,nb)
     312             : 
     313           0 :             if (iblk==nblocks .and. nc==nb) then
     314             :               !request last nb2 columns
     315             : #ifdef WITH_MPI
     316           0 :               call obj%timer%start("mpi_communication")
     317           0 :               call mpi_recv(ab_r,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
     318           0 :               call obj%timer%stop("mpi_communication")
     319             : 
     320             : #else /* WITH_MPI */
     321           0 :              ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2)
     322             : #endif /* WITH_MPI */
     323           0 :               do i=1,nb2
     324           0 :                 ab(1:nb+1,ne+i-1) = ab_r(:,i)
     325             :               enddo
     326             :             endif
     327           0 :             hv_new(:,:) = 0.0_rk ! Needed, last rows must be 0 for nr < nb
     328           0 :             tau_new(:) = 0.0_rk
     329             : 
     330           0 :             if (nr>0) then
     331             :               call wy_right_&
     332             :         &PRECISION&
     333           0 :         &(obj,nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
     334           0 :         call obj%timer%start("blas")
     335           0 :               call PRECISION_GEQRF(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
     336           0 :         call obj%timer%stop("blas")
     337           0 :               do i=1,nb2
     338           0 :                 hv_new(i,i) = 1.0_rk
     339           0 :                 hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
     340           0 :                 ab(nb+2:,ns+i-1) = 0.0_rk
     341             :               enddo
     342             : 
     343             :               !send hh-Vector
     344           0 :               if (iblk==nblocks) then
     345             : #ifdef WITH_MPI
     346           0 :                 call obj%timer%start("mpi_communication")
     347             : 
     348           0 :                 call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
     349           0 :                 call obj%timer%stop("mpi_communication")
     350             : 
     351             : #endif
     352           0 :                 hv_s = hv_new
     353           0 :                 do i=1,nb2
     354           0 :                   hv_s(i,i) = tau_new(i)
     355             :                 enddo
     356             : #ifdef WITH_MPI
     357           0 :                 call obj%timer%start("mpi_communication")
     358           0 :                 call mpi_isend(hv_s,nb*nb2, MPI_REAL_PRECISION, my_pe+1, 2, communicator, ireq_hv, mpierr)
     359           0 :                 call obj%timer%stop("mpi_communication")
     360             : 
     361             : #else /* WITH_MPI */
     362             : 
     363             : #endif /* WITH_MPI */
     364             :               endif
     365             :             endif
     366             : 
     367             :             call wy_symm_&
     368             :       &PRECISION&
     369           0 :       &(obj,nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
     370             : 
     371           0 :             if (my_pe>0 .and. iblk==1) then
     372             :               !send first nb2 columns to previous PE
     373             : #ifdef WITH_MPI
     374           0 :               call obj%timer%start("mpi_communication")
     375             : 
     376           0 :               call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
     377           0 :               call obj%timer%stop("mpi_communication")
     378             : 
     379             : #endif
     380           0 :               do i=1,nb2
     381           0 :                 ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
     382             :               enddo
     383             : #ifdef WITH_MPI
     384           0 :               call obj%timer%start("mpi_communication")
     385           0 :               call mpi_isend(ab_s,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, communicator, ireq_ab, mpierr)
     386           0 :               call obj%timer%stop("mpi_communication")
     387             : 
     388             : #else /* WITH_MPI */
     389             : 
     390             : #endif /* WITH_MPI */
     391             :             endif
     392             : 
     393           0 :             if (nr>0) then
     394             :               call wy_gen_&
     395             :         &PRECISION&
     396           0 :         &(obj,nr,nb2,w_new,hv_new,tau_new,work,nb)
     397             :               call wy_left_&
     398             :         &PRECISION&
     399           0 :         &(obj,nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
     400             :             endif
     401             : 
     402             :             ! Use new HH Vector for the next block
     403           0 :             hv(:,:) = hv_new(:,:)
     404           0 :             tau = tau_new
     405             :           enddo
     406             :         enddo
     407             : 
     408             :         ! Finish the last outstanding requests
     409             : #ifdef WITH_MPI
     410           0 :          call obj%timer%start("mpi_communication")
     411             : 
     412           0 :         call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
     413           0 :         call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
     414             : !        allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage)
     415             : !        if (istat .ne. 0) then
     416             : !          print *,"error allocating mpi_statuses "//errorMessage
     417             : !          stop 1
     418             : !        endif
     419             : 
     420           0 :         call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr)
     421             : !        deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
     422             : !        if (istat .ne. 0) then
     423             : !          print *,"error deallocating mpi_statuses "//errorMessage
     424             : !          stop 1
     425             : !        endif
     426             : 
     427           0 :         call mpi_barrier(communicator,mpierr)
     428           0 :         call obj%timer%stop("mpi_communication")
     429             : 
     430             : #endif /* WITH_MPI */
     431             : 
     432           0 :         deallocate(block_limits, stat=istat, errmsg=errorMessage)
     433           0 :         if (istat .ne. 0) then
     434           0 :           print *,"error deallocating block_limits "//errorMessage
     435           0 :           stop 1
     436             :         endif
     437             : 
     438           0 :         deallocate(block_limits2, stat=istat, errmsg=errorMessage)
     439           0 :         if (istat .ne. 0) then
     440           0 :           print *,"error deallocating block_limits2 "//errorMessage
     441           0 :           stop 1
     442             :         endif
     443             : 
     444           0 :         deallocate(ireq_ab2, stat=istat, errmsg=errorMessage)
     445           0 :         if (istat .ne. 0) then
     446           0 :           print *,"error deallocating ireq_ab2 "//errorMessage
     447           0 :           stop 1
     448             :         endif
     449             : 
     450           0 :         call obj%timer%stop("band_band_real" // PRECISION_SUFFIX)
     451             : 
     452           0 :     end subroutine
     453             : 
     454             :     subroutine wy_gen_&
     455           0 :     &PRECISION&
     456           0 :     &(obj, n, nb, W, Y, tau, mem, lda)
     457             : 
     458             :       use elpa_abstract_impl
     459             :       use precision
     460             :       implicit none
     461             : #include "../general/precision_kinds.F90"
     462             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     463             :       integer(kind=ik), intent(in)            :: n      !length of householder-vectors
     464             :       integer(kind=ik), intent(in)            :: nb     !number of householder-vectors
     465             :       integer(kind=ik), intent(in)            :: lda        !leading dimension of Y and W
     466             :       real(kind=rk), intent(in)               :: Y(lda,nb)  !matrix containing nb householder-vectors of length b
     467             :       real(kind=rk), intent(in)               :: tau(nb)    !tau values
     468             :       real(kind=rk), intent(out)              :: W(lda,nb)  !output matrix W
     469             :       real(kind=rk), intent(in)               :: mem(nb)    !memory for a temporary matrix of size nb
     470             : 
     471             :       integer(kind=ik)                        :: i
     472             : 
     473           0 :    call obj%timer%start("wy_gen" // PRECISION_SUFFIX)
     474             : 
     475           0 :    W(1:n,1) = tau(1)*Y(1:n,1)
     476           0 :    do i=2,nb
     477           0 :      W(1:n,i) = tau(i)*Y(1:n,i)
     478           0 :      call obj%timer%start("blas")
     479           0 :      call PRECISION_GEMV('T', n, i-1,  1.0_rk, Y, lda, W(1,i), 1, 0.0_rk, mem,1)
     480           0 :      call PRECISION_GEMV('N', n, i-1, -1.0_rk, W, lda, mem, 1, 1.0_rk, W(1,i),1)
     481           0 :      call obj%timer%stop("blas")
     482             :    enddo
     483           0 :    call obj%timer%stop("wy_gen" // PRECISION_SUFFIX)
     484           0 :     end subroutine
     485             : 
     486             :     subroutine wy_left_&
     487           0 :     &PRECISION&
     488           0 :     &(obj, n, m, nb, A, lda, W, Y, mem, lda2)
     489             : 
     490             :       use precision
     491             :       use elpa_abstract_impl
     492             :       implicit none
     493             : #include "../general/precision_kinds.F90"
     494             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     495             :       integer(kind=ik), intent(in)            :: n      !width of the matrix A
     496             :       integer(kind=ik), intent(in)            :: m      !length of matrix W and Y
     497             :       integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
     498             :       integer(kind=ik), intent(in)            :: lda        !leading dimension of A
     499             :       integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
     500             :       real(kind=rk), intent(inout)            :: A(lda,*)   !matrix to be transformed   ! remove assumed size
     501             :       real(kind=rk), intent(in)               :: W(m,nb)    !blocked transformation matrix W
     502             :       real(kind=rk), intent(in)               :: Y(m,nb)    !blocked transformation matrix Y
     503             :       real(kind=rk), intent(inout)            :: mem(n,nb)  !memory for a temporary matrix of size n x nb
     504             : 
     505           0 :    call obj%timer%start("wy_left" // PRECISION_SUFFIX)
     506           0 :    call obj%timer%start("blas")
     507           0 :    call PRECISION_GEMM('T', 'N', nb, n, m, 1.0_rk, W, lda2, A, lda, 0.0_rk, mem, nb)
     508           0 :    call PRECISION_GEMM('N', 'N', m, n, nb, -1.0_rk, Y, lda2, mem, nb, 1.0_rk, A, lda)
     509           0 :    call obj%timer%stop("blas")
     510           0 :    call obj%timer%stop("wy_left" // PRECISION_SUFFIX)
     511           0 :     end subroutine
     512             : 
     513             :     subroutine wy_right_&
     514           0 :     &PRECISION&
     515           0 :     &(obj, n, m, nb, A, lda, W, Y, mem, lda2)
     516             : 
     517             :       use precision
     518             :       use elpa_abstract_impl
     519             :       implicit none
     520             : #include "../general/precision_kinds.F90"
     521             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     522             :       integer(kind=ik), intent(in)            :: n      !height of the matrix A
     523             :       integer(kind=ik), intent(in)            :: m      !length of matrix W and Y
     524             :       integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
     525             :       integer(kind=ik), intent(in)            :: lda        !leading dimension of A
     526             :       integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
     527             :       real(kind=rk), intent(inout)            :: A(lda,*)   !matrix to be transformed  ! remove assumed size
     528             :       real(kind=rk), intent(in)               :: W(m,nb)    !blocked transformation matrix W
     529             :       real(kind=rk), intent(in)               :: Y(m,nb)    !blocked transformation matrix Y
     530             :       real(kind=rk), intent(inout)            :: mem(n,nb)  !memory for a temporary matrix of size n x nb
     531             : 
     532             : 
     533           0 :       call obj%timer%start("wy_right" // PRECISION_SUFFIX)
     534           0 :       call obj%timer%start("blas")
     535           0 :       call PRECISION_GEMM('N', 'N', n, nb, m, 1.0_rk, A, lda, W, lda2, 0.0_rk, mem, n)
     536           0 :       call PRECISION_GEMM('N', 'T', n, m, nb, -1.0_rk, mem, n, Y, lda2, 1.0_rk, A, lda)
     537           0 :       call obj%timer%stop("blas")
     538           0 :       call obj%timer%stop("wy_right" // PRECISION_SUFFIX)
     539             : 
     540           0 :     end subroutine
     541             : 
     542             :     subroutine wy_symm_&
     543           0 :     &PRECISION&
     544           0 :     &(obj, n, nb, A, lda, W, Y, mem, mem2, lda2)
     545             : 
     546             :       use elpa_abstract_impl
     547             :       use precision
     548             :       implicit none
     549             : #include "../general/precision_kinds.F90"
     550             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     551             :       integer(kind=ik), intent(in)            :: n      !width/heigth of the matrix A; length of matrix W and Y
     552             :       integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
     553             :       integer(kind=ik), intent(in)            :: lda        !leading dimension of A
     554             :       integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
     555             :       real(kind=rk), intent(inout)            :: A(lda,*)   !matrix to be transformed  ! remove assumed size
     556             :       real(kind=rk), intent(in)               :: W(n,nb)    !blocked transformation matrix W
     557             :       real(kind=rk), intent(in)               :: Y(n,nb)    !blocked transformation matrix Y
     558             :       real(kind=rk)                           :: mem(n,nb)  !memory for a temporary matrix of size n x nb
     559             :       real(kind=rk)                           :: mem2(nb,nb)    !memory for a temporary matrix of size nb x nb
     560             : 
     561           0 :       call obj%timer%start("wy_symm" // PRECISION_SUFFIX)
     562           0 :       call obj%timer%start("blas")
     563           0 :       call PRECISION_SYMM('L', 'L', n, nb, 1.0_rk, A, lda, W, lda2, 0.0_rk, mem, n)
     564           0 :       call PRECISION_GEMM('T', 'N', nb, nb, n, 1.0_rk, mem, n, W, lda2, 0.0_rk, mem2, nb)
     565           0 :       call PRECISION_GEMM('N', 'N', n, nb, nb, -0.5_rk, Y, lda2, mem2, nb, 1.0_rk, mem, n)
     566           0 :       call PRECISION_SYR2K('L', 'N', n, nb, -1.0_rk, Y, lda2, mem, n, 1.0_rk, A, lda)
     567           0 :       call obj%timer%stop("blas")
     568           0 :       call obj%timer%stop("wy_symm" // PRECISION_SUFFIX)
     569             : 
     570           0 :     end subroutine
     571             :

Generated by: LCOV version 1.12