LCOV - coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info - src/elpa2/qr/elpa_qrkernels

LCOV - code coverage report

Current view:	top level - src/elpa2/qr - elpa_qrkernels_template.F90 (source / functions)		Hit	Total	Coverage
Test:	coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info	Lines:	0	391	0.0 %
Date:	2018-01-10 09:29:53	Functions:	0	10	0.0 %

          Line data    Source code

       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
      12             : !      Schwerpunkt Wissenschaftliches Rechnen ,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      14             : !    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
      15             : !      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !
      20             : !    More information can be found here:
      21             : !    http://elpa.mpcdf.mpg.de/
      22             : !
      23             : !    ELPA is free software: you can redistribute it and/or modify
      24             : !    it under the terms of the version 3 of the license of the
      25             : !    GNU Lesser General Public License as published by the Free
      26             : !    Software Foundation.
      27             : !
      28             : !    ELPA is distributed in the hope that it will be useful,
      29             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      30             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      31             : !    GNU Lesser General Public License for more details.
      32             : !
      33             : !    You should have received a copy of the GNU Lesser General Public License
      34             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      35             : !
      36             : !    ELPA reflects a substantial effort on the part of the original
      37             : !    ELPA consortium, and we ask you to respect the spirit of the
      38             : !    license that we chose: i.e., please contribute any changes you
      39             : !    may have back to the original ELPA library distribution, and keep
      40             : !    any derivatives of ELPA under the same license that we chose for
      41             : !    the original distribution, the GNU Lesser General Public License.
      42             : !
      43             : 
      44             : #endif
      45             : 
      46             : ! calculates A = A - Y*T'*Z (rev=0)
      47             : ! calculates A = A - Y*T*Z (rev=1)
      48             : ! T upper triangle matrix
      49             : ! assuming zero entries in matrix in upper kxk block
      50             : 
      51             : subroutine qr_pdlarfb_kernel_local_&
      52           0 : &PRECISION &
      53           0 : (m,n,k,a,lda,v,ldv,t,ldt,z,ldz)
      54             :     use precision
      55             :     implicit none
      56             : 
      57             :     ! input variables (local)
      58             :     integer(kind=ik) :: lda,ldv,ldt,ldz
      59             :     real(kind=REAL_DATATYPE)    :: a(lda,*),v(ldv,*),t(ldt,*),z(ldz,*)
      60             : 
      61             :     ! input variables (global)
      62             :     integer(kind=ik) :: m,n,k
      63             : 
      64             :     ! local variables
      65             :     real(kind=REAL_DATATYPE)    :: t11
      66             :     real(kind=REAL_DATATYPE)    :: t12,t22,sum1,sum2
      67             :     real(kind=REAL_DATATYPE)    :: t13,t23,t33,sum3
      68             :     real(kind=REAL_DATATYPE)    :: sum4,t44
      69             :     real(kind=REAL_DATATYPE)    :: y1,y2,y3,y4
      70             :     real(kind=REAL_DATATYPE)    :: a1
      71             :     integer(kind=ik) :: icol,irow,v1col,v2col,v3col
      72             : 
      73             :     ! reference implementation
      74           0 :     if (k .eq. 1) then
      75           0 :         t11 = t(1,1)
      76           0 :         do icol=1,n
      77           0 :             sum1 = z(1,icol)
      78           0 :             a(1:m,icol) = a(1:m,icol) - t11*sum1*v(1:m,1)
      79             :         enddo
      80           0 :         return
      81           0 :     else if (k .eq. 2) then
      82           0 :             v1col = 2
      83           0 :             v2col = 1
      84           0 :             t22 = t(1,1)
      85           0 :             t12 = t(1,2)
      86           0 :             t11 = t(2,2)
      87             : 
      88           0 :         do icol=1,n
      89           0 :             sum1 = t11 * z(v1col,icol)
      90           0 :             sum2 = t12 * z(v1col,icol) + t22 * z(v2col,icol)
      91             : 
      92           0 :             do irow=1,m
      93           0 :                 a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2
      94             :             end do
      95             :         end do
      96           0 :     else if (k .eq. 3) then
      97           0 :             v1col = 3
      98           0 :             v2col = 2
      99           0 :             v3col = 1
     100             : 
     101           0 :             t33 = t(1,1)
     102             : 
     103           0 :             t23 = t(1,2)
     104           0 :             t22 = t(2,2)
     105             : 
     106           0 :             t13 = t(1,3)
     107           0 :             t12 = t(2,3)
     108           0 :             t11 = t(3,3)
     109             : 
     110           0 :         do icol=1,n
     111             :             ! misusing variables for fetch of z parts
     112           0 :             y1 = z(v1col,icol)
     113           0 :             y2 = z(v2col,icol)
     114           0 :             y3 = z(v3col,icol)
     115             : 
     116           0 :             sum1 = t11 * y1!+ 0   * y2!+ 0   * y3
     117           0 :             sum2 = t12 * y1 + t22 * y2!+ 0   * y3
     118           0 :             sum3 = t13 * y1 + t23 * y2 + t33 * y3
     119             : 
     120           0 :             do irow=1,m
     121           0 :                 a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 - v(irow,v3col) * sum3
     122             :             end do
     123             :         end do
     124           0 :     else if (k .eq. 4) then
     125           0 :             do icol=1,n
     126             :                 ! misusing variables for fetch of z parts
     127           0 :                 y1 = z(1,icol)
     128           0 :                 y2 = z(2,icol)
     129           0 :                 y3 = z(3,icol)
     130           0 :                 y4 = z(4,icol)
     131             : 
     132             :                 ! dtrmv like - starting from main diagonal and working
     133             :                 ! upwards
     134           0 :                 t11 = t(1,1)
     135           0 :                 t22 = t(2,2)
     136           0 :                 t33 = t(3,3)
     137           0 :                 t44 = t(4,4)
     138             : 
     139           0 :                 sum1 = t11 * y1
     140           0 :                 sum2 = t22 * y2
     141           0 :                 sum3 = t33 * y3
     142           0 :                 sum4 = t44 * y4
     143             : 
     144           0 :                 t11 = t(1,2)
     145           0 :                 t22 = t(2,3)
     146           0 :                 t33 = t(3,4)
     147             : 
     148           0 :                 sum1 = sum1 + t11 * y2
     149           0 :                 sum2 = sum2 + t22 * y3
     150           0 :                 sum3 = sum3 + t33 * y4
     151             : 
     152           0 :                 t11 = t(1,3)
     153           0 :                 t22 = t(2,4)
     154             : 
     155           0 :                 sum1 = sum1 + t11 * y3
     156           0 :                 sum2 = sum2 + t22 * y4
     157             : 
     158           0 :                 t11 = t(1,4)
     159           0 :                 sum1 = sum1 + t11 * y4
     160             : 
     161             :                 ! one column of V is calculated
     162             :                 ! time to calculate A - Y * V
     163           0 :                 do irow=1,m ! TODO: loop unrolling
     164           0 :                     y1 = v(irow,1)
     165           0 :                     y2 = v(irow,2)
     166           0 :                     y3 = v(irow,3)
     167           0 :                     y4 = v(irow,4)
     168             : 
     169           0 :                     a1 = a(irow,icol)
     170             : 
     171           0 :                     a1 = a1 - y1*sum1
     172           0 :                     a1 = a1 - y2*sum2
     173           0 :                     a1 = a1 - y3*sum3
     174           0 :                     a1 = a1 - y4*sum4
     175             : 
     176           0 :                     a(irow,icol) = a1
     177             :                 end do
     178             :             end do
     179             :     else
     180             :         ! reference implementation
     181             : #ifdef DOUBLE_PRECISION_REAL
     182             :             ! V' = T * Z'
     183           0 :             call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0_rk8,t,ldt,z,ldz)
     184             :             ! A = A - Y * V'
     185           0 :             call dgemm("Notrans","Notrans",m,n,k,-1.0_rk8,v,ldv,z,ldz,1.0_rk8,a,lda)
     186             : #else
     187             :             ! V' = T * Z'
     188           0 :             call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0_rk4,t,ldt,z,ldz)
     189             :             ! A = A - Y * V'
     190           0 :             call dgemm("Notrans","Notrans",m,n,k,-1.0_rk4,v,ldv,z,ldz,1.0_rk4,a,lda)
     191             : #endif
     192             :     end if
     193             : 
     194             : end subroutine
     195             : 
     196             : subroutine qr_pdlarft_merge_kernel_local_&
     197           0 : &PRECISION &
     198           0 : (oldk,k,t,ldt,yty,ldy)
     199             :     use precision
     200             :     implicit none
     201             : 
     202             :     ! input variables (local)
     203             :     integer(kind=ik) :: ldt,ldy
     204             :     real(kind=REAL_DATATYPE)    :: t(ldt,*),yty(ldy,*)
     205             : 
     206             :     ! input variables (global)
     207             :     integer(kind=ik) :: k,oldk
     208             : 
     209             :     ! output variables (global)
     210             : 
     211             :     ! local scalars
     212             :     integer(kind=ik) :: icol,leftk,rightk
     213             : 
     214             :     ! local scalars for optimized versions
     215             :     integer(kind=ik) :: irow
     216             :     real(kind=REAL_DATATYPE)    :: t11
     217             :     real(kind=REAL_DATATYPE)    :: yty1,yty2,yty3,yty4,yty5,yty6,yty7,yty8
     218             :     real(kind=REAL_DATATYPE)    :: reg01,reg02,reg03,reg04,reg05,reg06,reg07,reg08
     219             :     real(kind=REAL_DATATYPE)    :: final01,final02,final03,final04,final05,final06,final07,final08
     220             : 
     221           0 :     if (oldk .eq. 0) return ! nothing to be done
     222             : 
     223           0 :         leftk = k
     224           0 :         rightk = oldk
     225             : 
     226             :     ! optimized implementations:
     227           0 :     if (leftk .eq. 1) then
     228           0 :         do icol=1,rightk
     229             :             ! multiply inner products with right t matrix
     230             :             ! (dtrmv like)
     231           0 :             yty1 = yty(1,1)
     232           0 :             t11 = t(leftk+1,leftk+icol)
     233             : 
     234           0 :             reg01 = yty1 * t11
     235             : 
     236           0 :             do irow=2,icol
     237           0 :                 yty1 = yty(1,irow)
     238           0 :                 t11 = t(leftk+irow,leftk+icol)
     239             : 
     240           0 :                 reg01 = reg01 + yty1 * t11
     241             :             end do
     242             : 
     243             :             ! multiply intermediate results with left t matrix and store in final t
     244             :             ! matrix
     245           0 :             t11 = -t(1,1)
     246           0 :             final01 = t11 * reg01
     247           0 :             t(1,leftk+icol) = final01
     248             :         end do
     249             : 
     250             :         !print *,'efficient tmerge - leftk=1'
     251           0 :     else if (leftk .eq. 2) then
     252           0 :         do icol=1,rightk
     253             :             ! multiply inner products with right t matrix
     254             :             ! (dtrmv like)
     255           0 :             yty1 = yty(1,1)
     256           0 :             yty2 = yty(2,1)
     257             : 
     258           0 :             t11  = t(leftk+1,leftk+icol)
     259             : 
     260           0 :             reg01 = yty1 * t11
     261           0 :             reg02 = yty2 * t11
     262             : 
     263           0 :             do irow=2,icol
     264           0 :                 yty1 = yty(1,irow)
     265           0 :                 yty2 = yty(2,irow)
     266           0 :                 t11 = t(leftk+irow,leftk+icol)
     267             : 
     268           0 :                 reg01 = reg01 + yty1 * t11
     269           0 :                 reg02 = reg02 + yty2 * t11
     270             :             end do
     271             : 
     272             :             ! multiply intermediate results with left t matrix and store in final t
     273             :             ! matrix
     274           0 :             yty1 = -t(1,1)
     275           0 :             yty2 = -t(1,2)
     276           0 :             yty3 = -t(2,2)
     277             : 
     278           0 :             final01 = reg02 * yty2
     279           0 :             final02 = reg02 * yty3
     280             : 
     281           0 :             final01 = final01 + reg01 * yty1
     282             : 
     283           0 :             t(1,leftk+icol) = final01
     284           0 :             t(2,leftk+icol) = final02
     285             :         end do
     286             : 
     287             :         !print *,'efficient tmerge - leftk=2'
     288           0 :     else if (leftk .eq. 4) then
     289           0 :         do icol=1,rightk
     290             :             ! multiply inner products with right t matrix
     291             :             ! (dtrmv like)
     292           0 :             yty1 = yty(1,1)
     293           0 :             yty2 = yty(2,1)
     294           0 :             yty3 = yty(3,1)
     295           0 :             yty4 = yty(4,1)
     296             : 
     297           0 :             t11  = t(leftk+1,leftk+icol)
     298             : 
     299           0 :             reg01 = yty1 * t11
     300           0 :             reg02 = yty2 * t11
     301           0 :             reg03 = yty3 * t11
     302           0 :             reg04 = yty4 * t11
     303             : 
     304           0 :             do irow=2,icol
     305           0 :                 yty1 = yty(1,irow)
     306           0 :                 yty2 = yty(2,irow)
     307           0 :                 yty3 = yty(3,irow)
     308           0 :                 yty4 = yty(4,irow)
     309             : 
     310           0 :                 t11 = t(leftk+irow,leftk+icol)
     311             : 
     312           0 :                 reg01 = reg01 + yty1 * t11
     313           0 :                 reg02 = reg02 + yty2 * t11
     314           0 :                 reg03 = reg03 + yty3 * t11
     315           0 :                 reg04 = reg04 + yty4 * t11
     316             :             end do
     317             : 
     318             :             ! multiply intermediate results with left t matrix and store in final t
     319             :             ! matrix (start from diagonal and move upwards)
     320           0 :             yty1 = -t(1,1)
     321           0 :             yty2 = -t(2,2)
     322           0 :             yty3 = -t(3,3)
     323           0 :             yty4 = -t(4,4)
     324             : 
     325             :             ! main diagonal
     326           0 :             final01 = reg01 * yty1
     327           0 :             final02 = reg02 * yty2
     328           0 :             final03 = reg03 * yty3
     329           0 :             final04 = reg04 * yty4
     330             : 
     331             :             ! above main diagonal
     332           0 :             yty1 = -t(1,2)
     333           0 :             yty2 = -t(2,3)
     334           0 :             yty3 = -t(3,4)
     335             : 
     336           0 :             final01 = final01 + reg02 * yty1
     337           0 :             final02 = final02 + reg03 * yty2
     338           0 :             final03 = final03 + reg04 * yty3
     339             : 
     340             :             ! above first side diagonal
     341           0 :             yty1 = -t(1,3)
     342           0 :             yty2 = -t(2,4)
     343             : 
     344           0 :             final01 = final01 + reg03 * yty1
     345           0 :             final02 = final02 + reg04 * yty2
     346             : 
     347             :             ! above second side diagonal
     348           0 :             yty1 = -t(1,4)
     349             : 
     350           0 :             final01 = final01 + reg04 * yty1
     351             : 
     352             :             ! write back to final matrix
     353           0 :             t(1,leftk+icol) = final01
     354           0 :             t(2,leftk+icol) = final02
     355           0 :             t(3,leftk+icol) = final03
     356           0 :             t(4,leftk+icol) = final04
     357             :         end do
     358             : 
     359             :         !print *,'efficient tmerge - leftk=4'
     360           0 :     else if (leftk .eq. 8) then
     361           0 :         do icol=1,rightk
     362             :             ! multiply inner products with right t matrix
     363             :             ! (dtrmv like)
     364           0 :             yty1 = yty(1,1)
     365           0 :             yty2 = yty(2,1)
     366           0 :             yty3 = yty(3,1)
     367           0 :             yty4 = yty(4,1)
     368           0 :             yty5 = yty(5,1)
     369           0 :             yty6 = yty(6,1)
     370           0 :             yty7 = yty(7,1)
     371           0 :             yty8 = yty(8,1)
     372             : 
     373           0 :             t11  = t(leftk+1,leftk+icol)
     374             : 
     375           0 :             reg01 = yty1 * t11
     376           0 :             reg02 = yty2 * t11
     377           0 :             reg03 = yty3 * t11
     378           0 :             reg04 = yty4 * t11
     379           0 :             reg05 = yty5 * t11
     380           0 :             reg06 = yty6 * t11
     381           0 :             reg07 = yty7 * t11
     382           0 :             reg08 = yty8 * t11
     383             : 
     384           0 :             do irow=2,icol
     385           0 :                 yty1 = yty(1,irow)
     386           0 :                 yty2 = yty(2,irow)
     387           0 :                 yty3 = yty(3,irow)
     388           0 :                 yty4 = yty(4,irow)
     389           0 :                 yty5 = yty(5,irow)
     390           0 :                 yty6 = yty(6,irow)
     391           0 :                 yty7 = yty(7,irow)
     392           0 :                 yty8 = yty(8,irow)
     393             : 
     394           0 :                 t11 = t(leftk+irow,leftk+icol)
     395             : 
     396           0 :                 reg01 = reg01 + yty1 * t11
     397           0 :                 reg02 = reg02 + yty2 * t11
     398           0 :                 reg03 = reg03 + yty3 * t11
     399           0 :                 reg04 = reg04 + yty4 * t11
     400           0 :                 reg05 = reg05 + yty5 * t11
     401           0 :                 reg06 = reg06 + yty6 * t11
     402           0 :                 reg07 = reg07 + yty7 * t11
     403           0 :                 reg08 = reg08 + yty8 * t11
     404             :             end do
     405             : 
     406             :             ! multiply intermediate results with left t matrix and store in final t
     407             :             ! matrix (start from diagonal and move upwards)
     408           0 :             yty1 = -t(1,1)
     409           0 :             yty2 = -t(2,2)
     410           0 :             yty3 = -t(3,3)
     411           0 :             yty4 = -t(4,4)
     412           0 :             yty5 = -t(5,5)
     413           0 :             yty6 = -t(6,6)
     414           0 :             yty7 = -t(7,7)
     415           0 :             yty8 = -t(8,8)
     416             : 
     417             :             ! main diagonal
     418           0 :             final01 = reg01 * yty1
     419           0 :             final02 = reg02 * yty2
     420           0 :             final03 = reg03 * yty3
     421           0 :             final04 = reg04 * yty4
     422           0 :             final05 = reg05 * yty5
     423           0 :             final06 = reg06 * yty6
     424           0 :             final07 = reg07 * yty7
     425           0 :             final08 = reg08 * yty8
     426             : 
     427             :             ! above main diagonal
     428           0 :             yty1 = -t(1,2)
     429           0 :             yty2 = -t(2,3)
     430           0 :             yty3 = -t(3,4)
     431           0 :             yty4 = -t(4,5)
     432           0 :             yty5 = -t(5,6)
     433           0 :             yty6 = -t(6,7)
     434           0 :             yty7 = -t(7,8)
     435             : 
     436           0 :             final01 = final01 + reg02 * yty1
     437           0 :             final02 = final02 + reg03 * yty2
     438           0 :             final03 = final03 + reg04 * yty3
     439           0 :             final04 = final04 + reg05 * yty4
     440           0 :             final05 = final05 + reg06 * yty5
     441           0 :             final06 = final06 + reg07 * yty6
     442           0 :             final07 = final07 + reg08 * yty7
     443             : 
     444             :             ! above first side diagonal
     445           0 :             yty1 = -t(1,3)
     446           0 :             yty2 = -t(2,4)
     447           0 :             yty3 = -t(3,5)
     448           0 :             yty4 = -t(4,6)
     449           0 :             yty5 = -t(5,7)
     450           0 :             yty6 = -t(6,8)
     451             : 
     452           0 :             final01 = final01 + reg03 * yty1
     453           0 :             final02 = final02 + reg04 * yty2
     454           0 :             final03 = final03 + reg05 * yty3
     455           0 :             final04 = final04 + reg06 * yty4
     456           0 :             final05 = final05 + reg07 * yty5
     457           0 :             final06 = final06 + reg08 * yty6
     458             : 
     459             :             !above second side diagonal
     460             : 
     461           0 :             yty1 = -t(1,4)
     462           0 :             yty2 = -t(2,5)
     463           0 :             yty3 = -t(3,6)
     464           0 :             yty4 = -t(4,7)
     465           0 :             yty5 = -t(5,8)
     466             : 
     467           0 :             final01 = final01 + reg04 * yty1
     468           0 :             final02 = final02 + reg05 * yty2
     469           0 :             final03 = final03 + reg06 * yty3
     470           0 :             final04 = final04 + reg07 * yty4
     471           0 :             final05 = final05 + reg08 * yty5
     472             : 
     473             :             ! i think you got the idea by now
     474             : 
     475           0 :             yty1 = -t(1,5)
     476           0 :             yty2 = -t(2,6)
     477           0 :             yty3 = -t(3,7)
     478           0 :             yty4 = -t(4,8)
     479             : 
     480           0 :             final01 = final01 + reg05 * yty1
     481           0 :             final02 = final02 + reg06 * yty2
     482           0 :             final03 = final03 + reg07 * yty3
     483           0 :             final04 = final04 + reg08 * yty4
     484             : 
     485             :             ! .....
     486             : 
     487           0 :             yty1 = -t(1,6)
     488           0 :             yty2 = -t(2,7)
     489           0 :             yty3 = -t(3,8)
     490             : 
     491           0 :             final01 = final01 + reg06 * yty1
     492           0 :             final02 = final02 + reg07 * yty2
     493           0 :             final03 = final03 + reg08 * yty3
     494             : 
     495             :             ! .....
     496             : 
     497           0 :             yty1 = -t(1,7)
     498           0 :             yty2 = -t(2,8)
     499             : 
     500           0 :             final01 = final01 + reg07 * yty1
     501           0 :             final02 = final02 + reg08 * yty2
     502             : 
     503             :             ! .....
     504             : 
     505           0 :             yty1 = -t(1,8)
     506             : 
     507           0 :             final01 = final01 + reg08 * yty1
     508             : 
     509             :             ! write back to final matrix
     510           0 :             t(1,leftk+icol) = final01
     511           0 :             t(2,leftk+icol) = final02
     512           0 :             t(3,leftk+icol) = final03
     513           0 :             t(4,leftk+icol) = final04
     514           0 :             t(5,leftk+icol) = final05
     515           0 :             t(6,leftk+icol) = final06
     516           0 :             t(7,leftk+icol) = final07
     517           0 :             t(8,leftk+icol) = final08
     518             :         end do
     519             : 
     520             :         !print *,'efficient tmerge - leftk=8'
     521             :     else
     522             :         ! reference implementation
     523           0 :         do icol=1,rightk
     524           0 :             t(1:leftk,leftk+icol) = yty(1:leftk,icol)
     525             :         end do
     526             : #ifdef DOUBLE_PRECISION_REAL
     527             :         ! -T1 * Y1'*Y2
     528           0 :         call dtrmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0_rk8,t(1,1),ldt,t(1,leftk+1),ldt)
     529             :         ! (-T1 * Y1'*Y2) * T2
     530           0 :         call dtrmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0_rk8,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt)
     531             : #else
     532             :         ! -T1 * Y1'*Y2
     533           0 :         call strmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0_rk4,t(1,1),ldt,t(1,leftk+1),ldt)
     534             :         ! (-T1 * Y1'*Y2) * T2
     535           0 :         call strmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0_rk4,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt)
     536             : 
     537             : #endif
     538             :     end if
     539             : 
     540             : end subroutine
     541             : 
     542             : 
     543             : ! yty structure
     544             : ! Y1'*Y2   Y1'*Y3  Y1'*Y4 ...
     545             : !    0     Y2'*Y3  Y2'*Y4 ...
     546             : !    0        0    Y3'*Y4 ...
     547             : !    0        0       0   ...
     548             : 
     549             : subroutine qr_tmerge_set_kernel_&
     550           0 : &PRECISION &
     551           0 : (k,blocksize,t,ldt,yty,ldy)
     552             :     use precision
     553             :     implicit none
     554             : 
     555             :     ! input variables (local)
     556             :     integer(kind=ik) :: ldt,ldy
     557             :     real(kind=REAL_DATATYPE)    :: t(ldt,*),yty(ldy,*)
     558             : 
     559             :     ! input variables (global)
     560             :     integer(kind=ik) :: k,blocksize
     561             : 
     562             :     ! output variables (global)
     563             : 
     564             :     ! local scalars
     565             :     integer(kind=ik) :: nr_blocks,current_block
     566             :     integer(kind=ik) :: remainder,oldk
     567             :     integer(kind=ik) :: yty_column,toffset
     568             : 
     569           0 :     if (k .le. blocksize) return ! nothing to merge
     570             : 
     571           0 :     nr_blocks = k / blocksize
     572           0 :     remainder = k - nr_blocks*blocksize
     573             : 
     574             :         ! work in "negative" direction:
     575             :         ! start with latest T matrix part and add older ones
     576           0 :         toffset = 1
     577           0 :         yty_column = 1
     578             : 
     579           0 :         if (remainder .gt. 0) then
     580             :             call qr_pdlarft_merge_kernel_local_&
     581             :       &PRECISION &
     582           0 :             (blocksize,remainder,t(toffset,toffset),ldt,yty(1,yty_column),ldy)
     583           0 :             current_block = 1
     584           0 :             oldk = remainder+blocksize
     585           0 :             yty_column =  yty_column + blocksize
     586             :         else
     587             :             call qr_pdlarft_merge_kernel_local_&
     588             :       &PRECISION &
     589           0 :       (blocksize,blocksize,t(toffset,toffset),ldt,yty(1,yty_column),ldy)
     590           0 :             current_block = 2
     591           0 :             oldk = 2*blocksize
     592           0 :             yty_column = yty_column + blocksize
     593             :         end if
     594             : 
     595           0 :         do while (current_block .lt. nr_blocks)
     596             :             call qr_pdlarft_merge_kernel_local_&
     597             :       &PRECISION &
     598           0 :             (blocksize,oldk,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)
     599           0 :             current_block = current_block + 1
     600           0 :             oldk = oldk + blocksize
     601           0 :             yty_column = yty_column + blocksize
     602             :         end do
     603             : 
     604             : end subroutine
     605             : ! yty structure
     606             : ! Y1'*Y2   Y1'*Y3  Y1'*Y4 ...
     607             : !    0     Y2'*Y3  Y2'*Y4 ...
     608             : !    0        0    Y3'*Y4 ...
     609             : !    0        0       0   ...
     610             : 
     611             : subroutine qr_tmerge_tree_kernel_&
     612           0 : &PRECISION &
     613           0 : (k,blocksize,treeorder,t,ldt,yty,ldy)
     614             :     use precision
     615             :     implicit none
     616             : 
     617             :     ! input variables (local)
     618             :     integer(kind=ik) :: ldt,ldy
     619             :     real(kind=REAL_DATATYPE)    :: t(ldt,*),yty(ldy,*)
     620             : 
     621             :     ! input variables (global)
     622             :     integer(kind=ik) :: k,blocksize,treeorder
     623             : 
     624             :     ! output variables (global)
     625             : 
     626             :     ! local scalars
     627             :     integer temp_blocksize,nr_sets,current_set,setsize,nr_blocks
     628             :     integer remainder,max_treeorder,remaining_size
     629             :     integer toffset,yty_column
     630             :     integer toffset_start,yty_column_start
     631             :     integer yty_end,total_remainder,yty_remainder
     632             : 
     633           0 :     if (treeorder .eq. 0) return ! no merging
     634             : 
     635           0 :     if (treeorder .eq. 1) then
     636             :         call qr_tmerge_set_kernel_&
     637             :   &PRECISION &
     638           0 :   (k,blocksize,t,ldt,yty,ldy)
     639           0 :         return
     640             :     end if
     641             : 
     642           0 :     nr_blocks = k / blocksize
     643           0 :     max_treeorder = min(nr_blocks,treeorder)
     644             : 
     645           0 :     if (max_treeorder .eq. 1) then
     646             :         call qr_tmerge_set_kernel_&
     647             :   &PRECISION &
     648           0 :         (k,blocksize,t,ldt,yty,ldy)
     649           0 :         return
     650             :     end if
     651             : 
     652             :         ! work in "negative" direction: from latest set to oldest set
     653             :         ! implementation differs from rev=0 version due to issues with
     654             :         ! calculating the remainder parts
     655             :         ! compared to the rev=0 version we split remainder parts directly from
     656             :         ! parts which can be easily merged in a recursive way
     657             : 
     658           0 :         yty_end = (k / blocksize) * blocksize
     659           0 :         if (yty_end .eq. k) then
     660           0 :             yty_end = yty_end - blocksize
     661             :         end if
     662             : 
     663             :         !print *,'tree',yty_end,k,blocksize
     664             : 
     665           0 :         yty_column_start = 1
     666           0 :         toffset_start = 1
     667             : 
     668             :         ! is there a remainder block?
     669           0 :         nr_blocks = k / blocksize
     670           0 :         remainder = k - nr_blocks * blocksize
     671           0 :         if (remainder .eq. 0) then
     672             :             !print *,'no initial remainder'
     673             : 
     674             :             ! set offsets to the very beginning as there is no remainder part
     675           0 :             yty_column_start = 1
     676           0 :             toffset_start = 1
     677           0 :             total_remainder = 0
     678           0 :             remaining_size = k
     679           0 :             yty_remainder = 0
     680             :         else
     681             :             !print *,'starting with initial remainder'
     682             :             ! select submatrix and make remainder block public
     683           0 :             yty_column_start = 1 + blocksize
     684           0 :             toffset_start = 1 + remainder
     685           0 :             total_remainder = remainder
     686           0 :             remaining_size = k - remainder
     687           0 :             yty_remainder = 1
     688             :         end if
     689             : 
     690             :         ! from now on it is a clean set of blocks with sizes of multiple of
     691             :         ! blocksize
     692             : 
     693           0 :         temp_blocksize = blocksize
     694             : 
     695             :         !-------------------------------
     696           0 :         do while (remaining_size .gt. 0)
     697           0 :             nr_blocks = remaining_size / temp_blocksize
     698           0 :             max_treeorder = min(nr_blocks,treeorder)
     699             : 
     700           0 :             if (max_treeorder .eq. 1) then
     701           0 :                 remainder = 0
     702           0 :                 nr_sets = 0
     703           0 :                 setsize = 0
     704             : 
     705           0 :                 if (yty_remainder .gt. 0) then
     706           0 :                     yty_column = yty_remainder
     707             :                     !print *,'final merging with remainder',temp_blocksize,k,remaining_size,yty_column
     708             :                     call qr_tmerge_set_kernel_&
     709             :         &PRECISION &
     710           0 :                     (k,temp_blocksize,t,ldt,yty(1,yty_column),ldy)
     711             :                 else
     712             :                     !print *,'no remainder - no merging needed',temp_blocksize,k,remaining_size
     713             :                 endif
     714             : 
     715           0 :                 remaining_size = 0
     716             : 
     717           0 :                 return ! done
     718             :             else
     719           0 :                 nr_sets = nr_blocks / max_treeorder
     720           0 :                 setsize = max_treeorder*temp_blocksize
     721           0 :                 remainder = remaining_size - nr_sets*setsize
     722             :             end if
     723             : 
     724           0 :             if (remainder .gt. 0) then
     725           0 :                 if (remainder .gt. temp_blocksize) then
     726           0 :                     toffset = toffset_start
     727           0 :                     yty_column = yty_column_start
     728             : 
     729             :                     !print *,'set merging', toffset, yty_column,remainder
     730             :                     call qr_tmerge_set_kernel_&
     731             :         &PRECISION &
     732           0 :                     (remainder,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)
     733           0 :                     if (total_remainder .gt. 0) then
     734             :                         ! merge with existing global remainder part
     735             :                         !print *,'single+set merging',yty_remainder,total_remainder,remainder
     736             :                         call qr_pdlarft_merge_kernel_local_&
     737             :       &PRECISION &
     738           0 :                         (remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy)
     739           0 :                         yty_remainder = yty_remainder + remainder
     740           0 :                         toffset_start = toffset_start + remainder
     741             : 
     742             :                         !print *,'single+set merging (new offsets)',yty_remainder,yty_column_start,toffset_start
     743             : 
     744           0 :                         yty_column_start = yty_column_start + remainder
     745             :                     else
     746             :                         ! create new remainder part
     747             :                         !print *,'new remainder+set',yty_remainder
     748           0 :                         yty_remainder = yty_column_start + remainder - temp_blocksize
     749           0 :                         yty_column_start = yty_column_start + remainder
     750           0 :                         toffset_start = toffset_start + remainder
     751             :                         !print *,'new remainder+set (new offsets)',yty_remainder,yty_column_start,toffset_start
     752             :                     end if
     753             : 
     754             :                 else
     755           0 :                     if (total_remainder .gt. 0) then
     756             :                         ! merge with existing global remainder part
     757             :                         !print *,'single merging',yty_remainder,total_remainder,remainder
     758             :                         call qr_pdlarft_merge_kernel_local_&
     759             :       &PRECISION &
     760           0 :                         (remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy)
     761           0 :                         yty_remainder = yty_remainder + remainder
     762           0 :                         toffset_start = toffset_start + remainder
     763             : 
     764             :                         !print *,'single merging (new offsets)',yty_remainder,yty_column_start,toffset_start
     765             : 
     766           0 :                         yty_column_start = yty_column_start + remainder
     767             :                     else
     768             :                         ! create new remainder part
     769             :                         !print *,'new remainder',yty_remainder
     770           0 :                         yty_remainder = yty_column_start
     771           0 :                         yty_column_start = yty_column_start + temp_blocksize
     772           0 :                         toffset_start = toffset_start + remainder
     773             :                         !print *,'new remainder (new offsets)',yty_remainder,yty_column_start,toffset_start
     774             :                     end if
     775             :                 end if
     776             : 
     777           0 :                 total_remainder = total_remainder + remainder
     778           0 :                 remaining_size = remaining_size - remainder
     779             :             end if
     780             : 
     781           0 :             current_set = 0
     782           0 :             do while (current_set .lt. nr_sets)
     783           0 :                 toffset = toffset_start + current_set * setsize
     784           0 :                 yty_column = yty_column_start + current_set * setsize
     785             : 
     786             :                 !print *,'recursive merging', toffset, yty_column,setsize
     787             :                 call qr_tmerge_set_kernel_&
     788             :     &PRECISION &
     789           0 :     (setsize,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)
     790             : 
     791           0 :                 current_set = current_set +  1
     792             :             end do
     793             : 
     794             :             !print *,'increasing blocksize', temp_blocksize, setsize
     795           0 :             yty_column_start = yty_column_start + (setsize - temp_blocksize)
     796           0 :             temp_blocksize = setsize
     797             :         end do
     798             : end subroutine
     799             : 
     800             : 
     801             : ! yty should not contain the inner products vi'*vi
     802             : 
     803             : subroutine qr_dlarft_kernel_&
     804           0 : &PRECISION &
     805           0 : (n,tau,yty,ldy,t,ldt)
     806             :     use precision
     807             :     implicit none
     808             : 
     809             :     ! input variables
     810             :     integer(kind=ik) :: n,ldy,ldt
     811             :     real(kind=REAL_DATATYPE)    :: tau(*),yty(ldy,*)
     812             : 
     813             :     ! output variables
     814             :     real(kind=REAL_DATATYPE)    :: t(ldt,*)
     815             : 
     816             :     ! local variables
     817             :     integer(kind=ik) :: icol
     818             : 
     819             :     ! DEBUG: clear buffer first
     820             :     !t(1:n,1:n) = 0.0d0
     821             : 
     822             :         ! T1 = tau1
     823             :         ! | tauk  Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' |
     824             :         ! | 0           Tk-1                           |
     825           0 :         t(n,n) = tau(n)
     826           0 :         do icol=n-1,1,-1
     827           0 :             t(icol,icol+1:n) = -tau(icol)*yty(icol,icol:n-1)
     828             : #ifdef DOUBLE_PRECISION_REAL
     829           0 :             call dtrmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt)
     830             : #else
     831           0 :             call strmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt)
     832             : #endif
     833           0 :             t(icol,icol) = tau(icol)
     834             :         end do
     835           0 : end subroutine
     836             : 
     837             : ! vim: syntax=fortran

Generated by: LCOV version 1.12