LCOV - code coverage report
Current view: top level - src/elpa2/kernels - real_template.F90 (source / functions) Hit Total Coverage
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info Lines: 322 322 100.0 %
Date: 2018-01-10 09:29:53 Functions: 14 16 87.5 %

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
      12             : !      Schwerpunkt Wissenschaftliches Rechnen ,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      14             : !    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
      15             : !      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !
      20             : !    More information can be found here:
      21             : !    http://elpa.mpcdf.mpg.de/
      22             : !
      23             : !    ELPA is free software: you can redistribute it and/or modify
      24             : !    it under the terms of the version 3 of the license of the
      25             : !    GNU Lesser General Public License as published by the Free
      26             : !    Software Foundation.
      27             : !
      28             : !    ELPA is distributed in the hope that it will be useful,
      29             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      30             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      31             : !    GNU Lesser General Public License for more details.
      32             : !
      33             : !    You should have received a copy of the GNU Lesser General Public License
      34             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      35             : !
      36             : !    ELPA reflects a substantial effort on the part of the original
      37             : !    ELPA consortium, and we ask you to respect the spirit of the
      38             : !    license that we chose: i.e., please contribute any changes you
      39             : !    may have back to the original ELPA library distribution, and keep
      40             : !    any derivatives of ELPA under the same license that we chose for
      41             : !    the original distribution, the GNU Lesser General Public License.
      42             : !
      43             : !
      44             : ! --------------------------------------------------------------------------------------------------
      45             : !
      46             : ! This file contains the compute intensive kernels for the Householder transformations.
      47             : ! It should be compiled with the highest possible optimization level.
      48             : !
      49             : ! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU)
      50             : !
      51             : ! Copyright of the original code rests with the authors inside the ELPA
      52             : ! consortium. The copyright of any additional modifications shall rest
      53             : ! with their original authors, but shall adhere to the licensing terms
      54             : ! distributed along with the original code in the file "COPYING".
      55             : !
      56             : ! --------------------------------------------------------------------------------------------------
      57             : #endif
      58             : 
      59             : ! the intel compiler creates a temp copy of array q!
      60             : ! this should be prevented if possible without using assumed size arrays
      61             :   subroutine double_hh_trafo_&
      62             :   &MATH_DATATYPE&
      63             :   &_generic_&
      64      622592 :   &PRECISION&
      65      622592 :   & (q, hh, nb, nq, ldq, ldh)
      66             : 
      67             :     use precision
      68             :     use iso_c_binding
      69             :     use elpa_abstract_impl
      70             :     implicit none
      71             : 
      72             :     !class(elpa_abstract_impl_t), intent(inout) :: obj
      73             :     integer(kind=ik), intent(in)            :: nb, nq, ldq, ldh
      74             : #ifdef USE_ASSUMED_SIZE
      75             :     real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
      76             :     real(kind=C_DATATYPE_KIND), intent(in)    :: hh(ldh,*)
      77             : #else
      78             :     real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+1)
      79             :     real(kind=C_DATATYPE_KIND), intent(in)    :: hh(1:ldh,1:6)
      80             : #endif
      81             : 
      82             :     real(kind=C_DATATYPE_KIND)                :: s
      83             :     integer(kind=ik)                        :: i
      84             : 
      85             : !    equivalence(q(1,1),q_complex(1,1))
      86             : 
      87             :     ! Safety only:
      88             : 
      89             : !    call obj%timer%start("kernel generic: double_hh_trafo_&
      90             : !                     &MATH_DATATYPE&
      91             : !                    &_generic" // &
      92             : !                    &PRECISION_SUFFIX &
      93             : !                    )
      94             : 
      95      622592 :     if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
      96             : 
      97             :     ! Calculate dot product of the two Householder vectors
      98             : 
      99      622592 :     s = hh(2,2)*1.0
     100    39223296 :     do i=3,nb
     101    38600704 :        s = s+hh(i,2)*hh(i-1,1)
     102             :     enddo
     103             : 
     104             :     ! Do the Householder transformations
     105             : 
     106             : #ifndef USE_ASSUMED_SIZE
     107             : !    ! assign real data to compplex pointer
     108             : !    call c_f_pointer(c_loc(q), q_complex, [size(q,dim=1)/2,size(q,dim=2)])
     109             : #endif
     110             :     ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
     111             : 
     112     2654208 :     do i=1,nq-8,12
     113             : 
     114             : #ifdef USE_ASSUMED_SIZE
     115             :        call hh_trafo_kernel_12_generic_&
     116             :        &PRECISION&
     117     1015808 :        & (q(i,1),hh, nb, ldq, ldh, s)
     118             : #else
     119             :        call hh_trafo_kernel_12_generic_&
     120             :        &PRECISION&
     121     1015808 :        & (q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
     122             : #endif
     123             : 
     124             :     enddo
     125             : 
     126             :     ! i > nq-8 now, i.e. at most 8 rows remain
     127             : 
     128      622592 :     if(nq-i+1 > 4) then
     129             : 
     130             : #ifdef USE_ASSUMED_SIZE
     131             :        call hh_trafo_kernel_8_generic_&
     132             :        &PRECISION&
     133       90112 :        & (q(i,1),hh, nb, ldq, ldh, s)
     134             : #else
     135             :        call hh_trafo_kernel_8_generic_&
     136             :        &PRECISION&
     137       90112 :        & (q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s)
     138             : #endif
     139             : 
     140      442368 :     else if(nq-i+1 > 0) then
     141             : 
     142             : #ifdef USE_ASSUMED_SIZE
     143             :        call hh_trafo_kernel_4_generic_&
     144             :        &PRECISION&
     145      196608 :        & (q(i,1),hh, nb, ldq, ldh, s)
     146             : #else
     147             :        call hh_trafo_kernel_4_generic_&
     148             :        &PRECISION&
     149      196608 :        & (q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
     150             : #endif
     151             : 
     152             :     endif
     153             : 
     154             : !    call obj%timer%stop("kernel generic: double_hh_trafo_&
     155             : !                     &MATH_DATATYPE&
     156             : !                    &_generic" // &
     157             : !                    &PRECISION_SUFFIX &
     158             : !                    )
     159             : 
     160      622592 :   end subroutine
     161             : 
     162             :   ! --------------------------------------------------------------------------------------------------
     163             :   ! The following kernels perform the Householder transformation on Q for 12/8/4 rows.
     164             :   ! Please note that Q is declared complex*16 here.
     165             :   ! This is a hint for compilers that packed arithmetic can be used for Q
     166             :   ! (relevant for Intel SSE and BlueGene double hummer CPUs).
     167             :   ! --------------------------------------------------------------------------------------------------
     168             : 
     169             :   subroutine hh_trafo_kernel_12_generic_&
     170     2031616 :   &PRECISION&
     171     2031616 :   & (q, hh, nb, ldq, ldh, s)
     172             :     use precision
     173             :     implicit none
     174             :     integer(kind=ik), intent(in)    :: nb, ldq, ldh
     175             : #ifdef USE_ASSUMED_SIZE
     176             : #ifdef PACK_REAL_TO_COMPLEX
     177             :     complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
     178             : #else
     179             :     real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
     180             : #endif
     181             :     real(kind=C_DATATYPE_KIND), intent(in)    :: hh(ldh,*)
     182             : #else
     183             :     real(kind=C_DATATYPE_KIND), intent(inout) :: q(:,:)
     184             :     real(kind=C_DATATYPE_KIND), intent(in)    :: hh(ldh,2)
     185             : #endif
     186             :     real(kind=C_DATATYPE_KIND), intent(in)    :: s
     187             : 
     188             : #ifdef PACK_REAL_TO_COMPLEX
     189             :     complex(kind=SPECIAL_COMPLEX_DATATYPE)    :: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6
     190             : #else
     191             :     real(kind=C_DATATYPE_KIND)                :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, &
     192             :                                        y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12
     193             : #endif
     194             :     real(kind=C_DATATYPE_KIND)                   :: h1, h2, tau1, tau2
     195             :     integer(kind=ik)                :: i
     196             : 
     197             : 
     198             :     !call obj%timer%start("kernel generic: hh_trafo_kernel_12_generic" // &
     199             :     !                &PRECISION_SUFFIX &
     200             :     !                )
     201             : 
     202     2031616 :     x1  = q(1,2)
     203     2031616 :     x2  = q(2,2)
     204     2031616 :     x3  = q(3,2)
     205     2031616 :     x4  = q(4,2)
     206     2031616 :     x5  = q(5,2)
     207     2031616 :     x6  = q(6,2)
     208             : #ifndef PACK_REAL_TO_COMPLEX
     209     1015808 :     x7  = q(7,2)
     210     1015808 :     x8  = q(8,2)
     211     1015808 :     x9  = q(9,2)
     212     1015808 :     x10 = q(10,2)
     213     1015808 :     x11 = q(11,2)
     214     1015808 :     x12 = q(12,2)
     215             : #endif
     216             : 
     217     2031616 :     y1  = q(1 ,1) + q(1, 2)*hh(2,2)
     218     2031616 :     y2  = q(2 ,1) + q(2, 2)*hh(2,2)
     219     2031616 :     y3  = q(3 ,1) + q(3, 2)*hh(2,2)
     220     2031616 :     y4  = q(4 ,1) + q(4, 2)*hh(2,2)
     221     2031616 :     y5  = q(5 ,1) + q(5, 2)*hh(2,2)
     222     2031616 :     y6  = q(6 ,1) + q(6, 2)*hh(2,2)
     223             : #ifndef PACK_REAL_TO_COMPLEX
     224     1015808 :     y7  = q(7 ,1) + q(7, 2)*hh(2,2)
     225     1015808 :     y8  = q(8 ,1) + q(8, 2)*hh(2,2)
     226     1015808 :     y9  = q(9 ,1) + q(9, 2)*hh(2,2)
     227     1015808 :     y10 = q(10,1) + q(10,2)*hh(2,2)
     228     1015808 :     y11 = q(11,1) + q(11,2)*hh(2,2)
     229     1015808 :     y12 = q(12,1) + q(12,2)*hh(2,2)
     230             : #endif
     231             : 
     232             : #ifdef DOUBLE_PRECISION_REAL
     233             : #if defined(SSE_ALIGNED)
     234             :     !DEC$ VECTOR ALIGNED
     235             : #endif
     236             : #endif
     237             : 
     238   127991808 :     do i=3,nb
     239   125960192 :        h1  = hh(i-1,1)
     240   125960192 :        h2  = hh(i,2)
     241   125960192 :        x1  = x1 + q(1, i)*h1
     242   125960192 :        y1  = y1 + q(1, i)*h2
     243   125960192 :        x2  = x2 + q(2, i)*h1
     244   125960192 :        y2  = y2 + q(2, i)*h2
     245   125960192 :        x3  = x3 + q(3, i)*h1
     246   125960192 :        y3  = y3 + q(3, i)*h2
     247   125960192 :        x4  = x4 + q(4, i)*h1
     248   125960192 :        y4  = y4 + q(4, i)*h2
     249   125960192 :        x5  = x5 + q(5, i)*h1
     250   125960192 :        y5  = y5 + q(5, i)*h2
     251   125960192 :        x6  = x6 + q(6, i)*h1
     252   125960192 :        y6  = y6 + q(6, i)*h2
     253             : #ifndef PACK_REAL_TO_COMPLEX
     254    62980096 :        x7  = x7  + q(7, i)*h1
     255    62980096 :        y7  = y7  + q(7, i)*h2
     256    62980096 :        x8  = x8  + q(8, i)*h1
     257    62980096 :        y8  = y8  + q(8, i)*h2
     258    62980096 :        x9  = x9  + q(9, i)*h1
     259    62980096 :        y9  = y9  + q(9, i)*h2
     260    62980096 :        x10 = x10 + q(10,i)*h1
     261    62980096 :        y10 = y10 + q(10,i)*h2
     262    62980096 :        x11 = x11 + q(11,i)*h1
     263    62980096 :        y11 = y11 + q(11,i)*h2
     264    62980096 :        x12 = x12 + q(12,i)*h1
     265    62980096 :        y12 = y12 + q(12,i)*h2
     266             : #endif
     267             :     enddo
     268             : 
     269     2031616 :     x1  = x1  + q(1,nb+1)*hh(nb,1)
     270     2031616 :     x2  = x2  + q(2,nb+1)*hh(nb,1)
     271     2031616 :     x3  = x3  + q(3,nb+1)*hh(nb,1)
     272     2031616 :     x4  = x4  + q(4,nb+1)*hh(nb,1)
     273     2031616 :     x5  = x5  + q(5,nb+1)*hh(nb,1)
     274     2031616 :     x6  = x6  + q(6,nb+1)*hh(nb,1)
     275             : #ifndef PACK_REAL_TO_COMPLEX
     276     1015808 :     x7  = x7  + q(7, nb+1)*hh(nb,1)
     277     1015808 :     x8  = x8  + q(8, nb+1)*hh(nb,1)
     278     1015808 :     x9  = x9  + q(9, nb+1)*hh(nb,1)
     279     1015808 :     x10 = x10 + q(10,nb+1)*hh(nb,1)
     280     1015808 :     x11 = x11 + q(11,nb+1)*hh(nb,1)
     281     1015808 :     x12 = x12 + q(12,nb+1)*hh(nb,1)
     282             : 
     283             : #endif
     284             : 
     285     2031616 :     tau1 = hh(1,1)
     286     2031616 :     tau2 = hh(1,2)
     287             : 
     288     2031616 :     h1  = -tau1
     289     2031616 :     x1  = x1 *h1
     290     2031616 :     x2  = x2 *h1
     291     2031616 :     x3  = x3 *h1
     292     2031616 :     x4  = x4 *h1
     293     2031616 :     x5  = x5 *h1
     294     2031616 :     x6  = x6 *h1
     295             : #ifndef PACK_REAL_TO_COMPLEX
     296     1015808 :     x7  = x7 *h1
     297     1015808 :     x8  = x8 *h1
     298     1015808 :     x9  = x9 *h1
     299     1015808 :     x10 = x10*h1
     300     1015808 :     x11 = x11*h1
     301     1015808 :     x12 = x12*h1
     302             : #endif
     303             : 
     304     2031616 :     h1  = -tau2
     305     2031616 :     h2  = -tau2*s
     306     2031616 :     y1  = y1 *h1 + x1 *h2
     307     2031616 :     y2  = y2 *h1 + x2 *h2
     308     2031616 :     y3  = y3 *h1 + x3 *h2
     309     2031616 :     y4  = y4 *h1 + x4 *h2
     310     2031616 :     y5  = y5 *h1 + x5 *h2
     311     2031616 :     y6  = y6 *h1 + x6 *h2
     312             : #ifndef PACK_REAL_TO_COMPLEX
     313     1015808 :     y7  = y7 *h1 + x7 *h2
     314     1015808 :     y8  = y8 *h1 + x8 *h2
     315     1015808 :     y9  = y9 *h1 + x9 *h2
     316     1015808 :     y10 = y10*h1 + x10*h2
     317     1015808 :     y11 = y11*h1 + x11*h2
     318     1015808 :     y12 = y12*h1 + x12*h2
     319             : #endif
     320     2031616 :     q(1,1)  = q(1, 1) + y1
     321     2031616 :     q(2,1)  = q(2, 1) + y2
     322     2031616 :     q(3,1)  = q(3, 1) + y3
     323     2031616 :     q(4,1)  = q(4, 1) + y4
     324     2031616 :     q(5,1)  = q(5, 1) + y5
     325     2031616 :     q(6,1)  = q(6, 1) + y6
     326             : #ifndef PACK_REAL_TO_COMPLEX
     327     1015808 :     q(7 ,1) = q(7, 1) + y7
     328     1015808 :     q(8 ,1) = q(8, 1) + y8
     329     1015808 :     q(9 ,1) = q(9, 1) + y9
     330     1015808 :     q(10,1) = q(10,1) + y10
     331     1015808 :     q(11,1) = q(11,1) + y11
     332     1015808 :     q(12,1) = q(12,1) + y12
     333             : #endif
     334             : 
     335     2031616 :     q(1, 2) = q(1, 2) + x1  + y1 *hh(2,2)
     336     2031616 :     q(2, 2) = q(2, 2) + x2  + y2 *hh(2,2)
     337     2031616 :     q(3, 2) = q(3, 2) + x3  + y3 *hh(2,2)
     338     2031616 :     q(4, 2) = q(4, 2) + x4  + y4 *hh(2,2)
     339     2031616 :     q(5, 2) = q(5, 2) + x5  + y5 *hh(2,2)
     340     2031616 :     q(6, 2) = q(6, 2) + x6  + y6 *hh(2,2)
     341             : #ifndef PACK_REAL_TO_COMPLEX
     342     1015808 :     q(7, 2) = q(7, 2) + x7  + y7 *hh(2,2)
     343     1015808 :     q(8, 2) = q(8, 2) + x8  + y8 *hh(2,2)
     344     1015808 :     q(9, 2) = q(9, 2) + x9  + y9 *hh(2,2)
     345     1015808 :     q(10,2) = q(10,2) + x10 + y10*hh(2,2)
     346     1015808 :     q(11,2) = q(11,2) + x11 + y11*hh(2,2)
     347     1015808 :     q(12,2) = q(12,2) + x12 + y12*hh(2,2)
     348             : #endif
     349             : 
     350             : #ifdef DOUBLE_PRECISION_REAL
     351             : #if defined(SSE_ALIGNED)
     352             :     !DEC$ VECTOR ALIGNED
     353             : #endif
     354             : #endif
     355             : 
     356   127991808 :     do i=3,nb
     357   125960192 :        h1 = hh(i-1,1)
     358   125960192 :        h2 = hh(i,2)
     359   125960192 :        q(1, i) = q(1,i)  + x1 *h1 + y1 *h2
     360   125960192 :        q(2, i) = q(2,i)  + x2 *h1 + y2 *h2
     361   125960192 :        q(3, i) = q(3,i)  + x3 *h1 + y3 *h2
     362   125960192 :        q(4, i) = q(4,i)  + x4 *h1 + y4 *h2
     363   125960192 :        q(5, i) = q(5,i)  + x5 *h1 + y5 *h2
     364   125960192 :        q(6, i) = q(6,i)  + x6 *h1 + y6 *h2
     365             : #ifndef PACK_REAL_TO_COMPLEX
     366    62980096 :        q(7, i) = q(7, i) + x7 *h1 + y7 *h2
     367    62980096 :        q(8, i) = q(8, i) + x8 *h1 + y8 *h2
     368    62980096 :        q(9, i) = q(9, i) + x9 *h1 + y9 *h2
     369    62980096 :        q(10,i) = q(10,i) + x10*h1 + y10*h2
     370    62980096 :        q(11,i) = q(11,i) + x11*h1 + y11*h2
     371    62980096 :        q(12,i) = q(12,i) + x12*h1 + y12*h2
     372             : #endif
     373             :     enddo
     374             : 
     375     2031616 :     q(1, nb+1) = q(1, nb+1) + x1 *hh(nb,1)
     376     2031616 :     q(2, nb+1) = q(2, nb+1) + x2 *hh(nb,1)
     377     2031616 :     q(3, nb+1) = q(3, nb+1) + x3 *hh(nb,1)
     378     2031616 :     q(4, nb+1) = q(4, nb+1) + x4 *hh(nb,1)
     379     2031616 :     q(5, nb+1) = q(5, nb+1) + x5 *hh(nb,1)
     380     2031616 :     q(6, nb+1) = q(6, nb+1) + x6 *hh(nb,1)
     381             : #ifndef PACK_REAL_TO_COMPLEX
     382     1015808 :     q(7, nb+1) = q(7, nb+1) + x7 *hh(nb,1)
     383     1015808 :     q(8, nb+1) = q(8, nb+1) + x8 *hh(nb,1)
     384     1015808 :     q(9, nb+1) = q(9, nb+1) + x9 *hh(nb,1)
     385     1015808 :     q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1)
     386     1015808 :     q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1)
     387     1015808 :     q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1)
     388             : #endif
     389             : 
     390             : !    call obj%timer%stop("kernel generic: hh_trafo_kernel_12_generic" // &
     391             : !                    &PRECISION_SUFFIX &
     392             : !                    )
     393             : 
     394     2031616 :   end subroutine
     395             :   ! --------------------------------------------------------------------------------------------------
     396             : 
     397             :   subroutine hh_trafo_kernel_8_generic_&
     398      180224 :   &PRECISION&
     399      180224 :   & (q, hh, nb, ldq, ldh, s)
     400             :     use precision
     401             :     implicit none
     402             :     integer(kind=ik), intent(in)     :: nb, ldq, ldh
     403             : #ifdef USE_ASSUMED_SIZE
     404             : #ifdef PACK_REAL_TO_COMPLEX
     405             :     complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout)  :: q(ldq/2,*)
     406             : #else
     407             :     real(kind=C_DATATYPE_KIND), intent(inout)  :: q(ldq,*)
     408             : #endif
     409             :     real(kind=C_DATATYPE_KIND), intent(in)     :: hh(ldh,*)
     410             : #else
     411             :     real(kind=C_DATATYPE_KIND),   intent(inout):: q(:,:)
     412             :     real(kind=C_DATATYPE_KIND), intent(in)     :: hh(ldh,2)
     413             : #endif
     414             :     real(kind=C_DATATYPE_KIND), intent(in)     :: s
     415             : #ifdef PACK_REAL_TO_COMPLEX
     416             :     complex(kind=SPECIAL_COMPLEX_DATATYPE)     :: x1, x2, x3, x4, y1, y2, y3, y4
     417             : #else
     418             :     real(kind=C_DATATYPE_KIND)                 :: x1, x2, x3, x4, x5, x6, x7, x8, &
     419             :                                         y1, y2, y3, y4, y5, y6, y7, y8
     420             : #endif
     421             :     real(kind=C_DATATYPE_KIND)                    :: h1, h2, tau1, tau2
     422             :     integer(kind=ik)                 :: i
     423             : 
     424             : !    call obj%timer%start("kernel generic: hh_trafo_kernel_8_generic" // &
     425             : !                    &PRECISION_SUFFIX &
     426             : !                    )
     427      180224 :     x1 = q(1,2)
     428      180224 :     x2 = q(2,2)
     429      180224 :     x3 = q(3,2)
     430      180224 :     x4 = q(4,2)
     431             : #ifndef PACK_REAL_TO_COMPLEX
     432       90112 :     x5 = q(5,2)
     433       90112 :     x6 = q(6,2)
     434       90112 :     x7 = q(7,2)
     435       90112 :     x8 = q(8,2)
     436             : #endif
     437             : 
     438      180224 :     y1 = q(1,1) + q(1,2)*hh(2,2)
     439      180224 :     y2 = q(2,1) + q(2,2)*hh(2,2)
     440      180224 :     y3 = q(3,1) + q(3,2)*hh(2,2)
     441      180224 :     y4 = q(4,1) + q(4,2)*hh(2,2)
     442             : #ifndef PACK_REAL_TO_COMPLEX
     443       90112 :     y5 = q(5,1) + q(5,2)*hh(2,2)
     444       90112 :     y6 = q(6,1) + q(6,2)*hh(2,2)
     445       90112 :     y7 = q(7,1) + q(7,2)*hh(2,2)
     446       90112 :     y8 = q(8,1) + q(8,2)*hh(2,2)
     447             : #endif
     448             : 
     449             : #ifdef DOUBLE_PRECISION_REAL
     450             : #if defined(SSE_ALIGNED)
     451             :     !DEC$ VECTOR ALIGNED
     452             : #endif
     453             : #endif
     454             : 
     455    11354112 :     do i=3,nb
     456    11173888 :        h1 = hh(i-1,1)
     457    11173888 :        h2 = hh(i,2)
     458    11173888 :        x1 = x1 + q(1,i)*h1
     459    11173888 :        y1 = y1 + q(1,i)*h2
     460    11173888 :        x2 = x2 + q(2,i)*h1
     461    11173888 :        y2 = y2 + q(2,i)*h2
     462    11173888 :        x3 = x3 + q(3,i)*h1
     463    11173888 :        y3 = y3 + q(3,i)*h2
     464    11173888 :        x4 = x4 + q(4,i)*h1
     465    11173888 :        y4 = y4 + q(4,i)*h2
     466             : #ifndef PACK_REAL_TO_COMPLEX
     467     5586944 :        x5 = x5 + q(5,i)*h1
     468     5586944 :        y5 = y5 + q(5,i)*h2
     469     5586944 :        x6 = x6 + q(6,i)*h1
     470     5586944 :        y6 = y6 + q(6,i)*h2
     471     5586944 :        x7 = x7 + q(7,i)*h1
     472     5586944 :        y7 = y7 + q(7,i)*h2
     473     5586944 :        x8 = x8 + q(8,i)*h1
     474     5586944 :        y8 = y8 + q(8,i)*h2
     475             : #endif
     476             :     enddo
     477             : 
     478      180224 :     x1 = x1 + q(1,nb+1)*hh(nb,1)
     479      180224 :     x2 = x2 + q(2,nb+1)*hh(nb,1)
     480      180224 :     x3 = x3 + q(3,nb+1)*hh(nb,1)
     481      180224 :     x4 = x4 + q(4,nb+1)*hh(nb,1)
     482             : #ifndef PACK_REAL_TO_COMPLEX
     483       90112 :     x5 = x5 + q(5,nb+1)*hh(nb,1)
     484       90112 :     x6 = x6 + q(6,nb+1)*hh(nb,1)
     485       90112 :     x7 = x7 + q(7,nb+1)*hh(nb,1)
     486       90112 :     x8 = x8 + q(8,nb+1)*hh(nb,1)
     487             : #endif
     488             : 
     489      180224 :     tau1 = hh(1,1)
     490      180224 :     tau2 = hh(1,2)
     491             : 
     492      180224 :     h1 = -tau1
     493      180224 :     x1 = x1*h1
     494      180224 :     x2 = x2*h1
     495      180224 :     x3 = x3*h1
     496      180224 :     x4 = x4*h1
     497             : #ifndef PACK_REAL_TO_COMPLEX
     498       90112 :     x5 = x5*h1
     499       90112 :     x6 = x6*h1
     500       90112 :     x7 = x7*h1
     501       90112 :     x8 = x8*h1
     502             : #endif
     503      180224 :     h1 = -tau2
     504      180224 :     h2 = -tau2*s
     505      180224 :     y1 = y1*h1 + x1*h2
     506      180224 :     y2 = y2*h1 + x2*h2
     507      180224 :     y3 = y3*h1 + x3*h2
     508      180224 :     y4 = y4*h1 + x4*h2
     509             : #ifndef PACK_REAL_TO_COMPLEX
     510       90112 :     y5 = y5*h1 + x5*h2
     511       90112 :     y6 = y6*h1 + x6*h2
     512       90112 :     y7 = y7*h1 + x7*h2
     513       90112 :     y8 = y8*h1 + x8*h2
     514             : #endif
     515      180224 :     q(1,1) = q(1,1) + y1
     516      180224 :     q(2,1) = q(2,1) + y2
     517      180224 :     q(3,1) = q(3,1) + y3
     518      180224 :     q(4,1) = q(4,1) + y4
     519             : #ifndef PACK_REAL_TO_COMPLEX
     520       90112 :     q(5,1) = q(5,1) + y5
     521       90112 :     q(6,1) = q(6,1) + y6
     522       90112 :     q(7,1) = q(7,1) + y7
     523       90112 :     q(8,1) = q(8,1) + y8
     524             : #endif
     525      180224 :     q(1,2) = q(1,2) + x1 + y1*hh(2,2)
     526      180224 :     q(2,2) = q(2,2) + x2 + y2*hh(2,2)
     527      180224 :     q(3,2) = q(3,2) + x3 + y3*hh(2,2)
     528      180224 :     q(4,2) = q(4,2) + x4 + y4*hh(2,2)
     529             : #ifndef PACK_REAL_TO_COMPLEX
     530       90112 :     q(5,2) = q(5,2) + x5 + y5*hh(2,2)
     531       90112 :     q(6,2) = q(6,2) + x6 + y6*hh(2,2)
     532       90112 :     q(7,2) = q(7,2) + x7 + y7*hh(2,2)
     533       90112 :     q(8,2) = q(8,2) + x8 + y8*hh(2,2)
     534             : #endif
     535             : 
     536             : #ifdef DOUBLE_PRECISION_REAL
     537             : #if defined(SSE_ALIGNED)
     538             :     !DEC$ VECTOR ALIGNED
     539             : #endif
     540             : #endif
     541             : 
     542    11354112 :     do i=3,nb
     543    11173888 :        h1 = hh(i-1,1)
     544    11173888 :        h2 = hh(i,2)
     545    11173888 :        q(1,i) = q(1,i) + x1*h1 + y1*h2
     546    11173888 :        q(2,i) = q(2,i) + x2*h1 + y2*h2
     547    11173888 :        q(3,i) = q(3,i) + x3*h1 + y3*h2
     548    11173888 :        q(4,i) = q(4,i) + x4*h1 + y4*h2
     549             : #ifndef PACK_REAL_TO_COMPLEX
     550     5586944 :        q(5,i) = q(5,i) + x5*h1 + y5*h2
     551     5586944 :        q(6,i) = q(6,i) + x6*h1 + y6*h2
     552     5586944 :        q(7,i) = q(7,i) + x7*h1 + y7*h2
     553     5586944 :        q(8,i) = q(8,i) + x8*h1 + y8*h2
     554             : #endif
     555             :     enddo
     556             : 
     557      180224 :     q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
     558      180224 :     q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
     559      180224 :     q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
     560      180224 :     q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
     561             : #ifndef PACK_REAL_TO_COMPLEX
     562       90112 :     q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1)
     563       90112 :     q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1)
     564       90112 :     q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1)
     565       90112 :     q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
     566             : #endif
     567             : 
     568             : 
     569             : !    call obj%timer%stop("kernel generic: hh_trafo_kernel_8_generic" // &
     570             : !                    &PRECISION_SUFFIX &
     571             : !                    )
     572             : 
     573      180224 :   end subroutine
     574             :   ! --------------------------------------------------------------------------------------------------
     575             : 
     576             :   subroutine hh_trafo_kernel_4_generic_&
     577      393216 :   &PRECISION&
     578      393216 :   & (q, hh, nb, ldq, ldh, s)
     579             : 
     580             :     use precision
     581             :     implicit none
     582             :     integer(kind=ik), intent(in)    :: nb, ldq, ldh
     583             : #ifdef USE_ASSUMED_SIZE
     584             : #ifdef PACK_REAL_TO_COMPLEX
     585             :     complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
     586             : #else
     587             :     real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
     588             : #endif
     589             :     real(kind=C_DATATYPE_KIND), intent(in)    :: hh(ldh,*)
     590             : #else
     591             :     real(kind=C_DATATYPE_KIND), intent(inout) :: q(:,:) !q(1:ldq/2,1:nb+1)
     592             :     real(kind=C_DATATYPE_KIND), intent(in)    :: hh(ldh,2)
     593             : #endif
     594             :     real(kind=C_DATATYPE_KIND), intent(in)    :: s
     595             : 
     596             : #ifdef PACK_REAL_TO_COMPLEX
     597             :     complex(kind=SPECIAL_COMPLEX_DATATYPE)    :: x1, x2, y1, y2
     598             : #else
     599             :     real(kind=C_DATATYPE_KIND)                :: x1, x2, x3, x4, y1, y2, y3, y4
     600             : #endif
     601             :     real(kind=C_DATATYPE_KIND)                :: h1, h2, tau1, tau2
     602             :     integer(kind=ik)                :: i
     603             : 
     604             : !    call obj%timer%start("kernel generic: hh_trafo_kernel_4_generic" // &
     605             : !                    &PRECISION_SUFFIX &
     606             : !                    )
     607      393216 :     x1 = q(1,2)
     608      393216 :     x2 = q(2,2)
     609             : #ifndef PACK_REAL_TO_COMPLEX
     610      196608 :     x3 = q(3,2)
     611      196608 :     x4 = q(4,2)
     612             : #endif
     613             : 
     614      393216 :     y1 = q(1,1) + q(1,2)*hh(2,2)
     615      393216 :     y2 = q(2,1) + q(2,2)*hh(2,2)
     616             : #ifndef PACK_REAL_TO_COMPLEX
     617      196608 :     y3 = q(3,1) + q(3,2)*hh(2,2)
     618      196608 :     y4 = q(4,1) + q(4,2)*hh(2,2)
     619             : #endif
     620             : 
     621             : #ifdef DOUBLE_PRECISION_REAL
     622             : #if defined(SSE_ALIGNED)
     623             :     !DEC$ VECTOR ALIGNED
     624             : #endif
     625             : #endif
     626             : 
     627    24772608 :     do i=3,nb
     628    24379392 :        h1 = hh(i-1,1)
     629    24379392 :        h2 = hh(i,2)
     630    24379392 :        x1 = x1 + q(1,i)*h1
     631    24379392 :        y1 = y1 + q(1,i)*h2
     632    24379392 :        x2 = x2 + q(2,i)*h1
     633    24379392 :        y2 = y2 + q(2,i)*h2
     634             : #ifndef PACK_REAL_TO_COMPLEX
     635    12189696 :        x3 = x3 + q(3,i)*h1
     636    12189696 :        y3 = y3 + q(3,i)*h2
     637    12189696 :        x4 = x4 + q(4,i)*h1
     638    12189696 :        y4 = y4 + q(4,i)*h2
     639             : #endif
     640             :     enddo
     641             : 
     642      393216 :     x1 = x1 + q(1,nb+1)*hh(nb,1)
     643      393216 :     x2 = x2 + q(2,nb+1)*hh(nb,1)
     644             : #ifndef PACK_REAL_TO_COMPLEX
     645      196608 :     x3 = x3 + q(3,nb+1)*hh(nb,1)
     646      196608 :     x4 = x4 + q(4,nb+1)*hh(nb,1)
     647             : #endif
     648             : 
     649      393216 :     tau1 = hh(1,1)
     650      393216 :     tau2 = hh(1,2)
     651             : 
     652      393216 :     h1 = -tau1
     653      393216 :     x1 = x1*h1
     654      393216 :     x2 = x2*h1
     655             : #ifndef PACK_REAL_TO_COMPLEX
     656      196608 :     x3 = x3*h1
     657      196608 :     x4 = x4*h1
     658             : #endif
     659      393216 :     h1 = -tau2
     660      393216 :     h2 = -tau2*s
     661      393216 :     y1 = y1*h1 + x1*h2
     662      393216 :     y2 = y2*h1 + x2*h2
     663             : #ifndef PACK_REAL_TO_COMPLEX
     664      196608 :     y3 = y3*h1 + x3*h2
     665      196608 :     y4 = y4*h1 + x4*h2
     666             : #endif
     667             : 
     668      393216 :     q(1,1) = q(1,1) + y1
     669      393216 :     q(2,1) = q(2,1) + y2
     670             : #ifndef PACK_REAL_TO_COMPLEX
     671      196608 :     q(3,1) = q(3,1) + y3
     672      196608 :     q(4,1) = q(4,1) + y4
     673             : #endif
     674      393216 :     q(1,2) = q(1,2) + x1 + y1*hh(2,2)
     675      393216 :     q(2,2) = q(2,2) + x2 + y2*hh(2,2)
     676             : #ifndef PACK_REAL_TO_COMPLEX
     677      196608 :     q(3,2) = q(3,2) + x3 + y3*hh(2,2)
     678      196608 :     q(4,2) = q(4,2) + x4 + y4*hh(2,2)
     679             : #endif
     680             : 
     681             : #ifdef DOUBLE_PRECISION_REAL
     682             : #if defined(SSE_ALIGNED)
     683             :     !DEC$ VECTOR ALIGNED
     684             : #endif
     685             : #endif
     686    24772608 :     do i=3,nb
     687    24379392 :        h1 = hh(i-1,1)
     688    24379392 :        h2 = hh(i,2)
     689    24379392 :        q(1,i) = q(1,i) + x1*h1 + y1*h2
     690    24379392 :        q(2,i) = q(2,i) + x2*h1 + y2*h2
     691             : #ifndef PACK_REAL_TO_COMPLEX
     692    12189696 :        q(3,i) = q(3,i) + x3*h1 + y3*h2
     693    12189696 :        q(4,i) = q(4,i) + x4*h1 + y4*h2
     694             : #endif
     695             :     enddo
     696             : 
     697      393216 :     q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
     698      393216 :     q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
     699             : #ifndef PACK_REAL_TO_COMPLEX
     700      196608 :     q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
     701      196608 :     q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
     702             : #endif
     703             : 
     704             : !    call obj%timer%stop("kernel generic: hh_trafo_kernel_4_generic" // &
     705             : !                    &PRECISION_SUFFIX &
     706             : !                    )
     707             : 
     708      393216 :   end subroutine

Generated by: LCOV version 1.12