LCOV - code coverage report
Current view: top level - src/elpa2 - compute_hh_trafo.F90 (source / functions) Hit Total Coverage
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info Lines: 270 300 90.0 %
Date: 2018-01-10 09:29:53 Functions: 8 8 100.0 %

          Line data    Source code
       1             : #if 0
       2             : !    This file is part of ELPA.
       3             : !
       4             : !    The ELPA library was originally created by the ELPA consortium,
       5             : !    consisting of the following organizations:
       6             : !
       7             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       8             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       9             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
      10             : !      Informatik,
      11             : !    - Technische Universität München, Lehrstuhl für Informatik mit
      12             : !      Schwerpunkt Wissenschaftliches Rechnen ,
      13             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      14             : !    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
      15             : !      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
      16             : !      and
      17             : !    - IBM Deutschland GmbH
      18             : !
      19             : !
      20             : !    More information can be found here:
      21             : !    http://elpa.mpcdf.mpg.de/
      22             : !
      23             : !    ELPA is free software: you can redistribute it and/or modify
      24             : !    it under the terms of the version 3 of the license of the
      25             : !    GNU Lesser General Public License as published by the Free
      26             : !    Software Foundation.
      27             : !
      28             : !    ELPA is distributed in the hope that it will be useful,
      29             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      30             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      31             : !    GNU Lesser General Public License for more details.
      32             : !
      33             : !    You should have received a copy of the GNU Lesser General Public License
      34             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      35             : !
      36             : !    ELPA reflects a substantial effort on the part of the original
      37             : !    ELPA consortium, and we ask you to respect the spirit of the
      38             : !    license that we chose: i.e., please contribute any changes you
      39             : !    may have back to the original ELPA library distribution, and keep
      40             : !    any derivatives of ELPA under the same license that we chose for
      41             : !    the original distribution, the GNU Lesser General Public License.
      42             : !
      43             : ! This file was written by A. Marek, MPCDF
      44             : #endif
      45             : 
      46             :        subroutine compute_hh_trafo_&
      47             :        &MATH_DATATYPE&
      48             : #ifdef WITH_OPENMP
      49             :        &_openmp_&
      50             : #else
      51             :        &_&
      52             : #endif
      53     3491712 :        &PRECISION &
      54             :        (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count,  &
      55             : #ifdef WITH_OPENMP
      56             :        max_threads, l_nev, &
      57             : #endif
      58     3491712 :        a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
      59             : #if REALCASE == 1
      60             :        hh_dot_dev, &
      61             : #endif
      62             :        hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
      63             : #ifdef WITH_OPENMP
      64             :        my_thread, thread_width, &
      65             : #else
      66             :        last_stripe_width, &
      67             : #endif
      68             :        kernel)
      69             : 
      70             :          use precision
      71             :          use elpa_abstract_impl
      72             :          use iso_c_binding
      73             : #if REALCASE == 1
      74             : 
      75             :          use single_hh_trafo_real
      76             : #if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
      77             :          use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
      78             : #endif
      79             : 
      80             : #if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
      81             :          use real_generic_kernel !, only : double_hh_trafo_generic
      82             : #endif
      83             : 
      84             : #if defined(WITH_REAL_BGP_KERNEL)
      85             :          use real_bgp_kernel !, only : double_hh_trafo_bgp
      86             : #endif
      87             : 
      88             : #if defined(WITH_REAL_BGQ_KERNEL)
      89             :          use real_bgq_kernel !, only : double_hh_trafo_bgq
      90             : #endif
      91             : 
      92             : #endif /* REALCASE */
      93             : 
      94             : #if COMPLEXCASE == 1
      95             : 
      96             : #if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
      97             :            use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
      98             : #endif
      99             : #if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
     100             :            use complex_generic_kernel !, only : single_hh_trafo_complex_generic
     101             : #endif
     102             : 
     103             : #endif /* COMPLEXCASE */
     104             : 
     105             :          use cuda_c_kernel
     106             :          use cuda_functions
     107             : 
     108             :          use elpa_generated_fortran_interfaces
     109             : 
     110             :          implicit none
     111             :          class(elpa_abstract_impl_t), intent(inout) :: obj
     112             :          logical, intent(in)                        :: useGPU, wantDebug
     113             :          real(kind=c_double), intent(inout)         :: kernel_time  ! MPI_WTIME always needs double
     114             :          integer(kind=lik)                          :: kernel_flops
     115             :          integer(kind=ik), intent(in)               :: nbw, max_blk_size
     116             : #if REALCASE == 1
     117             :          real(kind=C_DATATYPE_KIND)                 :: bcast_buffer(nbw,max_blk_size)
     118             : #endif
     119             : #if COMPLEXCASE == 1
     120             :          complex(kind=C_DATATYPE_KIND)              :: bcast_buffer(nbw,max_blk_size)
     121             : #endif
     122             :          integer(kind=ik), intent(in)               :: a_off
     123             : 
     124             :          integer(kind=ik), intent(in)               :: stripe_width,a_dim2,stripe_count
     125             : 
     126             : #ifndef WITH_OPENMP
     127             :          integer(kind=ik), intent(in)               :: last_stripe_width
     128             : #if REALCASE == 1
     129             : !         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count)
     130             :          real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:)
     131             : #endif
     132             : #if COMPLEXCASE == 1
     133             : !          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count)
     134             :           complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:)
     135             : #endif
     136             : 
     137             : #else /* WITH_OPENMP */
     138             :          integer(kind=ik), intent(in)               :: max_threads, l_nev, thread_width
     139             : #if REALCASE == 1
     140             : !         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
     141             :          real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:,:)
     142             : #endif
     143             : #if COMPLEXCASE == 1
     144             : !          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
     145             :           complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:,:)
     146             : #endif
     147             : 
     148             : #endif /* WITH_OPENMP */
     149             : 
     150             :          integer(kind=ik), intent(in)               :: kernel
     151             : 
     152             :          integer(kind=c_intptr_t)                   :: a_dev
     153             :    integer(kind=c_intptr_t)                         :: bcast_buffer_dev
     154             : #if REALCASE == 1
     155             :          integer(kind=c_intptr_t)                   :: hh_dot_dev ! why not needed in complex case
     156             : #endif
     157             :          integer(kind=c_intptr_t)                   :: hh_tau_dev
     158             :          integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1, dev_offset_2
     159             : 
     160             :          ! Private variables in OMP regions (my_thread) should better be in the argument list!
     161             :          integer(kind=ik)                           :: off, ncols, istripe
     162             : #ifdef WITH_OPENMP
     163             :          integer(kind=ik)                           :: my_thread, noff
     164             : #endif
     165             :          integer(kind=ik)                           :: j, nl, jj, jjj, n_times
     166             : #if REALCASE == 1
     167     2588544 :          real(kind=C_DATATYPE_KIND)                 :: w(nbw,6)
     168             : #endif
     169             : #if COMPLEXCASE == 1
     170     4394880 :          complex(kind=C_DATATYPE_KIND)              :: w(nbw,2)
     171             : #endif
     172             :          real(kind=c_double)                        :: ttt ! MPI_WTIME always needs double
     173             : 
     174             : 
     175     3491712 :          j = -99
     176             : 
     177     3491712 :          if (wantDebug) then
     178           0 :            if (useGPU .and. &
     179             : #if REALCASE == 1
     180             :              ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
     181             : #endif
     182             : #if COMPLEXCASE == 1
     183             :              ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
     184             : #endif
     185           0 :              print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
     186           0 :              stop
     187             :            endif
     188             :          endif
     189             : 
     190             : #if REALCASE == 1
     191     1294272 :          if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
     192             : #endif
     193             : #if COMPLEXCASE == 1
     194     2197440 :          if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
     195             : #endif
     196             :            ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
     197           0 :            if (ncols < 1) then
     198           0 :              if (wantDebug) then
     199           0 :                print *, "Returning early from compute_hh_trafo"
     200             :              endif
     201           0 :              return
     202             :            endif
     203             :          endif
     204             : 
     205     3491712 :          if (wantDebug) call obj%timer%start("compute_hh_trafo_&
     206             :    &MATH_DATATYPE&
     207             : #ifdef WITH_OPENMP
     208             :          &_openmp" // &
     209             : #else
     210             :          &" // &
     211             : #endif
     212             :          &PRECISION_SUFFIX &
     213           0 :          )
     214             : 
     215             : 
     216             : #ifdef WITH_OPENMP
     217     1745856 :          if (my_thread==1) then
     218             : #endif
     219     3491712 :            ttt = mpi_wtime()
     220             : #ifdef WITH_OPENMP
     221             :          endif
     222             : #endif
     223             : 
     224             : #ifdef WITH_OPENMP
     225             : 
     226             : #if REALCASE == 1
     227      647136 :          if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
     228             :            print *,"compute_hh_trafo_&
     229             :                    &MATH_DATATYPE&
     230           0 :                    &_GPU OPENMP: not yet implemented"
     231           0 :            stop 1
     232             :          endif
     233             : #endif
     234             : #if COMPLEXCASE == 1
     235     1098720 :          if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
     236             :            print *,"compute_hh_trafo_&
     237             :                    &MATH_DATATYPE&
     238           0 :                    &_GPU OPENMP: not yet implemented"
     239           0 :            stop 1
     240             :          endif
     241             : #endif
     242             : #endif /* WITH_OPENMP */
     243             : 
     244             : #ifndef WITH_OPENMP
     245     1745856 :          nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
     246             : #else /* WITH_OPENMP */
     247             : 
     248     1745856 :          if (istripe<stripe_count) then
     249     1428480 :            nl = stripe_width
     250             :          else
     251      317376 :            noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
     252      317376 :            nl = min(my_thread*thread_width-noff, l_nev-noff)
     253      317376 :            if (nl<=0) then
     254           0 :              if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
     255             :        &MATH_DATATYPE&
     256             : #ifdef WITH_OPENMP
     257             :              &_openmp" // &
     258             : #else
     259             :              &" // &
     260             : #endif
     261             :              &PRECISION_SUFFIX &
     262           0 :              )
     263             : 
     264           0 :              return
     265             :            endif
     266             :          endif
     267             : #endif /* not WITH_OPENMP */
     268             : 
     269             : #if REALCASE == 1
     270             : ! GPU kernel real
     271     1294272 :          if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
     272           0 :            if (wantDebug) then
     273           0 :              call obj%timer%start("compute_hh_trafo: GPU")
     274             :            endif
     275             :            dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
     276             :                   &PRECISION&
     277             :                   &_&
     278           0 :                   &MATH_DATATYPE
     279             : 
     280             :            call launch_compute_hh_trafo_gpu_kernel_&
     281             :                 &MATH_DATATYPE&
     282             :                 &_&
     283             :                 &PRECISION&
     284           0 :                 & (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
     285             : #endif /* REALCASE */
     286             : #if COMPLEXCASE == 1
     287             : ! GPU kernel complex
     288     2197440 :          if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
     289           0 :            if (wantDebug) then
     290           0 :              call obj%timer%start("compute_hh_trafo: GPU")
     291             :            endif
     292             : 
     293             :            dev_offset = (0 + ( (  a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
     294             :                   &PRECISION&
     295             :                   &_&
     296           0 :                   &MATH_DATATYPE
     297             : 
     298             :            dev_offset_1 = (0 +  (  off-1 )* nbw) * size_of_&
     299             :                   &PRECISION&
     300             :                   &_&
     301           0 :                   &MATH_DATATYPE
     302             : 
     303             :            dev_offset_2 =( off-1 )* size_of_&
     304             :                   &PRECISION&
     305             :                   &_&
     306           0 :                   &MATH_DATATYPE
     307             : 
     308             :            call launch_compute_hh_trafo_gpu_kernel_&
     309             :                 &MATH_DATATYPE&
     310             :                 &_&
     311             :                 &PRECISION&
     312             :                 & (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
     313           0 :                                                          hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
     314             : 
     315             : 
     316             : #endif /* COMPLEXCASE */
     317           0 :            if (wantDebug) then
     318           0 :              call obj%timer%stop("compute_hh_trafo: GPU")
     319             :            endif
     320             : 
     321             :          else ! not CUDA kernel
     322             : 
     323     3491712 :            if (wantDebug) then
     324           0 :              call obj%timer%start("compute_hh_trafo: CPU")
     325             :            endif
     326             : #if REALCASE == 1
     327             : #ifndef WITH_FIXED_REAL_KERNEL
     328             :          if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
     329             :              kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
     330             :              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
     331             :              kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
     332             :              kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
     333             :              kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
     334             :              kernel .eq. ELPA_2STAGE_REAL_GENERIC    .or. &
     335             :              kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
     336             :              kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
     337     1294272 :              kernel .eq. ELPA_2STAGE_REAL_BGP .or.        &
     338             :              kernel .eq. ELPA_2STAGE_REAL_BGQ) then
     339             : #endif /* not WITH_FIXED_REAL_KERNEL */
     340             : 
     341             : #endif /* REALCASE */
     342             : !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
     343             : 
     344             :              !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
     345             : #if REALCASE == 1
     346             : ! generic kernel real case
     347             : #if defined(WITH_REAL_GENERIC_KERNEL)
     348             : #ifndef WITH_FIXED_REAL_KERNEL
     349      987840 :              if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
     350             : #endif /* not WITH_FIXED_REAL_KERNEL */
     351             : 
     352      510720 :                do j = ncols, 2, -2
     353      466944 :                  w(:,1) = bcast_buffer(1:nbw,j+off)
     354      466944 :                  w(:,2) = bcast_buffer(1:nbw,j+off-1)
     355             : 
     356             : #ifdef WITH_OPENMP
     357             : 
     358             : #ifdef USE_ASSUMED_SIZE
     359             :                  call double_hh_trafo_&
     360             :                       &MATH_DATATYPE&
     361             :                       &_generic_&
     362             :                       &PRECISION&
     363      116736 :                       & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
     364             : 
     365             : #else
     366             :                  call double_hh_trafo_&
     367             :                       &MATH_DATATYPE&
     368             :                       &_generic_&
     369             :                       &PRECISION&
     370             :                       & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
     371      116736 :                     nbw, nl, stripe_width, nbw)
     372             : #endif
     373             : 
     374             : #else /* WITH_OPENMP */
     375             : 
     376             : #ifdef USE_ASSUMED_SIZE
     377             :                  call double_hh_trafo_&
     378             :                       &MATH_DATATYPE&
     379             :                       &_generic_&
     380             :                       &PRECISION&
     381      116736 :                       & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
     382             : 
     383             : #else
     384             :                  call double_hh_trafo_&
     385             :                       &MATH_DATATYPE&
     386             :                       &_generic_&
     387             :                       &PRECISION&
     388      116736 :                       & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
     389             : #endif
     390             : #endif /* WITH_OPENMP */
     391             : 
     392             :                enddo
     393             : 
     394             : #ifndef WITH_FIXED_REAL_KERNEL
     395             :              endif
     396             : #endif /* not WITH_FIXED_REAL_KERNEL */
     397             : #endif /* WITH_REAL_GENERIC_KERNEL */
     398             : 
     399             : #endif /* REALCASE == 1 */
     400             : 
     401             : #if COMPLEXCASE == 1
     402             : ! generic kernel complex case
     403             : #if defined(WITH_COMPLEX_GENERIC_KERNEL)
     404             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     405             :            if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
     406     2197440 :                kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
     407             :                kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
     408             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     409       72576 :              ttt = mpi_wtime()
     410     1558656 :              do j = ncols, 1, -1
     411             : #ifdef WITH_OPENMP
     412             : #ifdef USE_ASSUMED_SIZE
     413             : 
     414             :               call single_hh_trafo_&
     415             :                    &MATH_DATATYPE&
     416             :                    &_generic_&
     417             :                    &PRECISION&
     418      371520 :                    & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     419             : #else
     420             :               call single_hh_trafo_&
     421             :                    &MATH_DATATYPE&
     422             :                    &_generic_&
     423             :                    &PRECISION&
     424             :                    & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
     425      371520 :                      bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
     426             : #endif
     427             : 
     428             : #else /* WITH_OPENMP */
     429             : 
     430             : #ifdef USE_ASSUMED_SIZE
     431             :               call single_hh_trafo_&
     432             :                    &MATH_DATATYPE&
     433             :                    &_generic_&
     434             :                    &PRECISION&
     435      371520 :                    & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     436             : #else
     437             :               call single_hh_trafo_&
     438             :                    &MATH_DATATYPE&
     439             :                    &_generic_&
     440             :                    &PRECISION&
     441             :                    & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
     442      371520 :                       nbw, nl, stripe_width)
     443             : #endif
     444             : #endif /* WITH_OPENMP */
     445             : 
     446             :             enddo
     447             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     448             :           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
     449             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     450             : #endif /* WITH_COMPLEX_GENERIC_KERNEL */
     451             : 
     452             : #endif /* COMPLEXCASE */
     453             : 
     454             : #if REALCASE == 1
     455             : 
     456             : 
     457             : ! generic simple real kernel
     458             : #if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
     459             : #ifndef WITH_FIXED_REAL_KERNEL
     460      987840 :              if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
     461             : #endif /* not WITH_FIXED_REAL_KERNEL */
     462      645120 :                do j = ncols, 2, -2
     463      589824 :                  w(:,1) = bcast_buffer(1:nbw,j+off)
     464      589824 :                  w(:,2) = bcast_buffer(1:nbw,j+off-1)
     465             : #ifdef WITH_OPENMP
     466             : 
     467             : #ifdef USE_ASSUMED_SIZE
     468             :                  call double_hh_trafo_&
     469             :                       &MATH_DATATYPE&
     470             :                       &_generic_simple_&
     471             :                       &PRECISION&
     472      147456 :                       & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
     473             : #else
     474             :                  call double_hh_trafo_&
     475             :                       &MATH_DATATYPE&
     476             :                       &_generic_simple_&
     477             :                       &PRECISION&
     478      147456 :                       & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
     479             : 
     480             : #endif
     481             : 
     482             : #else /* WITH_OPENMP */
     483             : 
     484             : #ifdef USE_ASSUMED_SIZE
     485             :                  call double_hh_trafo_&
     486             :                       &MATH_DATATYPE&
     487             :                       &_generic_simple_&
     488             :                       &PRECISION&
     489      147456 :                       & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
     490             : #else
     491             :                  call double_hh_trafo_&
     492             :                       &MATH_DATATYPE&
     493             :                       &_generic_simple_&
     494             :                       &PRECISION&
     495      147456 :                       & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
     496             : 
     497             : #endif
     498             : 
     499             : #endif /* WITH_OPENMP */
     500             : 
     501             :                enddo
     502             : #ifndef WITH_FIXED_REAL_KERNEL
     503             :              endif
     504             : #endif /* not WITH_FIXED_REAL_KERNEL */
     505             : #endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */
     506             : 
     507             : #endif /* REALCASE */
     508             : 
     509             : #if COMPLEXCASE == 1
     510             : ! generic simple complex case
     511             : 
     512             : #if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
     513             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     514     2197440 :             if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
     515             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     516       96768 :              ttt = mpi_wtime()
     517     2078208 :              do j = ncols, 1, -1
     518             : #ifdef WITH_OPENMP
     519             : #ifdef USE_ASSUMED_SIZE
     520             :                call single_hh_trafo_&
     521             :                     &MATH_DATATYPE&
     522             :                     &_generic_simple_&
     523             :                     &PRECISION&
     524      495360 :                     & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     525             : #else
     526             :                call single_hh_trafo_&
     527             :                     &MATH_DATATYPE&
     528             :                     &_generic_simple_&
     529             :                     &PRECISION&
     530             :                     & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
     531      495360 :                        nbw, nl, stripe_width)
     532             : #endif
     533             : 
     534             : #else /* WITH_OPENMP */
     535             : 
     536             : #ifdef USE_ASSUMED_SIZE
     537             :                call single_hh_trafo_&
     538             :                      &MATH_DATATYPE&
     539             :                      &_generic_simple_&
     540             :                      &PRECISION&
     541      495360 :                      & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     542             : #else
     543             :                call single_hh_trafo_&
     544             :                     &MATH_DATATYPE&
     545             :                     &_generic_simple_&
     546             :                     &PRECISION&
     547             :                     & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
     548      495360 :                        nbw, nl, stripe_width)
     549             : #endif
     550             : 
     551             : #endif /* WITH_OPENMP */
     552             :              enddo
     553             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     554             :            endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
     555             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     556             : #endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
     557             : 
     558             : #endif /* COMPLEXCASE */
     559             : 
     560             : #if REALCASE == 1
     561             : ! sse assembly kernel real case
     562             : #if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
     563             : #ifndef WITH_FIXED_REAL_KERNEL
     564      987840 :              if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
     565             : 
     566             : #endif /* not WITH_FIXED_REAL_KERNEL */
     567      510720 :                do j = ncols, 2, -2
     568      466944 :                  w(:,1) = bcast_buffer(1:nbw,j+off)
     569      466944 :                  w(:,2) = bcast_buffer(1:nbw,j+off-1)
     570             : #ifdef WITH_OPENMP
     571             :                  call double_hh_trafo_&
     572             :                  &MATH_DATATYPE&
     573             :                  &_&
     574             :                  &PRECISION&
     575             :                  &_sse_assembly&
     576      233472 :                  & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
     577             : #else
     578             :                  call double_hh_trafo_&
     579             :                       &MATH_DATATYPE&
     580             :                       &_&
     581             :                       &PRECISION&
     582             :                       &_sse_assembly&
     583      233472 :                       & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
     584             : #endif
     585             :                enddo
     586             : #ifndef WITH_FIXED_REAL_KERNEL
     587             :              endif
     588             : #endif /* not WITH_FIXED_REAL_KERNEL */
     589             : #endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */
     590             : 
     591             : #endif /* REALCASE */
     592             : 
     593             : #if COMPLEXCASE == 1
     594             : 
     595             : ! sse assembly kernel complex case
     596             : #if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
     597             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     598     2197440 :            if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
     599             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     600       72576 :              ttt = mpi_wtime()
     601     1558656 :              do j = ncols, 1, -1
     602             : #ifdef WITH_OPENMP
     603             :               call single_hh_trafo_&
     604             :                    &MATH_DATATYPE&
     605             :                    &_&
     606             :                    &PRECISION&
     607             :                    &_sse_assembly&
     608      743040 :                    & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     609             : #else
     610             :               call single_hh_trafo_&
     611             :                    &MATH_DATATYPE&
     612             :                    &_&
     613             :                    &PRECISION&
     614             :                    &_sse_assembly&
     615      743040 :                    & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     616             : #endif
     617             :             enddo
     618             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     619             :           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
     620             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     621             : #endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
     622             : #endif /* COMPLEXCASE */
     623             : 
     624             : #if REALCASE == 1
     625             : ! no sse, vsx, sparc64 block1 real kernel
     626             : #endif
     627             : 
     628             : #if COMPLEXCASE == 1
     629             : 
     630             : ! sparc64 block1 complex kernel
     631             : #if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
     632             : !#ifndef WITH_FIXED_COMPLEX_KERNEL
     633             : !          if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
     634             : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
     635             : !
     636             : !#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
     637             : !            ttt = mpi_wtime()
     638             : !            do j = ncols, 1, -1
     639             : !#ifdef WITH_OPENMP
     640             : !              call single_hh_trafo_&
     641             : !                   &MATH_DATATYPE&
     642             : !                   &_sparc64_1hv_&
     643             : !                   &PRECISION&
     644             : !                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     645             : !#else
     646             : !              call single_hh_trafo_&
     647             : !                   &MATH_DATATYPE&
     648             : !                   &_sparc64_1hv_&
     649             : !                   &PRECISION&
     650             : !                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     651             : !#endif
     652             : !            enddo
     653             : !#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
     654             : !
     655             : !#ifndef WITH_FIXED_COMPLEX_KERNEL
     656             : !          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
     657             : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
     658             : #endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */
     659             : 
     660             : #endif /* COMPLEXCASE */
     661             : 
     662             : 
     663             : #if COMPLEXCASE == 1
     664             : 
     665             : ! vsx block1 complex kernel
     666             : #if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
     667             : !#ifndef WITH_FIXED_COMPLEX_KERNEL
     668             : !          if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
     669             : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
     670             : !
     671             : !#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
     672             : !            ttt = mpi_wtime()
     673             : !            do j = ncols, 1, -1
     674             : !#ifdef WITH_OPENMP
     675             : !              call single_hh_trafo_&
     676             : !                   &MATH_DATATYPE&
     677             : !                   &_vsx_1hv_&
     678             : !                   &PRECISION&
     679             : !                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     680             : !#else
     681             : !              call single_hh_trafo_&
     682             : !                   &MATH_DATATYPE&
     683             : !                   &_vsx_1hv_&
     684             : !                   &PRECISION&
     685             : !                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     686             : !#endif
     687             : !            enddo
     688             : !#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
     689             : !
     690             : !#ifndef WITH_FIXED_COMPLEX_KERNEL
     691             : !          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
     692             : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
     693             : #endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */
     694             : 
     695             : #endif /* COMPLEXCASE */
     696             : 
     697             : 
     698             : #if COMPLEXCASE == 1
     699             : 
     700             : ! sse block1 complex kernel
     701             : #if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
     702             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     703     2197440 :           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
     704             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     705             : 
     706             : #if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
     707       72576 :             ttt = mpi_wtime()
     708     1558656 :             do j = ncols, 1, -1
     709             : #ifdef WITH_OPENMP
     710             :               call single_hh_trafo_&
     711             :                    &MATH_DATATYPE&
     712             :                    &_sse_1hv_&
     713             :                    &PRECISION&
     714      743040 :                    & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     715             : #else
     716             :               call single_hh_trafo_&
     717             :                    &MATH_DATATYPE&
     718             :                    &_sse_1hv_&
     719             :                    &PRECISION&
     720      743040 :                    & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     721             : #endif
     722             :             enddo
     723             : #endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
     724             : 
     725             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     726             :           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
     727             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     728             : #endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
     729             : 
     730             : #endif /* COMPLEXCASE */
     731             : 
     732             : #if REALCASE == 1
     733             : !no avx block1 real kernel
     734             : #endif /* REALCASE */
     735             : 
     736             : #if COMPLEXCASE == 1
     737             : 
     738             : ! avx block1 complex kernel
     739             : #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
     740             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     741     2197440 :           if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. &
     742             :               (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
     743             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     744             : 
     745             : #if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
     746      868896 :             ttt = mpi_wtime()
     747    74396448 :             do j = ncols, 1, -1
     748             : #ifdef WITH_OPENMP
     749             :               call single_hh_trafo_&
     750             :                    &MATH_DATATYPE&
     751             :                    &_avx_avx2_1hv_&
     752             :                    &PRECISION&
     753    36763776 :                    & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     754             : #else
     755             :               call single_hh_trafo_&
     756             :                    &MATH_DATATYPE&
     757             :                    &_avx_avx2_1hv_&
     758             :                    &PRECISION&
     759    36763776 :                    & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     760             : #endif
     761             :             enddo
     762             : #endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
     763             : 
     764             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     765             :           endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
     766             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     767             : #endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */
     768             : 
     769             : #endif /* COMPLEXCASE */
     770             : 
     771             : #if REALCASE == 1
     772             : ! no avx512 block1 real kernel
     773             : #endif /* REALCASE */
     774             : 
     775             : #if COMPLEXCASE == 1
     776             : 
     777             : ! avx512 block1 complex kernel
     778             : #if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
     779             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     780     1135008 :           if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
     781             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     782             : 
     783             : #if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
     784      760032 :             ttt = mpi_wtime()
     785    72058464 :             do j = ncols, 1, -1
     786             : #ifdef WITH_OPENMP
     787             :               call single_hh_trafo_&
     788             :                    &MATH_DATATYPE&
     789             :                    &_avx512_1hv_&
     790             :                    &PRECISION&
     791    35649216 :                    & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     792             : #else
     793             :               call single_hh_trafo_&
     794             :                    &MATH_DATATYPE&
     795             :                    &_avx512_1hv_&
     796             :                    &PRECISION&
     797    35649216 :                    & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
     798             : #endif
     799             :             enddo
     800             : #endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
     801             : 
     802             : #ifndef WITH_FIXED_COMPLEX_KERNEL
     803             :           endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
     804             : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
     805             : #endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */
     806             : #endif /* COMPLEXCASE */
     807             : 
     808             : #if REALCASE == 1
     809             : ! implementation of sparc64 block 2 real case
     810             : #if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)
     811             : 
     812             : #ifndef WITH_FIXED_REAL_KERNEL
     813             :            if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2) then
     814             : 
     815             : #endif /* not WITH_FIXED_REAL_KERNEL */
     816             : 
     817             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL))
     818             :              do j = ncols, 2, -2
     819             :                w(:,1) = bcast_buffer(1:nbw,j+off)
     820             :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
     821             : #ifdef WITH_OPENMP
     822             :                call double_hh_trafo_&
     823             :                     &MATH_DATATYPE&
     824             :                     &_sparc64_2hv_&
     825             :                     &PRECISION &
     826             :                     & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
     827             : #else
     828             :                call double_hh_trafo_&
     829             :                     &MATH_DATATYPE&
     830             :                     &_sparc64_2hv_&
     831             :                     &PRECISION &
     832             :                     & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
     833             : #endif
     834             :              enddo
     835             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)) */
     836             : 
     837             : #ifndef WITH_FIXED_REAL_KERNEL
     838             :            endif
     839             : #endif /* not WITH_FIXED_REAL_KERNEL */
     840             : #endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */
     841             : 
     842             : #endif /* REALCASE == 1 */
     843             : 
     844             : 
     845             : #if REALCASE == 1
     846             : ! implementation of vsx block 2 real case
     847             : #if defined(WITH_REAL_VSX_BLOCK2_KERNEL)
     848             : 
     849             : #ifndef WITH_FIXED_REAL_KERNEL
     850             :            if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2) then
     851             : 
     852             : #endif /* not WITH_FIXED_REAL_KERNEL */
     853             : 
     854             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL))
     855             :              do j = ncols, 2, -2
     856             :                w(:,1) = bcast_buffer(1:nbw,j+off)
     857             :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
     858             : #ifdef WITH_OPENMP
     859             :                call double_hh_trafo_&
     860             :                     &MATH_DATATYPE&
     861             :                     &_vsx_2hv_&
     862             :                     &PRECISION &
     863             :                     & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
     864             : #else
     865             :                call double_hh_trafo_&
     866             :                     &MATH_DATATYPE&
     867             :                     &_vsx_2hv_&
     868             :                     &PRECISION &
     869             :                     & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
     870             : #endif
     871             :              enddo
     872             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL)) */
     873             : 
     874             : #ifndef WITH_FIXED_REAL_KERNEL
     875             :            endif
     876             : #endif /* not WITH_FIXED_REAL_KERNEL */
     877             : #endif /* WITH_REAL_VSX_BLOCK2_KERNEL */
     878             : 
     879             : #endif /* REALCASE == 1 */
     880             : 
     881             : #if REALCASE == 1
     882             : ! implementation of sse block 2 real case
     883             : #if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
     884             : 
     885             : #ifndef WITH_FIXED_REAL_KERNEL
     886      987840 :            if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then
     887             : 
     888             : #endif /* not WITH_FIXED_REAL_KERNEL */
     889             : 
     890             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
     891      510720 :              do j = ncols, 2, -2
     892      466944 :                w(:,1) = bcast_buffer(1:nbw,j+off)
     893      466944 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
     894             : #ifdef WITH_OPENMP
     895             :                call double_hh_trafo_&
     896             :                     &MATH_DATATYPE&
     897             :                     &_sse_2hv_&
     898             :                     &PRECISION &
     899      233472 :                     & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
     900             : #else
     901             :                call double_hh_trafo_&
     902             :                     &MATH_DATATYPE&
     903             :                     &_sse_2hv_&
     904             :                     &PRECISION &
     905      233472 :                     & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
     906             : #endif
     907             :              enddo
     908             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
     909             : 
     910             : #ifndef WITH_FIXED_REAL_KERNEL
     911             :            endif
     912             : #endif /* not WITH_FIXED_REAL_KERNEL */
     913             : #endif /* WITH_REAL_SSE_BLOCK2_KERNEL */
     914             : 
     915             : #endif /* REALCASE == 1 */
     916             : 
     917             : 
     918             : #if COMPLEXCASE == 1
     919             : ! implementation of sparc64 block 2 complex case
     920             : 
     921             : #if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
     922             : !#ifndef WITH_FIXED_COMPLEX_KERNEL
     923             : !           if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
     924             : !#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
     925             : !
     926             : !             ttt = mpi_wtime()
     927             : !             do j = ncols, 2, -2
     928             : !               w(:,1) = bcast_buffer(1:nbw,j+off)
     929             : !               w(:,2) = bcast_buffer(1:nbw,j+off-1)
     930             : !#ifdef WITH_OPENMP
     931             : !               call double_hh_trafo_&
     932             : !                    &MATH_DATATYPE&
     933             : !                    &_sparc64_2hv_&
     934             : !                    &PRECISION&
     935             : !                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
     936             : !#else
     937             : !               call double_hh_trafo_&
     938             : !                    &MATH_DATATYPE&
     939             : !                    &_sparc64_2hv_&
     940             : !                    &PRECISION&
     941             : !                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
     942             : !#endif
     943             : !             enddo
     944             : !#ifdef WITH_OPENMP
     945             : !             if (j==1) call single_hh_trafo_&
     946             : !                 &MATH_DATATYPE&
     947             : !                       &_sparc64_1hv_&
     948             : !                       &PRECISION&
     949             : !                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
     950             : !#else
     951             : !             if (j==1) call single_hh_trafo_&
     952             : !                 &MATH_DATATYPE&
     953             : !                            &_sparc64_1hv_&
     954             : !                            &PRECISION&
     955             : !                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
     956             : !#endif
     957             : !
     958             : !#ifndef WITH_FIXED_COMPLEX_KERNEL
     959             : !           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
     960             : !#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
     961             : #endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
     962             : #endif /* COMPLEXCASE == 1 */
     963             : 
     964             : 
     965             : #if COMPLEXCASE == 1
     966             : ! implementation of vsx block 2 complex case
     967             : 
     968             : #if defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)
     969             : !#ifndef WITH_FIXED_COMPLEX_KERNEL
     970             : !           if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2) then
     971             : !#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
     972             : !
     973             : !             ttt = mpi_wtime()
     974             : !             do j = ncols, 2, -2
     975             : !               w(:,1) = bcast_buffer(1:nbw,j+off)
     976             : !               w(:,2) = bcast_buffer(1:nbw,j+off-1)
     977             : !#ifdef WITH_OPENMP
     978             : !               call double_hh_trafo_&
     979             : !                    &MATH_DATATYPE&
     980             : !                    &_vsx_2hv_&
     981             : !                    &PRECISION&
     982             : !                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
     983             : !#else
     984             : !               call double_hh_trafo_&
     985             : !                    &MATH_DATATYPE&
     986             : !                    &_vsx_2hv_&
     987             : !                    &PRECISION&
     988             : !                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
     989             : !#endif
     990             : !             enddo
     991             : !#ifdef WITH_OPENMP
     992             : !             if (j==1) call single_hh_trafo_&
     993             : !                 &MATH_DATATYPE&
     994             : !                       &_vsx_1hv_&
     995             : !                       &PRECISION&
     996             : !                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
     997             : !#else
     998             : !             if (j==1) call single_hh_trafo_&
     999             : !                 &MATH_DATATYPE&
    1000             : !                            &_vsx_1hv_&
    1001             : !                            &PRECISION&
    1002             : !                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
    1003             : !#endif
    1004             : !
    1005             : !#ifndef WITH_FIXED_COMPLEX_KERNEL
    1006             : !           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
    1007             : !#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
    1008             : #endif /* WITH_COMPLEX_VSX_BLOCK2_KERNEL */
    1009             : #endif /* COMPLEXCASE == 1 */
    1010             : 
    1011             : #if COMPLEXCASE == 1
    1012             : ! implementation of sse block 2 complex case
    1013             : 
    1014             : #if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
    1015             : #ifndef WITH_FIXED_COMPLEX_KERNEL
    1016     2197440 :            if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2) then
    1017             : #endif  /* not WITH_FIXED_COMPLEX_KERNEL */
    1018             : 
    1019       72576 :              ttt = mpi_wtime()
    1020      808704 :              do j = ncols, 2, -2
    1021      736128 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1022      736128 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1023             : #ifdef WITH_OPENMP
    1024             :                call double_hh_trafo_&
    1025             :                     &MATH_DATATYPE&
    1026             :                     &_sse_2hv_&
    1027             :                     &PRECISION&
    1028      368064 :                     & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1029             : #else
    1030             :                call double_hh_trafo_&
    1031             :                     &MATH_DATATYPE&
    1032             :                     &_sse_2hv_&
    1033             :                     &PRECISION&
    1034      368064 :                     & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1035             : #endif
    1036             :              enddo
    1037             : #ifdef WITH_OPENMP
    1038       36288 :              if (j==1) call single_hh_trafo_&
    1039             :                  &MATH_DATATYPE&
    1040             :                        &_sse_1hv_&
    1041             :                        &PRECISION&
    1042        6912 :                        & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
    1043             : #else
    1044       36288 :              if (j==1) call single_hh_trafo_&
    1045             :                  &MATH_DATATYPE&
    1046             :                             &_sse_1hv_&
    1047             :                             &PRECISION&
    1048        6912 :                             & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
    1049             : #endif
    1050             : 
    1051             : #ifndef WITH_FIXED_COMPLEX_KERNEL
    1052             :            endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2)
    1053             : #endif  /* not WITH_FIXED_COMPLEX_KERNEL */
    1054             : #endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
    1055             : #endif /* COMPLEXCASE == 1 */
    1056             : 
    1057             : #if REALCASE == 1
    1058             : ! implementation of avx block 2 real case
    1059             : 
    1060             : #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
    1061             : #ifndef WITH_FIXED_REAL_KERNEL
    1062             : 
    1063      987840 :            if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2) .or. &
    1064             :                (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2))  then
    1065             : 
    1066             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1067             : 
    1068             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
    1069    17778672 :                do j = ncols, 2, -2
    1070    17345232 :                  w(:,1) = bcast_buffer(1:nbw,j+off)
    1071    17345232 :                  w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1072             : #ifdef WITH_OPENMP
    1073             : 
    1074             :                call double_hh_trafo_&
    1075             :                     &MATH_DATATYPE&
    1076             :                     &_avx_avx2_2hv_&
    1077             :                     &PRECISION&
    1078     8672616 :                     & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1079             : #else
    1080             :                call double_hh_trafo_&
    1081             :                     &MATH_DATATYPE&
    1082             :                     &_avx_avx2_2hv_&
    1083             :                     &PRECISION&
    1084     8672616 :                     & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1085             : #endif
    1086             :                enddo
    1087             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
    1088             : 
    1089             : #ifndef WITH_FIXED_REAL_KERNEL
    1090             :              endif
    1091             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1092             : #endif /* WITH_REAL_AVX_BLOCK2_KERNEL || WITH_REAL_AVX2_BLOCK2_KERNEL */
    1093             : 
    1094             : #endif /* REALCASE */
    1095             : 
    1096             : #if COMPLEXCASE == 1
    1097             : 
    1098             : ! implementation of avx block 2 complex case
    1099             : #if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
    1100             : #ifndef WITH_FIXED_COMPLEX_KERNEL
    1101     2197440 :            if ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. &
    1102             :                 (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) ) then
    1103             : #endif  /* not WITH_FIXED_COMPLEX_KERNEL */
    1104             : 
    1105      145152 :               ttt = mpi_wtime()
    1106     1617408 :              do j = ncols, 2, -2
    1107     1472256 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1108     1472256 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1109             : #ifdef WITH_OPENMP
    1110             :                call double_hh_trafo_&
    1111             :                     &MATH_DATATYPE&
    1112             :                     &_avx_avx2_2hv_&
    1113             :                     &PRECISION&
    1114      736128 :                     & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1115             : #else
    1116             :                call double_hh_trafo_&
    1117             :                     &MATH_DATATYPE&
    1118             :                     &_avx_avx2_2hv_&
    1119             :                     &PRECISION&
    1120      736128 :                     & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1121             : #endif
    1122             :              enddo
    1123             : #ifdef WITH_OPENMP
    1124       72576 :              if (j==1) call single_hh_trafo_&
    1125             :                  &MATH_DATATYPE&
    1126             :                  &_avx_avx2_1hv_&
    1127             :                  &PRECISION&
    1128       13824 :                  & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
    1129             : #else
    1130       72576 :              if (j==1) call single_hh_trafo_&
    1131             :                  &MATH_DATATYPE&
    1132             :                  &_avx_avx2_1hv_&
    1133             :                  &PRECISION&
    1134       13824 :                  & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
    1135             : #endif
    1136             : 
    1137             : #ifndef WITH_FIXED_COMPLEX_KERNEL
    1138             :            endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) )
    1139             : #endif  /* not WITH_FIXED_COMPLEX_KERNEL */
    1140             : #endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL */
    1141             : 
    1142             : #endif /* COMPLEXCASE */
    1143             : 
    1144             : #if REALCASE == 1
    1145             : ! implementation of avx512 block 2 real case
    1146             : 
    1147             : #if defined(WITH_REAL_AVX512_BLOCK2_KERNEL)
    1148             : #ifndef WITH_FIXED_REAL_KERNEL
    1149             : 
    1150      504864 :            if ((kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2)) then
    1151             : 
    1152             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1153             : 
    1154             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL))
    1155    17012592 :                do j = ncols, 2, -2
    1156    16644816 :                  w(:,1) = bcast_buffer(1:nbw,j+off)
    1157    16644816 :                  w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1158             : #ifdef WITH_OPENMP
    1159             : 
    1160             :                call double_hh_trafo_&
    1161             :                     &MATH_DATATYPE&
    1162             :                     &_avx512_2hv_&
    1163             :                     &PRECISION&
    1164     8322408 :                     & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1165             : #else
    1166             :                call double_hh_trafo_&
    1167             :                     &MATH_DATATYPE&
    1168             :                     &_avx512_2hv_&
    1169             :                     &PRECISION&
    1170     8322408 :                     & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1171             : #endif
    1172             :                enddo
    1173             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
    1174             : 
    1175             : #ifndef WITH_FIXED_REAL_KERNEL
    1176             :              endif
    1177             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1178             : #endif /* WITH_REAL_AVX512_BLOCK2_KERNEL */
    1179             : 
    1180             : #endif /* REALCASE */
    1181             : 
    1182             : #if COMPLEXCASE == 1
    1183             : 
    1184             : ! implementation of avx512 block 2 complex case
    1185             : #if defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
    1186             : #ifndef WITH_FIXED_COMPLEX_KERNEL
    1187     1135008 :            if ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2)) then
    1188             : #endif  /* not WITH_FIXED_COMPLEX_KERNEL */
    1189             : 
    1190       36288 :               ttt = mpi_wtime()
    1191      404352 :              do j = ncols, 2, -2
    1192      368064 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1193      368064 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1194             : #ifdef WITH_OPENMP
    1195             :                call double_hh_trafo_&
    1196             :                     &MATH_DATATYPE&
    1197             :                     &_avx512_2hv_&
    1198             :                     &PRECISION&
    1199      184032 :                     & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1200             : #else
    1201             :                call double_hh_trafo_&
    1202             :                     &MATH_DATATYPE&
    1203             :                     &_avx512_2hv_&
    1204             :                     &PRECISION&
    1205      184032 :                     & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1206             : #endif
    1207             :              enddo
    1208             : #ifdef WITH_OPENMP
    1209       18144 :              if (j==1) call single_hh_trafo_&
    1210             :                  &MATH_DATATYPE&
    1211             :                  &_avx512_1hv_&
    1212             :                  &PRECISION&
    1213        3456 :                  & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
    1214             : #else
    1215       18144 :              if (j==1) call single_hh_trafo_&
    1216             :                  &MATH_DATATYPE&
    1217             :                  &_avx512_1hv_&
    1218             :                  &PRECISION&
    1219        3456 :                  & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
    1220             : #endif
    1221             : 
    1222             : #ifndef WITH_FIXED_COMPLEX_KERNEL
    1223             :            endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2))
    1224             : #endif  /* not WITH_FIXED_COMPLEX_KERNEL */
    1225             : #endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */
    1226             : #endif /* COMPLEXCASE */
    1227             : 
    1228             : 
    1229             : #if REALCASE == 1
    1230             : 
    1231             : #if defined(WITH_REAL_BGP_KERNEL)
    1232             : #ifndef WITH_FIXED_REAL_KERNEL
    1233             :              if (kernel .eq. ELPA_2STAGE_REAL_BGP) then
    1234             : 
    1235             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1236             :                do j = ncols, 2, -2
    1237             :                  w(:,1) = bcast_buffer(1:nbw,j+off)
    1238             :                  w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1239             : #ifdef WITH_OPENMP
    1240             :                  call double_hh_trafo_bgp_&
    1241             :                       &PRECISION&
    1242             :                       & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
    1243             : #else
    1244             :                  call double_hh_trafo_bgp_&
    1245             :                       &PRECISION&
    1246             :                       & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
    1247             : #endif
    1248             :                enddo
    1249             : #ifndef WITH_FIXED_REAL_KERNEL
    1250             :              endif
    1251             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1252             : #endif /* WITH_REAL_BGP_KERNEL */
    1253             : 
    1254             : #if defined(WITH_REAL_BGQ_KERNEL)
    1255             : #ifndef WITH_FIXED_REAL_KERNEL
    1256             :              if (kernel .eq. ELPA_2STAGE_REAL_BGQ) then
    1257             : 
    1258             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1259             :                do j = ncols, 2, -2
    1260             :                  w(:,1) = bcast_buffer(1:nbw,j+off)
    1261             :                  w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1262             : #ifdef WITH_OPENMP
    1263             :                  call double_hh_trafo_bgq_&
    1264             :                       &PRECISION&
    1265             :                       & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
    1266             : #else
    1267             :                  call double_hh_trafo_bgq_&PRECISION&
    1268             :                       & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
    1269             : #endif
    1270             :                enddo
    1271             : #ifndef WITH_FIXED_REAL_KERNEL
    1272             :              endif
    1273             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1274             : #endif /* WITH_REAL_BGQ_KERNEL */
    1275             : 
    1276             : #endif /* REALCASE */
    1277             : 
    1278             : #if COMPLEXCASE == 1
    1279             : ! complex bgp/bgq kernel implemented
    1280             : #endif
    1281             : 
    1282             : 
    1283             : #if REALCASE == 1
    1284             : #ifdef WITH_OPENMP
    1285      493920 :              if (j==1) call single_hh_trafo_&
    1286             :                  &MATH_DATATYPE&
    1287             :                  &_cpu_openmp_&
    1288             :                  &PRECISION&
    1289             :                  & (a(1:stripe_width, 1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
    1290       39168 :                           bcast_buffer(1:nbw,off+1), nbw, nl,stripe_width)
    1291             : #else
    1292      493920 :              if (j==1) call single_hh_trafo_&
    1293             :                  &MATH_DATATYPE&
    1294             :                  &_cpu_&
    1295             :                  &PRECISION&
    1296             :                  & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
    1297       39168 :                           stripe_width)
    1298             : #endif
    1299             : 
    1300             : #endif /* REALCASE == 1 */
    1301             : 
    1302             : #if REALCASE == 1
    1303             : #ifndef WITH_FIXED_REAL_KERNEL
    1304             :            endif !
    1305             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1306             : #endif /* REALCASE == 1 */
    1307             : 
    1308             : #if REALCASE == 1
    1309             : ! sparc64 block4 real kernel
    1310             : 
    1311             : #if defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)
    1312             : #ifndef WITH_FIXED_REAL_KERNEL
    1313             :            if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4) then
    1314             : 
    1315             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1316             : 
    1317             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL))
    1318             :              ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
    1319             :              do j = ncols, 4, -4
    1320             :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1321             :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1322             :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1323             :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1324             : #ifdef WITH_OPENMP
    1325             :                call quad_hh_trafo_&
    1326             :                     &MATH_DATATYPE&
    1327             :                     &_sparc64_4hv_&
    1328             :                     &PRECISION&
    1329             :                     & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1330             : #else
    1331             :                call quad_hh_trafo_&
    1332             :                     &MATH_DATATYPE&
    1333             :                     &_sparc64_4hv_&
    1334             :                     &PRECISION&
    1335             :                     & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
    1336             : #endif
    1337             :              enddo
    1338             :              do jj = j, 2, -2
    1339             :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1340             :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1341             : #ifdef WITH_OPENMP
    1342             :                call double_hh_trafo_&
    1343             :                     &MATH_DATATYPE&
    1344             :                     &_sparc64_2hv_&
    1345             :                     &PRECISION&
    1346             :                     & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1347             : #else
    1348             :                call double_hh_trafo_&
    1349             :                     &MATH_DATATYPE&
    1350             :                     &_sparc64_2hv_&
    1351             :                     &PRECISION&
    1352             :                     & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1353             : #endif
    1354             :              enddo
    1355             : #ifdef WITH_OPENMP
    1356             :              if (jj==1) call single_hh_trafo_&
    1357             :                   &MATH_DATATYPE&
    1358             :                   &_cpu_openmp_&
    1359             :                   &PRECISION&
    1360             :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    1361             :                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1362             : #else
    1363             :              if (jj==1) call single_hh_trafo_&
    1364             :                   &MATH_DATATYPE&
    1365             :                   &_cpu_&
    1366             :                   &PRECISION&
    1367             :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1368             : #endif
    1369             : 
    1370             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)) */
    1371             : 
    1372             : #ifndef WITH_FIXED_REAL_KERNEL
    1373             :            endif
    1374             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1375             : #endif /* WITH_REAL_SPARC64_BLOCK4_KERNEL */
    1376             : 
    1377             : #endif /* REALCASE */
    1378             : 
    1379             : #if REALCASE == 1
    1380             : ! vsx block4 real kernel
    1381             : 
    1382             : #if defined(WITH_REAL_VSX_BLOCK4_KERNEL)
    1383             : #ifndef WITH_FIXED_REAL_KERNEL
    1384             :            if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4) then
    1385             : 
    1386             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1387             : 
    1388             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL))
    1389             :              ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
    1390             :              do j = ncols, 4, -4
    1391             :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1392             :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1393             :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1394             :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1395             : #ifdef WITH_OPENMP
    1396             :                call quad_hh_trafo_&
    1397             :                     &MATH_DATATYPE&
    1398             :                     &_vsx_4hv_&
    1399             :                     &PRECISION&
    1400             :                     & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1401             : #else
    1402             :                call quad_hh_trafo_&
    1403             :                     &MATH_DATATYPE&
    1404             :                     &_vsx_4hv_&
    1405             :                     &PRECISION&
    1406             :                     & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
    1407             : #endif
    1408             :              enddo
    1409             :              do jj = j, 2, -2
    1410             :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1411             :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1412             : #ifdef WITH_OPENMP
    1413             :                call double_hh_trafo_&
    1414             :                     &MATH_DATATYPE&
    1415             :                     &_vsx_2hv_&
    1416             :                     &PRECISION&
    1417             :                     & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1418             : #else
    1419             :                call double_hh_trafo_&
    1420             :                     &MATH_DATATYPE&
    1421             :                     &_vsx_2hv_&
    1422             :                     &PRECISION&
    1423             :                     & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1424             : #endif
    1425             :              enddo
    1426             : #ifdef WITH_OPENMP
    1427             :              if (jj==1) call single_hh_trafo_&
    1428             :                   &MATH_DATATYPE&
    1429             :                   &_cpu_openmp_&
    1430             :                   &PRECISION&
    1431             :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    1432             :                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1433             : #else
    1434             :              if (jj==1) call single_hh_trafo_&
    1435             :                   &MATH_DATATYPE&
    1436             :                   &_cpu_&
    1437             :                   &PRECISION&
    1438             :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1439             : #endif
    1440             : 
    1441             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL)) */
    1442             : 
    1443             : #ifndef WITH_FIXED_REAL_KERNEL
    1444             :            endif
    1445             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1446             : #endif /* WITH_REAL_VSX_BLOCK4_KERNEL */
    1447             : 
    1448             : #endif /* REALCASE */
    1449             : 
    1450             : #if REALCASE == 1
    1451             : ! sse block4 real kernel
    1452             : 
    1453             : #if defined(WITH_REAL_SSE_BLOCK4_KERNEL)
    1454             : #ifndef WITH_FIXED_REAL_KERNEL
    1455     1294272 :            if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK4) then
    1456             : 
    1457             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1458             : 
    1459             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL))
    1460             :              ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
    1461      269952 :              do j = ncols, 4, -4
    1462      226176 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1463      226176 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1464      226176 :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1465      226176 :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1466             : #ifdef WITH_OPENMP
    1467             :                call quad_hh_trafo_&
    1468             :                     &MATH_DATATYPE&
    1469             :                     &_sse_4hv_&
    1470             :                     &PRECISION&
    1471      113088 :                     & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1472             : #else
    1473             :                call quad_hh_trafo_&
    1474             :                     &MATH_DATATYPE&
    1475             :                     &_sse_4hv_&
    1476             :                     &PRECISION&
    1477      113088 :                     & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
    1478             : #endif
    1479             :              enddo
    1480       58368 :              do jj = j, 2, -2
    1481       14592 :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1482       14592 :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1483             : #ifdef WITH_OPENMP
    1484             :                call double_hh_trafo_&
    1485             :                     &MATH_DATATYPE&
    1486             :                     &_sse_2hv_&
    1487             :                     &PRECISION&
    1488        7296 :                     & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1489             : #else
    1490             :                call double_hh_trafo_&
    1491             :                     &MATH_DATATYPE&
    1492             :                     &_sse_2hv_&
    1493             :                     &PRECISION&
    1494        7296 :                     & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1495             : #endif
    1496             :              enddo
    1497             : #ifdef WITH_OPENMP
    1498       21888 :              if (jj==1) call single_hh_trafo_&
    1499             :                   &MATH_DATATYPE&
    1500             :                   &_cpu_openmp_&
    1501             :                   &PRECISION&
    1502             :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    1503        3648 :                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1504             : #else
    1505       21888 :              if (jj==1) call single_hh_trafo_&
    1506             :                   &MATH_DATATYPE&
    1507             :                   &_cpu_&
    1508             :                   &PRECISION&
    1509        3648 :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1510             : #endif
    1511             : 
    1512             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */
    1513             : 
    1514             : #ifndef WITH_FIXED_REAL_KERNEL
    1515             :            endif
    1516             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1517             : #endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
    1518             : 
    1519             : #endif /* REALCASE */
    1520             : 
    1521             : #if COMPLEXCASE == 1
    1522             : !no sse block4 complex kernel
    1523             : #endif /* COMPLEXCASE */
    1524             : 
    1525             : #if REALCASE == 1
    1526             : ! avx block4 real kernel
    1527             : #if defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL)
    1528             : #ifndef WITH_FIXED_REAL_KERNEL
    1529     1294272 :            if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK4) .or.  &
    1530             :                (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK4)) then
    1531             : 
    1532             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1533             : 
    1534             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
    1535             :              ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
    1536      539904 :              do j = ncols, 4, -4
    1537      452352 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1538      452352 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1539      452352 :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1540      452352 :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1541             : #ifdef WITH_OPENMP
    1542             :                call quad_hh_trafo_&
    1543             :                     &MATH_DATATYPE&
    1544             :                     &_avx_avx2_4hv_&
    1545             :                     &PRECISION&
    1546      226176 :                     & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1547             : #else
    1548             :                call quad_hh_trafo_&
    1549             :                     &MATH_DATATYPE&
    1550             :                     &_avx_avx2_4hv_&
    1551             :                     &PRECISION&
    1552      226176 :                     & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
    1553             : #endif
    1554             :              enddo
    1555      116736 :              do jj = j, 2, -2
    1556       29184 :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1557       29184 :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1558             : #ifdef WITH_OPENMP
    1559             :                call double_hh_trafo_&
    1560             :                     &MATH_DATATYPE&
    1561             :                     &_avx_avx2_2hv_&
    1562             :                     &PRECISION&
    1563       14592 :                     & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)),w, nbw, nl, stripe_width, nbw)
    1564             : #else
    1565             :                call double_hh_trafo_&
    1566             :                     &MATH_DATATYPE&
    1567             :                     &_avx_avx2_2hv_&
    1568             :                     &PRECISION&
    1569       14592 :                     & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1570             : #endif
    1571             :              enddo
    1572             : #ifdef WITH_OPENMP
    1573       43776 :              if (jj==1) call single_hh_trafo_&
    1574             :                   &MATH_DATATYPE&
    1575             :                   &_cpu_openmp_&
    1576             :                   &PRECISION&
    1577             :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    1578        7296 :                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1579             : #else
    1580       43776 :              if (jj==1) call single_hh_trafo_&
    1581             :                   &MATH_DATATYPE&
    1582             :                   &_cpu_&
    1583             :                   &PRECISION&
    1584        7296 :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1585             : #endif
    1586             : 
    1587             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */
    1588             : 
    1589             : #ifndef WITH_FIXED_REAL_KERNEL
    1590             :            endif
    1591             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1592             : #endif /* WITH_REAL_AVX_BLOCK4_KERNEL || WITH_REAL_AVX2_BLOCK4_KERNEL */
    1593             : 
    1594             : #endif /* REALCASE */
    1595             : 
    1596             : #if COMPLEXCASE == 1
    1597             : !no avx block4 complex kernel
    1598             : #endif
    1599             : 
    1600             : #if REALCASE == 1
    1601             : ! avx512 block4 real kernel
    1602             : 
    1603             : #if defined(WITH_REAL_AVX512_BLOCK4_KERNEL)
    1604             : #ifndef WITH_FIXED_REAL_KERNEL
    1605      679968 :            if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4) then
    1606             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1607             : 
    1608             : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL))
    1609             :              ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
    1610      134976 :              do j = ncols, 4, -4
    1611      113088 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1612      113088 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1613      113088 :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1614      113088 :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1615             : #ifdef WITH_OPENMP
    1616             :                call quad_hh_trafo_&
    1617             :                     &MATH_DATATYPE&
    1618             :                     &_avx512_4hv_&
    1619             :                     &PRECISION&
    1620       56544 :                     & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1621             : #else
    1622             :                call quad_hh_trafo_&
    1623             :                     &MATH_DATATYPE&
    1624             :                     &_avx512_4hv_&
    1625             :                     &PRECISION&
    1626       56544 :                     & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
    1627             : #endif
    1628             :              enddo
    1629       29184 :              do jj = j, 2, -2
    1630        7296 :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1631        7296 :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1632             : #ifdef WITH_OPENMP
    1633             :                call double_hh_trafo_&
    1634             :                     &MATH_DATATYPE&
    1635             :                     &_avx512_2hv_&
    1636             :                     &PRECISION&
    1637        3648 :                     & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1638             : #else
    1639             :                call double_hh_trafo_&
    1640             :                     &MATH_DATATYPE&
    1641             :                     &_avx512_2hv_&
    1642             :                     &PRECISION&
    1643        3648 :                     & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1644             : #endif
    1645             :              enddo
    1646             : #ifdef WITH_OPENMP
    1647       10944 :              if (jj==1) call single_hh_trafo_&
    1648             :                   &MATH_DATATYPE&
    1649             :                   &_cpu_openmp_&
    1650             :                   &PRECISION&
    1651             :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    1652        1824 :                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1653             : #else
    1654       10944 :              if (jj==1) call single_hh_trafo_&
    1655             :                   &MATH_DATATYPE&
    1656             :                   &_cpu_&
    1657             :                   &PRECISION&
    1658             :                   & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
    1659        1824 :                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1660             : #endif
    1661             : 
    1662             : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) ) */
    1663             : 
    1664             : #ifndef WITH_FIXED_REAL_KERNEL
    1665             :            endif
    1666             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1667             : #endif /* WITH_REAL_AVX512_BLOCK4_KERNEL */
    1668             : 
    1669             : #endif /* REALCASE */
    1670             : 
    1671             : #if COMPLEXCASE == 1
    1672             : !no avx512 block4 complex kernel
    1673             : #endif /* COMPLEXCASE */
    1674             : 
    1675             : 
    1676             : #if REALCASE == 1
    1677             : !sparc64 block6 real kernel
    1678             : #if defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)
    1679             : #ifndef WITH_FIXED_REAL_KERNEL
    1680             :            if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6) then
    1681             : 
    1682             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1683             :              ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
    1684             :              do j = ncols, 6, -6
    1685             :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1686             :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1687             :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1688             :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1689             :                w(:,5) = bcast_buffer(1:nbw,j+off-4)
    1690             :                w(:,6) = bcast_buffer(1:nbw,j+off-5)
    1691             : #ifdef WITH_OPENMP
    1692             :                call hexa_hh_trafo_&
    1693             :                     &MATH_DATATYPE&
    1694             :                     &_sparc64_6hv_&
    1695             :                     &PRECISION&
    1696             :                     & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1697             : #else
    1698             :                call hexa_hh_trafo_&
    1699             :                     &MATH_DATATYPE&
    1700             :                     &_sparc64_6hv_&
    1701             :                     &PRECISION&
    1702             :                     & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
    1703             : #endif
    1704             :              enddo
    1705             :              do jj = j, 4, -4
    1706             :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1707             :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1708             :                w(:,3) = bcast_buffer(1:nbw,jj+off-2)
    1709             :                w(:,4) = bcast_buffer(1:nbw,jj+off-3)
    1710             : #ifdef WITH_OPENMP
    1711             :                call quad_hh_trafo_&
    1712             :                     &MATH_DATATYPE&
    1713             :                     &_sparc64_4hv_&
    1714             :                     &PRECISION&
    1715             :                     & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1716             : #else
    1717             :                call quad_hh_trafo_&
    1718             :                     &MATH_DATATYPE&
    1719             :                     &_sparc64_4hv_&
    1720             :                     &PRECISION&
    1721             :                     & (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
    1722             :                                                   nbw, nl, stripe_width, nbw)
    1723             : #endif
    1724             :              enddo
    1725             :              do jjj = jj, 2, -2
    1726             :                w(:,1) = bcast_buffer(1:nbw,jjj+off)
    1727             :                w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
    1728             : #ifdef WITH_OPENMP
    1729             :                call double_hh_trafo_&
    1730             :                     &MATH_DATATYPE&
    1731             :                     &_sparc64_2hv_&
    1732             :                     &PRECISION&
    1733             :                     & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1734             : #else
    1735             :                call double_hh_trafo_&
    1736             :                     &MATH_DATATYPE&
    1737             :                     &_sparc64_2hv_&
    1738             :                     &PRECISION&
    1739             :                     & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1740             : #endif
    1741             :              enddo
    1742             : #ifdef WITH_OPENMP
    1743             :              if (jjj==1) call single_hh_trafo_&
    1744             :                    &MATH_DATATYPE&
    1745             :                    &_cpu_openmp_&
    1746             :                    &PRECISION&
    1747             :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    1748             :                                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1749             : #else
    1750             :              if (jjj==1) call single_hh_trafo_&
    1751             :                    &MATH_DATATYPE&
    1752             :                    &_cpu_&
    1753             :                    &PRECISION&
    1754             :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1755             : #endif
    1756             : #ifndef WITH_FIXED_REAL_KERNEL
    1757             :            endif
    1758             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1759             : #endif /* WITH_REAL_SPARC64_BLOCK6_KERNEL */
    1760             : 
    1761             : #endif /* REALCASE */
    1762             : 
    1763             : #if REALCASE == 1
    1764             : !vsx block6 real kernel
    1765             : #if defined(WITH_REAL_VSX_BLOCK6_KERNEL)
    1766             : #ifndef WITH_FIXED_REAL_KERNEL
    1767             :            if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6) then
    1768             : 
    1769             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1770             :              ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
    1771             :              do j = ncols, 6, -6
    1772             :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1773             :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1774             :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1775             :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1776             :                w(:,5) = bcast_buffer(1:nbw,j+off-4)
    1777             :                w(:,6) = bcast_buffer(1:nbw,j+off-5)
    1778             : #ifdef WITH_OPENMP
    1779             :                call hexa_hh_trafo_&
    1780             :                     &MATH_DATATYPE&
    1781             :                     &_vsx_6hv_&
    1782             :                     &PRECISION&
    1783             :                     & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1784             : #else
    1785             :                call hexa_hh_trafo_&
    1786             :                     &MATH_DATATYPE&
    1787             :                     &_vsx_6hv_&
    1788             :                     &PRECISION&
    1789             :                     & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
    1790             : #endif
    1791             :              enddo
    1792             :              do jj = j, 4, -4
    1793             :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1794             :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1795             :                w(:,3) = bcast_buffer(1:nbw,jj+off-2)
    1796             :                w(:,4) = bcast_buffer(1:nbw,jj+off-3)
    1797             : #ifdef WITH_OPENMP
    1798             :                call quad_hh_trafo_&
    1799             :                     &MATH_DATATYPE&
    1800             :                     &_vsx_4hv_&
    1801             :                     &PRECISION&
    1802             :                     & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1803             : #else
    1804             :                call quad_hh_trafo_&
    1805             :                     &MATH_DATATYPE&
    1806             :                     &_vsx_4hv_&
    1807             :                     &PRECISION&
    1808             :                     & (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
    1809             :                                                   nbw, nl, stripe_width, nbw)
    1810             : #endif
    1811             :              enddo
    1812             :              do jjj = jj, 2, -2
    1813             :                w(:,1) = bcast_buffer(1:nbw,jjj+off)
    1814             :                w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
    1815             : #ifdef WITH_OPENMP
    1816             :                call double_hh_trafo_&
    1817             :                     &MATH_DATATYPE&
    1818             :                     &_vsx_2hv_&
    1819             :                     &PRECISION&
    1820             :                     & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1821             : #else
    1822             :                call double_hh_trafo_&
    1823             :                     &MATH_DATATYPE&
    1824             :                     &_vsx_2hv_&
    1825             :                     &PRECISION&
    1826             :                     & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1827             : #endif
    1828             :              enddo
    1829             : #ifdef WITH_OPENMP
    1830             :              if (jjj==1) call single_hh_trafo_&
    1831             :                    &MATH_DATATYPE&
    1832             :                    &_cpu_openmp_&
    1833             :                    &PRECISION&
    1834             :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    1835             :                                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1836             : #else
    1837             :              if (jjj==1) call single_hh_trafo_&
    1838             :                    &MATH_DATATYPE&
    1839             :                    &_cpu_&
    1840             :                    &PRECISION&
    1841             :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1842             : #endif
    1843             : #ifndef WITH_FIXED_REAL_KERNEL
    1844             :            endif
    1845             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1846             : #endif /* WITH_REAL_VSX_BLOCK6_KERNEL */
    1847             : 
    1848             : #endif /* REALCASE */
    1849             : 
    1850             : #if REALCASE == 1
    1851             : !sse block6 real kernel
    1852             : #if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
    1853             : #ifndef WITH_FIXED_REAL_KERNEL
    1854     1294272 :            if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK6) then
    1855             : 
    1856             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1857             :              ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
    1858      189696 :              do j = ncols, 6, -6
    1859      145920 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1860      145920 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1861      145920 :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1862      145920 :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1863      145920 :                w(:,5) = bcast_buffer(1:nbw,j+off-4)
    1864      145920 :                w(:,6) = bcast_buffer(1:nbw,j+off-5)
    1865             : #ifdef WITH_OPENMP
    1866             :                call hexa_hh_trafo_&
    1867             :                     &MATH_DATATYPE&
    1868             :                     &_sse_6hv_&
    1869             :                     &PRECISION&
    1870       72960 :                     & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1871             : #else
    1872             :                call hexa_hh_trafo_&
    1873             :                     &MATH_DATATYPE&
    1874             :                     &_sse_6hv_&
    1875             :                     &PRECISION&
    1876       72960 :                     & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
    1877             : #endif
    1878             :              enddo
    1879       56544 :              do jj = j, 4, -4
    1880       12768 :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1881       12768 :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1882       12768 :                w(:,3) = bcast_buffer(1:nbw,jj+off-2)
    1883       12768 :                w(:,4) = bcast_buffer(1:nbw,jj+off-3)
    1884             : #ifdef WITH_OPENMP
    1885             :                call quad_hh_trafo_&
    1886             :                     &MATH_DATATYPE&
    1887             :                     &_sse_4hv_&
    1888             :                     &PRECISION&
    1889        6384 :                     & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1890             : #else
    1891             :                call quad_hh_trafo_&
    1892             :                     &MATH_DATATYPE&
    1893             :                     &_sse_4hv_&
    1894             :                     &PRECISION&
    1895             :                     & (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
    1896        6384 :                                                   nbw, nl, stripe_width, nbw)
    1897             : #endif
    1898             :              enddo
    1899       47424 :              do jjj = jj, 2, -2
    1900        3648 :                w(:,1) = bcast_buffer(1:nbw,jjj+off)
    1901        3648 :                w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
    1902             : #ifdef WITH_OPENMP
    1903             :                call double_hh_trafo_&
    1904             :                     &MATH_DATATYPE&
    1905             :                     &_sse_2hv_&
    1906             :                     &PRECISION&
    1907        1824 :                     & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1908             : #else
    1909             :                call double_hh_trafo_&
    1910             :                     &MATH_DATATYPE&
    1911             :                     &_sse_2hv_&
    1912             :                     &PRECISION&
    1913        1824 :                     & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    1914             : #endif
    1915             :              enddo
    1916             : #ifdef WITH_OPENMP
    1917       21888 :              if (jjj==1) call single_hh_trafo_&
    1918             :                    &MATH_DATATYPE&
    1919             :                    &_cpu_openmp_&
    1920             :                    &PRECISION&
    1921             :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    1922        3648 :                                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1923             : #else
    1924       21888 :              if (jjj==1) call single_hh_trafo_&
    1925             :                    &MATH_DATATYPE&
    1926             :                    &_cpu_&
    1927             :                    &PRECISION&
    1928        3648 :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    1929             : #endif
    1930             : #ifndef WITH_FIXED_REAL_KERNEL
    1931             :            endif
    1932             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1933             : #endif /* WITH_REAL_SSE_BLOCK6_KERNEL */
    1934             : 
    1935             : #endif /* REALCASE */
    1936             : 
    1937             : #if COMPLEXCASE == 1
    1938             : ! no sse block6 complex kernel
    1939             : #endif
    1940             : 
    1941             : #if REALCASE == 1
    1942             : ! avx block6 real kernel
    1943             : 
    1944             : #if defined(WITH_REAL_AVX_BLOCK6_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
    1945             : #ifndef WITH_FIXED_REAL_KERNEL
    1946     1294272 :            if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK6) .or. &
    1947             :                (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK6)) then
    1948             : 
    1949             : #endif /* not WITH_FIXED_REAL_KERNEL */
    1950             :              ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
    1951      379392 :              do j = ncols, 6, -6
    1952      291840 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    1953      291840 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    1954      291840 :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    1955      291840 :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    1956      291840 :                w(:,5) = bcast_buffer(1:nbw,j+off-4)
    1957      291840 :                w(:,6) = bcast_buffer(1:nbw,j+off-5)
    1958             : #ifdef WITH_OPENMP
    1959             :                call hexa_hh_trafo_&
    1960             :                     &MATH_DATATYPE&
    1961             :                     &_avx_avx2_6hv_&
    1962             :                     &PRECISION&
    1963      145920 :                     & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1964             : #else
    1965             :                call hexa_hh_trafo_&
    1966             :                     &MATH_DATATYPE&
    1967             :                     &_avx_avx2_6hv_&
    1968             :                     &PRECISION&
    1969      145920 :                     & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
    1970             : #endif
    1971             :              enddo
    1972      113088 :              do jj = j, 4, -4
    1973       25536 :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    1974       25536 :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    1975       25536 :                w(:,3) = bcast_buffer(1:nbw,jj+off-2)
    1976       25536 :                w(:,4) = bcast_buffer(1:nbw,jj+off-3)
    1977             : #ifdef WITH_OPENMP
    1978             :                call quad_hh_trafo_&
    1979             :                     &MATH_DATATYPE&
    1980             :                     &_avx_avx2_4hv_&
    1981             :                     &PRECISION&
    1982       12768 :                     & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    1983             : #else
    1984             :                call quad_hh_trafo_&
    1985             :                     &MATH_DATATYPE&
    1986             :                     &_avx_avx2_4hv_&
    1987             :                     &PRECISION&
    1988       12768 :                     & (c_loc(a(1,jj+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
    1989             : #endif
    1990             :              enddo
    1991       94848 :              do jjj = jj, 2, -2
    1992        7296 :                w(:,1) = bcast_buffer(1:nbw,jjj+off)
    1993        7296 :                w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
    1994             : #ifdef WITH_OPENMP
    1995             :                call double_hh_trafo_&
    1996             :                     &MATH_DATATYPE&
    1997             :                     &_avx_avx2_2hv_&
    1998             :                     &PRECISION&
    1999        3648 :                     & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    2000             : #else
    2001             :                call double_hh_trafo_&
    2002             :                     &MATH_DATATYPE&
    2003             :                     &_avx_avx2_2hv_&
    2004             :                     &PRECISION&
    2005        3648 :                     & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    2006             : #endif
    2007             :              enddo
    2008             : #ifdef WITH_OPENMP
    2009       43776 :              if (jjj==1) call single_hh_trafo_&
    2010             :                    &MATH_DATATYPE&
    2011             :                    &_cpu_openmp_&
    2012             :                    &PRECISION&
    2013             :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    2014        7296 :                                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    2015             : #else
    2016       43776 :              if (jjj==1) call single_hh_trafo_&
    2017             :                    &MATH_DATATYPE&
    2018             :                    &_cpu_&
    2019             :                    &PRECISION&
    2020             :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
    2021        7296 :                                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    2022             : #endif
    2023             : #ifndef WITH_FIXED_REAL_KERNEL
    2024             :            endif
    2025             : #endif /* not WITH_FIXED_REAL_KERNEL */
    2026             : #endif /* WITH_REAL_AVX_BLOCK6_KERNEL || WITH_REAL_AVX2_BLOCK6_KERNEL */
    2027             : 
    2028             : #endif /* REALCASE */
    2029             : 
    2030             : #if COMPLEXCASE == 1
    2031             : !no avx block6 complex kernel
    2032             : #endif
    2033             : 
    2034             : #if REALCASE == 1
    2035             : ! avx512 block6 kernel
    2036             : #if defined(WITH_REAL_AVX512_BLOCK6_KERNEL)
    2037             : #ifndef WITH_FIXED_REAL_KERNEL
    2038      679968 :            if ((kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6)) then
    2039             : #endif /* not WITH_FIXED_REAL_KERNEL */
    2040             :              ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
    2041             : 
    2042       94848 :              do j = ncols, 6, -6
    2043       72960 :                w(:,1) = bcast_buffer(1:nbw,j+off)
    2044       72960 :                w(:,2) = bcast_buffer(1:nbw,j+off-1)
    2045       72960 :                w(:,3) = bcast_buffer(1:nbw,j+off-2)
    2046       72960 :                w(:,4) = bcast_buffer(1:nbw,j+off-3)
    2047       72960 :                w(:,5) = bcast_buffer(1:nbw,j+off-4)
    2048       72960 :                w(:,6) = bcast_buffer(1:nbw,j+off-5)
    2049             : #ifdef WITH_OPENMP
    2050             :                call hexa_hh_trafo_&
    2051             :                     &MATH_DATATYPE&
    2052             :                     &_avx512_6hv_&
    2053             :                     &PRECISION&
    2054       36480 :                     & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    2055             : #else
    2056             :                call hexa_hh_trafo_&
    2057             :                     &MATH_DATATYPE&
    2058             :                     &_avx512_6hv_&
    2059             :                     &PRECISION&
    2060       36480 :                     & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
    2061             : #endif
    2062             :              enddo
    2063       28272 :              do jj = j, 4, -4
    2064        6384 :                w(:,1) = bcast_buffer(1:nbw,jj+off)
    2065        6384 :                w(:,2) = bcast_buffer(1:nbw,jj+off-1)
    2066        6384 :                w(:,3) = bcast_buffer(1:nbw,jj+off-2)
    2067        6384 :                w(:,4) = bcast_buffer(1:nbw,jj+off-3)
    2068             : #ifdef WITH_OPENMP
    2069             :                call quad_hh_trafo_&
    2070             :                     &MATH_DATATYPE&
    2071             :                     &_avx512_4hv_&
    2072             :                     &PRECISION&
    2073        3192 :                     & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    2074             : #else
    2075             :                call quad_hh_trafo_&
    2076             :                     &MATH_DATATYPE&
    2077             :                     &_avx512_4hv_&
    2078             :                     &PRECISION&
    2079        3192 :                     & (c_loc(a(1,jj+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
    2080             : #endif
    2081             :              enddo
    2082       23712 :              do jjj = jj, 2, -2
    2083        1824 :                w(:,1) = bcast_buffer(1:nbw,jjj+off)
    2084        1824 :                w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
    2085             : #ifdef WITH_OPENMP
    2086             :                call double_hh_trafo_&
    2087             :                     &MATH_DATATYPE&
    2088             :                     &_avx512_2hv_&
    2089             :                     &PRECISION&
    2090         912 :                     & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
    2091             : #else
    2092             :                call double_hh_trafo_&
    2093             :                     &MATH_DATATYPE&
    2094             :                     &_avx512_2hv_&
    2095             :                     &PRECISION&
    2096         912 :                     & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
    2097             : #endif
    2098             :              enddo
    2099             : #ifdef WITH_OPENMP
    2100       10944 :              if (jjj==1) call single_hh_trafo_&
    2101             :                    &MATH_DATATYPE&
    2102             :                    &_cpu_openmp_&
    2103             :                    &PRECISION&
    2104             :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
    2105        1824 :                                            bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    2106             : #else
    2107       10944 :              if (jjj==1) call single_hh_trafo_&
    2108             :                    &MATH_DATATYPE&
    2109             :                    &_cpu_&
    2110             :                    &PRECISION&
    2111        1824 :                    & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
    2112             : #endif
    2113             : #ifndef WITH_FIXED_REAL_KERNEL
    2114             :            endif
    2115             : #endif /* not WITH_FIXED_REAL_KERNEL */
    2116             : #endif /* WITH_REAL_AVX512_BLOCK6_KERNEL */
    2117             : 
    2118             : #endif /* REALCASE */
    2119             : 
    2120             : #if COMPLEXCASE == 1
    2121             : !no avx512 block6 complex kernel
    2122             : #endif /* COMPLEXCASE */
    2123             : 
    2124     3491712 :            if (wantDebug) then
    2125           0 :              call obj%timer%stop("compute_hh_trafo: CPU")
    2126             :            endif
    2127             :          endif ! GPU_KERNEL
    2128             : 
    2129             : #ifdef WITH_OPENMP
    2130     1745856 :          if (my_thread==1) then
    2131             : #endif
    2132     3491712 :            kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
    2133     3491712 :            kernel_time = kernel_time + mpi_wtime()-ttt
    2134     3491712 :      n_times = n_times + 1
    2135             : #ifdef WITH_OPENMP
    2136             :          endif
    2137             : #endif
    2138             : 
    2139     3491712 :          if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
    2140             :          &MATH_DATATYPE&
    2141             : #ifdef WITH_OPENMP
    2142             :          &_openmp" // &
    2143             : #else
    2144             :          &" // &
    2145             : #endif
    2146             :          &PRECISION_SUFFIX &
    2147           0 :          )
    2148             : 
    2149     3491712 :        end subroutine
    2150             : 
    2151             : ! vim: syntax=fortran

Generated by: LCOV version 1.12