LCOV - code coverage report
Current view: top level - src/elpa2/kernels - real_avx-avx2_2hv_template.c (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53
                    Hit    Total    Coverage
Lines:              430      634      67.8 %
Functions:            2        2     100.0 %

          Line data    Source code
       1             : //    This file is part of ELPA.
       2             : //
       3             : //    The ELPA library was originally created by the ELPA consortium,
       4             : //    consisting of the following organizations:
       5             : //
       6             : //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : //      Informatik,
      10             : //    - Technische Universität München, Lehrstuhl für Informatik mit
       11             : //      Schwerpunkt Wissenschaftliches Rechnen,
      12             : //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
       13             : //    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
       14             : //      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      15             : //      and
      16             : //    - IBM Deutschland GmbH
      17             : //
      18             : //    This particular source code file contains additions, changes and
       19             : //    enhancements authored by Intel Corporation, which is not part of
      20             : //    the ELPA consortium.
      21             : //
      22             : //    More information can be found here:
      23             : //    http://elpa.mpcdf.mpg.de/
      24             : //
      25             : //    ELPA is free software: you can redistribute it and/or modify
      26             : //    it under the terms of the version 3 of the license of the
      27             : //    GNU Lesser General Public License as published by the Free
      28             : //    Software Foundation.
      29             : //
      30             : //    ELPA is distributed in the hope that it will be useful,
      31             : //    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : //    GNU Lesser General Public License for more details.
      34             : //
      35             : //    You should have received a copy of the GNU Lesser General Public License
      36             : //    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : //
      38             : //    ELPA reflects a substantial effort on the part of the original
      39             : //    ELPA consortium, and we ask you to respect the spirit of the
      40             : //    license that we chose: i.e., please contribute any changes you
      41             : //    may have back to the original ELPA library distribution, and keep
      42             : //    any derivatives of ELPA under the same license that we chose for
      43             : //    the original distribution, the GNU Lesser General Public License.
      44             : //
      45             : //
      46             : // --------------------------------------------------------------------------------------------------
      47             : //
      48             : // This file contains the compute intensive kernels for the Householder transformations.
      49             : // It should be compiled with the highest possible optimization level.
      50             : //
      51             : // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
      52             : // On Intel Sandy Bridge use -O3 -mavx
      53             : //
      54             : // Copyright of the original code rests with the authors inside the ELPA
      55             : // consortium. The copyright of any additional modifications shall rest
      56             : // with their original authors, but shall adhere to the licensing terms
      57             : // distributed along with the original code in the file "COPYING".
      58             : //
      59             : // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
      60             : // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
      61             : // --------------------------------------------------------------------------------------------------
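// As a sketch only (the real flags come from the ELPA build system, and this
// template additionally needs config-f90.h and one of the precision macros
// defined): on AVX2/FMA hardware a compile line along the lines of
//
//     gcc -O3 -mavx2 -mfma -DDOUBLE_PRECISION_REAL -c real_avx-avx2_2hv_template.c
//
// enables the FMA code paths selected by the preprocessor logic below, while
// plain AVX hardware would use -O3 -mavx as noted above.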
      62             : 
      63             : #include "config-f90.h"
      64             : 
      65             : #include <x86intrin.h>
      66             : #include <stdio.h>
      67             : #include <stdlib.h>
      68             : 
      69             : #define __forceinline __attribute__((always_inline)) static
      70             : 
      71             : #ifdef DOUBLE_PRECISION_REAL
      72             : 
      73             : #define offset 4
      74             : #define __AVX_DATATYPE __m256d
      75             : #define _AVX_LOAD _mm256_load_pd
      76             : #define _AVX_STORE _mm256_store_pd
      77             : #define _AVX_ADD _mm256_add_pd
      78             : #define _AVX_MUL _mm256_mul_pd
      79             : #define _AVX_BROADCAST _mm256_broadcast_sd
      80             : #define _AVX_XOR _mm256_xor_pd
      81             : 
      82             : #ifdef HAVE_AVX2
      83             : #ifdef __FMA4__
      84             : #define __ELPA_USE_FMA__
      85             : #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
      86             : #endif
      87             : 
      88             : #ifdef __AVX2__
      89             : #define __ELPA_USE_FMA__
      90             : #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
      91             : #endif
      92             : #endif
      93             : 
      94             : #define _AVX_FMA _mm256_FMA_pd
      95             : 
      96             : #endif /* DOUBLE_PRECISION_REAL */
      97             : 
      98             : #ifdef SINGLE_PRECISION_REAL
      99             : 
     100             : #define offset 8
     101             : #define __AVX_DATATYPE __m256
     102             : #define _AVX_LOAD _mm256_load_ps
     103             : #define _AVX_STORE _mm256_store_ps
     104             : #define _AVX_ADD _mm256_add_ps
     105             : #define _AVX_MUL _mm256_mul_ps
     106             : #define _AVX_BROADCAST _mm256_broadcast_ss
     107             : #define _AVX_XOR _mm256_xor_ps
     108             : 
     109             : #ifdef HAVE_AVX2
     110             : #ifdef __FMA4__
     111             : #define __ELPA_USE_FMA__
     112             : #define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
     113             : #endif
     114             : 
     115             : #ifdef __AVX2__
     116             : #define __ELPA_USE_FMA__
     117             : #define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
     118             : #endif
     119             : #endif
     120             : 
     121             : #define _AVX_FMA _mm256_FMA_ps
     122             : #endif /* SINGLE_PRECISION_REAL */
     123             : 
     124             : #ifdef DOUBLE_PRECISION_REAL
     125             : //Forward declaration
     126             : __forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
     127             : __forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
     128             : __forceinline void hh_trafo_kernel_12_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
     129             : __forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
     130             : __forceinline void hh_trafo_kernel_20_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
     131             : __forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
     132             : #endif
     133             : 
     134             : #ifdef SINGLE_PRECISION_REAL
     135             : //Forward declaration
     136             : __forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     137             : __forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     138             : __forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     139             : __forceinline void hh_trafo_kernel_32_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     140             : __forceinline void hh_trafo_kernel_40_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     141             : __forceinline void hh_trafo_kernel_48_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     142             : #endif
     143             : 
     144             : #ifdef DOUBLE_PRECISION_REAL
     145             : void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
     146             : #endif
     147             : #ifdef SINGLE_PRECISION_REAL
     148             : void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
     149             : #endif
     150             : 
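// Illustrative use only (the argument values are made up for this sketch; in
// ELPA the drivers above are normally reached from Fortran through the
// interfaces that follow):
//
//     int nb = 32, nq = 64, ldq = 64, ldh = 32;
//     double_hh_trafo_real_avx_avx2_2hv_double(q, hh, &nb, &nq, &ldq, &ldh);
//
// q and hh must point to suitably sized, 32-byte aligned arrays, since the
// kernels use aligned AVX loads and stores.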
     151             : #ifdef DOUBLE_PRECISION_REAL
     152             : /*
     153             : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
     154             : !f> interface
     155             : !f>   subroutine double_hh_trafo_real_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
     156             : !f>                             bind(C, name="double_hh_trafo_real_avx_avx2_2hv_double")
     157             : !f>     use, intrinsic :: iso_c_binding
     158             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     159             : !f>     type(c_ptr), value      :: q
     160             : !f>     real(kind=c_double)     :: hh(pnb,6)
     161             : !f>   end subroutine
     162             : !f> end interface
     163             : !f>#endif
     164             : */
     165             : #endif
     166             : #ifdef SINGLE_PRECISION_REAL
     167             : /*
     168             : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
     169             : !f> interface
     170             : !f>   subroutine double_hh_trafo_real_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
     171             : !f>                             bind(C, name="double_hh_trafo_real_avx_avx2_2hv_single")
     172             : !f>     use, intrinsic :: iso_c_binding
     173             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     174             : !f>     type(c_ptr), value      :: q
     175             : !f>     real(kind=c_float)      :: hh(pnb,6)
     176             : !f>   end subroutine
     177             : !f> end interface
     178             : !f>#endif
     179             : */
     180             : #endif
     181             : 
     182             : #ifdef DOUBLE_PRECISION_REAL
     183    19579648 : void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     184             : #endif
     185             : #ifdef SINGLE_PRECISION_REAL
     186     3595968 : void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     187             : #endif
     188             : {
     189             :         int i;
     190    23175616 :         int nb = *pnb;
     191    23175616 :         int nq = *pldq;
     192    23175616 :         int ldq = *pldq;
     193    23175616 :         int ldh = *pldh;
     194             :         int worked_on;
     195             : 
     196    23175616 :         worked_on = 0;
     197             : 
      198             :         // calculate the scalar product needed to apply
      199             :         // 2 Householder vectors simultaneously
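        // (s is the dot product of the two Householder vectors stored in hh,
        //  offset by one row; it is the coupling term used later to fold both
        //  reflections into a single rank-2 update of q.)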
     200             : #ifdef DOUBLE_PRECISION_REAL
     201    19579648 :         double s = hh[(ldh)+1]*1.0;
     202             : #endif
     203             : #ifdef SINGLE_PRECISION_REAL
     204     3595968 :         float s = hh[(ldh)+1]*1.0f;
     205             : #endif
     206             :         #pragma ivdep
     207  1448221248 :         for (i = 2; i < nb; i++)
     208             :         {
     209  1425045632 :                 s += hh[i-1] * hh[(i+ldh)];
     210             :         }
     211             : 
     212             :         // Production level kernel calls with padding
     213             : #ifdef DOUBLE_PRECISION_REAL
     214    56731136 :         for (i = 0; i < nq-20; i+=24)
     215             :         {
     216    37151488 :                 hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     217    37151488 :                 worked_on += 24;
     218             :         }
     219             : #endif
     220             : #ifdef SINGLE_PRECISION_REAL
     221     7191936 :         for (i = 0; i < nq-40; i+=48)
     222             :         {
     223     3595968 :                 hh_trafo_kernel_48_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     224     3595968 :                 worked_on += 48;
     225             :         }
     226             : #endif
     227             : 
     228    23175616 :         if (nq == i)
     229             :         {
     230    17571840 :                 return;
     231             :         }
     232             : #ifdef DOUBLE_PRECISION_REAL
     233     2007808 :         if (nq-i == 20)
     234             :         {
     235           0 :                 hh_trafo_kernel_20_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     236           0 :                 worked_on += 20;
     237             :         }
     238             : #endif
     239             : 
     240             : #ifdef SINGLE_PRECISION_REAL
     241     3595968 :         if (nq-i == 40)
     242             :         {
     243     3194880 :                 hh_trafo_kernel_40_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     244     3194880 :                 worked_on += 40;
     245             :         }
     246             : #endif
     247             : 
     248             : 
     249             : #ifdef DOUBLE_PRECISION_REAL
     250     2007808 :         if (nq-i == 16)
     251             :         {
     252     2007808 :                 hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     253     2007808 :                 worked_on += 16;
     254             :         }
     255             : #endif
     256             : 
     257             : #ifdef SINGLE_PRECISION_REAL
     258     3595968 :         if (nq-i == 32)
     259             :         {
     260      401088 :                 hh_trafo_kernel_32_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     261      401088 :                 worked_on += 32;
     262             :         }
     263             : #endif
     264             : 
     265             : 
     266             : #ifdef DOUBLE_PRECISION_REAL
     267     2007808 :         if (nq-i == 12)
     268             :         {
     269           0 :                 hh_trafo_kernel_12_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     270           0 :                 worked_on += 12;
     271             :         }
     272             : #endif
     273             : 
     274             : #ifdef SINGLE_PRECISION_REAL
     275     3595968 :         if (nq-i == 24)
     276             :         {
     277           0 :                 hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     278           0 :                 worked_on += 24;
     279             :         }
     280             : #endif
     281             : 
     282             : #ifdef DOUBLE_PRECISION_REAL
     283     2007808 :         if (nq-i == 8)
     284             :         {
     285           0 :                 hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     286           0 :                 worked_on += 8;
     287             :         }
     288             : #endif
     289             : 
     290             : #ifdef SINGLE_PRECISION_REAL
     291     3595968 :         if (nq-i == 16)
     292             :         {
     293           0 :                 hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     294           0 :                 worked_on += 16;
     295             :         }
     296             : #endif
     297             : 
     298             : #ifdef DOUBLE_PRECISION_REAL
     299     2007808 :         if (nq-i == 4)
     300             :         {
     301           0 :                 hh_trafo_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     302           0 :                 worked_on += 4;
     303             :         }
     304             : #endif
     305             : 
     306             : #ifdef SINGLE_PRECISION_REAL
     307     3595968 :         if (nq-i == 8)
     308             :         {
     309           0 :                 hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     310           0 :                 worked_on += 8;
     311             :         }
     312             : #endif
     313             : #ifdef WITH_DEBUG
     314             :         if (worked_on != nq)
     315             :         {
     316             :            printf("Error in real avx/avx2 BLOCK 2 kernel \n");
     317             :            abort();
     318             :         }
     319             : #endif
     320             : }
     321             : 
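/*
 * A minimal scalar sketch (added for illustration, not part of ELPA) of what
 * each SIMD kernel below does for one row of q, assuming hh holds the two
 * Householder vectors in its first two columns with the leading 1 implied;
 * the AVX kernels perform the same arithmetic on 4 (double) or 8 (single)
 * rows per register, unrolled over several registers.
 */
static void hh_trafo_sketch_one_row_double(double* q, const double* hh,
                                           int nb, int ldq, int ldh, double s)
{
        double tau1 = hh[0];
        double tau2 = hh[ldh];
        /* dot products of the q column with both Householder vectors */
        double x = q[ldq];                      /* v1 has an implicit 1 in row 1 */
        double y = q[0] + q[ldq] * hh[ldh+1];   /* v2 has an implicit 1 in row 0 */
        int i;
        for (i = 2; i < nb; i++)
        {
                x += q[i*ldq] * hh[i-1];
                y += q[i*ldq] * hh[ldh+i];
        }
        x += q[nb*ldq] * hh[nb-1];

        /* combine both reflections; s couples the two Householder vectors */
        x = -tau1 * x;
        y = -tau2 * y - tau2 * s * x;

        /* rank-2 update of the q column */
        q[0]      += y;
        q[ldq]    += x + y * hh[ldh+1];
        for (i = 2; i < nb; i++)
        {
                q[i*ldq] += x * hh[i-1] + y * hh[ldh+i];
        }
        q[nb*ldq] += x * hh[nb-1];
}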
     322             : /**
     323             :  * Unrolled kernel that computes
     324             : #ifdef DOUBLE_PRECISION_REAL
     325             :  * 24 rows of Q simultaneously, a
     326             : #endif
     327             : #ifdef SINGLE_PRECISION_REAL
     328             :  * 48 rows of Q simultaneously, a
     329             : #endif
      330             :  * matrix-vector product with two Householder
      331             :  * vectors plus a rank-2 update is performed
     332             :  */
     333             : #ifdef DOUBLE_PRECISION_REAL
     334             :  __forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
     335             : #endif
     336             : #ifdef SINGLE_PRECISION_REAL
     337             :  __forceinline void hh_trafo_kernel_48_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
     338             : #endif
     339             : {
     340             :         /////////////////////////////////////////////////////
     341             :         // Matrix Vector Multiplication, Q [24 x nb+1] * hh
     342             :         // hh contains two householder vectors, with offset 1
     343             :         /////////////////////////////////////////////////////
     344             :         int i;
     345             :         // Needed bit mask for floating point sign flip
     346             : #ifdef DOUBLE_PRECISION_REAL
     347    37151488 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
     348             : #endif
     349             : #ifdef SINGLE_PRECISION_REAL
     350     3595968 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
     351             : #endif
     352             : 
     353    81494912 :         __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
     354    81494912 :         __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
     355    81494912 :         __AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
     356    81494912 :         __AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
     357    81494912 :         __AVX_DATATYPE x5 = _AVX_LOAD(&q[ldq+4*offset]);
     358    81494912 :         __AVX_DATATYPE x6 = _AVX_LOAD(&q[ldq+5*offset]);
     359             : 
     360    81494912 :         __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
     361             :         __AVX_DATATYPE h2;
     362             : 
     363             : #ifdef __ELPA_USE_FMA__
     364    40747456 :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
     365    40747456 :         __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
     366    81494912 :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
     367    40747456 :         __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
     368    81494912 :         __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
     369    40747456 :         __AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
     370    81494912 :         __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
     371    40747456 :         __AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
     372    81494912 :         __AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
     373    40747456 :         __AVX_DATATYPE y5 = _AVX_FMA(x5, h1, q5);
     374    81494912 :         __AVX_DATATYPE q6 = _AVX_LOAD(&q[5*offset]);
     375    40747456 :         __AVX_DATATYPE y6 = _AVX_FMA(x6, h1, q6);
     376             : 
     377             : #else
     378             :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
     379             :         __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
     380             :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
     381             :         __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
     382             :         __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
     383             :         __AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
     384             :         __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
     385             :         __AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
     386             :         __AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
     387             :         __AVX_DATATYPE y5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
     388             :         __AVX_DATATYPE q6 = _AVX_LOAD(&q[5*offset]);
     389             :         __AVX_DATATYPE y6 = _AVX_ADD(q6, _AVX_MUL(x6, h1));
     390             : 
     391             : #endif
     392             : 
     393  2555247168 :         for(i = 2; i < nb; i++)
     394             :         {
     395  5028999424 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
     396  5028999424 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
     397             : #ifdef __ELPA_USE_FMA__
     398  5028999424 :                 q1 = _AVX_LOAD(&q[i*ldq]);
     399  2514499712 :                 x1 = _AVX_FMA(q1, h1, x1);
     400  2514499712 :                 y1 = _AVX_FMA(q1, h2, y1);
     401  5028999424 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     402  2514499712 :                 x2 = _AVX_FMA(q2, h1, x2);
     403  2514499712 :                 y2 = _AVX_FMA(q2, h2, y2);
     404  5028999424 :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
     405  2514499712 :                 x3 = _AVX_FMA(q3, h1, x3);
     406  2514499712 :                 y3 = _AVX_FMA(q3, h2, y3);
     407  5028999424 :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
     408  2514499712 :                 x4 = _AVX_FMA(q4, h1, x4);
     409  2514499712 :                 y4 = _AVX_FMA(q4, h2, y4);
     410  5028999424 :                 q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
     411  2514499712 :                 x5 = _AVX_FMA(q5, h1, x5);
     412  2514499712 :                 y5 = _AVX_FMA(q5, h2, y5);
     413  5028999424 :                 q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
     414  2514499712 :                 x6 = _AVX_FMA(q6, h1, x6);
     415  2514499712 :                 y6 = _AVX_FMA(q6, h2, y6);
     416             : 
     417             : #else
     418             :                 q1 = _AVX_LOAD(&q[i*ldq]);
     419             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     420             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
     421             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     422             :                 x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     423             :                 y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
     424             :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
     425             :                 x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
     426             :                 y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
     427             :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
     428             :                 x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
     429             :                 y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
     430             :                 q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
     431             :                 x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
     432             :                 y5 = _AVX_ADD(y5, _AVX_MUL(q5,h2));
     433             :                 q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
     434             :                 x6 = _AVX_ADD(x6, _AVX_MUL(q6,h1));
     435             :                 y6 = _AVX_ADD(y6, _AVX_MUL(q6,h2));
     436             : 
     437             : #endif
     438             :         }
     439             : 
     440    81494912 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
     441             : #ifdef __ELPA_USE_FMA__
     442    81494912 :         q1 = _AVX_LOAD(&q[nb*ldq]);
     443    40747456 :         x1 = _AVX_FMA(q1, h1, x1);
     444    81494912 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     445    40747456 :         x2 = _AVX_FMA(q2, h1, x2);
     446    81494912 :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
     447    40747456 :         x3 = _AVX_FMA(q3, h1, x3);
     448    81494912 :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
     449    40747456 :         x4 = _AVX_FMA(q4, h1, x4);
     450    81494912 :         q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
     451    40747456 :         x5 = _AVX_FMA(q5, h1, x5);
     452    81494912 :         q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
     453    40747456 :         x6 = _AVX_FMA(q6, h1, x6);
     454             : 
     455             : #else
     456             :         q1 = _AVX_LOAD(&q[nb*ldq]);
     457             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     458             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     459             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     460             :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
     461             :         x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
     462             :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
     463             :         x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
     464             :         q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
     465             :         x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
     466             :         q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
     467             :         x6 = _AVX_ADD(x6, _AVX_MUL(q6,h1));
     468             : 
     469             : #endif
     470             : 
     471             :         /////////////////////////////////////////////////////
     472             :         // Rank-2 update of Q [24 x nb+1]
     473             :         /////////////////////////////////////////////////////
     474             : 
     475    40747456 :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
     476    81494912 :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
     477    40747456 :         __AVX_DATATYPE vs = _AVX_BROADCAST(&s);
     478             : 
     479    40747456 :         h1 = _AVX_XOR(tau1, sign);
     480    40747456 :         x1 = _AVX_MUL(x1, h1);
     481    40747456 :         x2 = _AVX_MUL(x2, h1);
     482    40747456 :         x3 = _AVX_MUL(x3, h1);
     483    40747456 :         x4 = _AVX_MUL(x4, h1);
     484    40747456 :         x5 = _AVX_MUL(x5, h1);
     485    40747456 :         x6 = _AVX_MUL(x6, h1);
     486             : 
     487    40747456 :         h1 = _AVX_XOR(tau2, sign);
     488    40747456 :         h2 = _AVX_MUL(h1, vs);
     489             : #ifdef __ELPA_USE_FMA__
     490    81494912 :         y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
     491    81494912 :         y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
     492    81494912 :         y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
     493    81494912 :         y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
     494    81494912 :         y5 = _AVX_FMA(y5, h1, _AVX_MUL(x5,h2));
     495    81494912 :         y6 = _AVX_FMA(y6, h1, _AVX_MUL(x6,h2));
     496             : 
     497             : #else
     498             :         y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
     499             :         y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
     500             :         y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
     501             :         y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
     502             :         y5 = _AVX_ADD(_AVX_MUL(y5,h1), _AVX_MUL(x5,h2));
     503             :         y6 = _AVX_ADD(_AVX_MUL(y6,h1), _AVX_MUL(x6,h2));
     504             : 
     505             : #endif
     506             : 
     507    40747456 :         q1 = _AVX_LOAD(q);
     508    40747456 :         q1 = _AVX_ADD(q1, y1);
     509             :         _AVX_STORE(q,q1);
     510    81494912 :         q2 = _AVX_LOAD(&q[offset]);
     511    40747456 :         q2 = _AVX_ADD(q2, y2);
     512    40747456 :         _AVX_STORE(&q[offset],q2);
     513    81494912 :         q3 = _AVX_LOAD(&q[2*offset]);
     514    40747456 :         q3 = _AVX_ADD(q3, y3);
     515    40747456 :         _AVX_STORE(&q[2*offset],q3);
     516    81494912 :         q4 = _AVX_LOAD(&q[3*offset]);
     517    40747456 :         q4 = _AVX_ADD(q4, y4);
     518    40747456 :         _AVX_STORE(&q[3*offset],q4);
     519    81494912 :         q5 = _AVX_LOAD(&q[4*offset]);
     520    40747456 :         q5 = _AVX_ADD(q5, y5);
     521    40747456 :         _AVX_STORE(&q[4*offset],q5);
     522    81494912 :         q6 = _AVX_LOAD(&q[5*offset]);
     523    40747456 :         q6 = _AVX_ADD(q6, y6);
     524    40747456 :         _AVX_STORE(&q[5*offset],q6);
     525             : 
     526    81494912 :         h2 = _AVX_BROADCAST(&hh[ldh+1]);
     527             : #ifdef __ELPA_USE_FMA__
     528    81494912 :         q1 = _AVX_LOAD(&q[ldq]);
     529    81494912 :         q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
     530    40747456 :         _AVX_STORE(&q[ldq],q1);
     531    81494912 :         q2 = _AVX_LOAD(&q[ldq+offset]);
     532    81494912 :         q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
     533    40747456 :         _AVX_STORE(&q[ldq+offset],q2);
     534    81494912 :         q3 = _AVX_LOAD(&q[ldq+2*offset]);
     535    81494912 :         q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
     536    40747456 :         _AVX_STORE(&q[ldq+2*offset],q3);
     537    81494912 :         q4 = _AVX_LOAD(&q[ldq+3*offset]);
     538    81494912 :         q4 = _AVX_ADD(q4, _AVX_FMA(y4, h2, x4));
     539    40747456 :         _AVX_STORE(&q[ldq+3*offset],q4);
     540    81494912 :         q5 = _AVX_LOAD(&q[ldq+4*offset]);
     541    81494912 :         q5 = _AVX_ADD(q5, _AVX_FMA(y5, h2, x5));
     542    40747456 :         _AVX_STORE(&q[ldq+4*offset],q5);
     543    81494912 :         q6 = _AVX_LOAD(&q[ldq+5*offset]);
     544    81494912 :         q6 = _AVX_ADD(q6, _AVX_FMA(y6, h2, x6));
     545    40747456 :         _AVX_STORE(&q[ldq+5*offset],q6);
     546             : 
     547             : #else
     548             :         q1 = _AVX_LOAD(&q[ldq]);
     549             :         q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
     550             :         _AVX_STORE(&q[ldq],q1);
     551             :         q2 = _AVX_LOAD(&q[ldq+offset]);
     552             :         q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
     553             :         _AVX_STORE(&q[ldq+offset],q2);
     554             :         q3 = _AVX_LOAD(&q[ldq+2*offset]);
     555             :         q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
     556             :         _AVX_STORE(&q[ldq+2*offset],q3);
     557             :         q4 = _AVX_LOAD(&q[ldq+3*offset]);
     558             :         q4 = _AVX_ADD(q4, _AVX_ADD(x4, _AVX_MUL(y4, h2)));
     559             :         _AVX_STORE(&q[ldq+3*offset],q4);
     560             :         q5 = _AVX_LOAD(&q[ldq+4*offset]);
     561             :         q5 = _AVX_ADD(q5, _AVX_ADD(x5, _AVX_MUL(y5, h2)));
     562             :         _AVX_STORE(&q[ldq+4*offset],q5);
     563             :         q6 = _AVX_LOAD(&q[ldq+5*offset]);
     564             :         q6 = _AVX_ADD(q6, _AVX_ADD(x6, _AVX_MUL(y6, h2)));
     565             :         _AVX_STORE(&q[ldq+5*offset],q6);
     566             : 
     567             : #endif
     568             : 
     569  2555247168 :         for (i = 2; i < nb; i++)
     570             :         {
     571  5028999424 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
     572  5028999424 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
     573             : #ifdef __ELPA_USE_FMA__
     574  5028999424 :                 q1 = _AVX_LOAD(&q[i*ldq]);
     575  2514499712 :                 q1 = _AVX_FMA(x1, h1, q1);
     576  2514499712 :                 q1 = _AVX_FMA(y1, h2, q1);
     577  2514499712 :                 _AVX_STORE(&q[i*ldq],q1);
     578  5028999424 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     579  2514499712 :                 q2 = _AVX_FMA(x2, h1, q2);
     580  2514499712 :                 q2 = _AVX_FMA(y2, h2, q2);
     581  2514499712 :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
     582  5028999424 :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
     583  2514499712 :                 q3 = _AVX_FMA(x3, h1, q3);
     584  2514499712 :                 q3 = _AVX_FMA(y3, h2, q3);
     585  2514499712 :                 _AVX_STORE(&q[(i*ldq)+2*offset],q3);
     586  5028999424 :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
     587  2514499712 :                 q4 = _AVX_FMA(x4, h1, q4);
     588  2514499712 :                 q4 = _AVX_FMA(y4, h2, q4);
     589  2514499712 :                 _AVX_STORE(&q[(i*ldq)+3*offset],q4);
     590  5028999424 :                 q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
     591  2514499712 :                 q5 = _AVX_FMA(x5, h1, q5);
     592  2514499712 :                 q5 = _AVX_FMA(y5, h2, q5);
     593  2514499712 :                 _AVX_STORE(&q[(i*ldq)+4*offset],q5);
     594  5028999424 :                 q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
     595  2514499712 :                 q6 = _AVX_FMA(x6, h1, q6);
     596  2514499712 :                 q6 = _AVX_FMA(y6, h2, q6);
     597  2514499712 :                 _AVX_STORE(&q[(i*ldq)+5*offset],q6);
     598             : 
     599             : #else
     600             :                 q1 = _AVX_LOAD(&q[i*ldq]);
     601             :                 q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
     602             :                 _AVX_STORE(&q[i*ldq],q1);
     603             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     604             :                 q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
     605             :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
     606             :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
     607             :                 q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
     608             :                 _AVX_STORE(&q[(i*ldq)+2*offset],q3);
     609             :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
     610             :                 q4 = _AVX_ADD(q4, _AVX_ADD(_AVX_MUL(x4,h1), _AVX_MUL(y4, h2)));
     611             :                 _AVX_STORE(&q[(i*ldq)+3*offset],q4);
     612             :                 q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
     613             :                 q5 = _AVX_ADD(q5, _AVX_ADD(_AVX_MUL(x5,h1), _AVX_MUL(y5, h2)));
     614             :                 _AVX_STORE(&q[(i*ldq)+4*offset],q5);
     615             :                 q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
     616             :                 q6 = _AVX_ADD(q6, _AVX_ADD(_AVX_MUL(x6,h1), _AVX_MUL(y6, h2)));
     617             :                 _AVX_STORE(&q[(i*ldq)+5*offset],q6);
     618             : 
     619             : #endif
     620             :         }
     621             : 
     622    81494912 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
     623             : #ifdef __ELPA_USE_FMA__
     624    81494912 :         q1 = _AVX_LOAD(&q[nb*ldq]);
     625    40747456 :         q1 = _AVX_FMA(x1, h1, q1);
     626    40747456 :         _AVX_STORE(&q[nb*ldq],q1);
     627    81494912 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     628    40747456 :         q2 = _AVX_FMA(x2, h1, q2);
     629    40747456 :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
     630    81494912 :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
     631    40747456 :         q3 = _AVX_FMA(x3, h1, q3);
     632    40747456 :         _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
     633    81494912 :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
     634    40747456 :         q4 = _AVX_FMA(x4, h1, q4);
     635    40747456 :         _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
     636    81494912 :         q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
     637    40747456 :         q5 = _AVX_FMA(x5, h1, q5);
     638    40747456 :         _AVX_STORE(&q[(nb*ldq)+4*offset],q5);
     639    81494912 :         q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
     640    40747456 :         q6 = _AVX_FMA(x6, h1, q6);
     641    40747456 :         _AVX_STORE(&q[(nb*ldq)+5*offset],q6);
     642             : 
     643             : #else
     644             :         q1 = _AVX_LOAD(&q[nb*ldq]);
     645             :         q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
     646             :         _AVX_STORE(&q[nb*ldq],q1);
     647             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     648             :         q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
     649             :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
     650             :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
     651             :         q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
     652             :         _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
     653             :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
     654             :         q4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
     655             :         _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
     656             :         q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
     657             :         q5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
     658             :         _AVX_STORE(&q[(nb*ldq)+4*offset],q5);
     659             :         q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
     660             :         q6 = _AVX_ADD(q6, _AVX_MUL(x6, h1));
     661             :         _AVX_STORE(&q[(nb*ldq)+5*offset],q6);
     662             : 
     663             : #endif
     664             : }
     665             : 
     666             : 
     667             : /**
     668             :  * Unrolled kernel that computes
     669             : #ifdef DOUBLE_PRECISION_REAL
     670             :  * 20 rows of Q simultaneously, a
     671             : #endif
     672             : #ifdef SINGLE_PRECISION_REAL
     673             :  * 40 rows of Q simultaneously, a
     674             : #endif
      675             :  * matrix-vector product with two Householder
      676             :  * vectors plus a rank-2 update is performed
     677             :  */
     678             : #ifdef DOUBLE_PRECISION_REAL
     679             :  __forceinline void hh_trafo_kernel_20_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
     680             : #endif
     681             : #ifdef SINGLE_PRECISION_REAL
     682             :  __forceinline void hh_trafo_kernel_40_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
     683             : #endif
     684             : {
     685             :         /////////////////////////////////////////////////////
      686             :         // Matrix Vector Multiplication, Q [20 x nb+1] * hh
     687             :         // hh contains two householder vectors, with offset 1
     688             :         /////////////////////////////////////////////////////
     689             :         int i;
     690             :         // Needed bit mask for floating point sign flip
     691             : #ifdef DOUBLE_PRECISION_REAL
     692           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
     693             : #endif
     694             : #ifdef SINGLE_PRECISION_REAL
     695     3194880 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
     696             : #endif
     697             : 
     698     6389760 :         __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
     699     6389760 :         __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
     700     6389760 :         __AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
     701     6389760 :         __AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
     702     6389760 :         __AVX_DATATYPE x5 = _AVX_LOAD(&q[ldq+4*offset]);
     703             : 
     704     6389760 :         __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
     705             :         __AVX_DATATYPE h2;
     706             : 
     707             : #ifdef __ELPA_USE_FMA__
     708     3194880 :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
     709     3194880 :         __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
     710     6389760 :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
     711     3194880 :         __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
     712     6389760 :         __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
     713     3194880 :         __AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
     714     6389760 :         __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
     715     3194880 :         __AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
     716     6389760 :         __AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
     717     3194880 :         __AVX_DATATYPE y5 = _AVX_FMA(x5, h1, q5);
     718             : 
     719             : #else
     720             :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
     721             :         __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
     722             :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
     723             :         __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
     724             :         __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
     725             :         __AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
     726             :         __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
     727             :         __AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
     728             :         __AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
     729             :         __AVX_DATATYPE y5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
     730             : 
     731             : #endif
     732             : 
     733   201277440 :         for(i = 2; i < nb; i++)
     734             :         {
     735   396165120 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
     736   396165120 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
     737             : #ifdef __ELPA_USE_FMA__
     738   396165120 :                 q1 = _AVX_LOAD(&q[i*ldq]);
     739   198082560 :                 x1 = _AVX_FMA(q1, h1, x1);
     740   198082560 :                 y1 = _AVX_FMA(q1, h2, y1);
     741   396165120 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     742   198082560 :                 x2 = _AVX_FMA(q2, h1, x2);
     743   198082560 :                 y2 = _AVX_FMA(q2, h2, y2);
     744   396165120 :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
     745   198082560 :                 x3 = _AVX_FMA(q3, h1, x3);
     746   198082560 :                 y3 = _AVX_FMA(q3, h2, y3);
     747   396165120 :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
     748   198082560 :                 x4 = _AVX_FMA(q4, h1, x4);
     749   198082560 :                 y4 = _AVX_FMA(q4, h2, y4);
     750   396165120 :                 q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
     751   198082560 :                 x5 = _AVX_FMA(q5, h1, x5);
     752   198082560 :                 y5 = _AVX_FMA(q5, h2, y5);
     753             : 
     754             : #else
     755             :                 q1 = _AVX_LOAD(&q[i*ldq]);
     756             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     757             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
     758             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     759             :                 x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     760             :                 y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
     761             :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
     762             :                 x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
     763             :                 y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
     764             :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
     765             :                 x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
     766             :                 y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
     767             :                 q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
     768             :                 x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
     769             :                 y5 = _AVX_ADD(y5, _AVX_MUL(q5,h2));
     770             : 
     771             : #endif
     772             :         }
     773             : 
     774     6389760 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
     775             : #ifdef __ELPA_USE_FMA__
     776     6389760 :         q1 = _AVX_LOAD(&q[nb*ldq]);
     777     3194880 :         x1 = _AVX_FMA(q1, h1, x1);
     778     6389760 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     779     3194880 :         x2 = _AVX_FMA(q2, h1, x2);
     780     6389760 :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
     781     3194880 :         x3 = _AVX_FMA(q3, h1, x3);
     782     6389760 :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
     783     3194880 :         x4 = _AVX_FMA(q4, h1, x4);
     784     6389760 :         q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
     785     3194880 :         x5 = _AVX_FMA(q5, h1, x5);
     786             : 
     787             : #else
     788             :         q1 = _AVX_LOAD(&q[nb*ldq]);
     789             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     790             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     791             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     792             :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
     793             :         x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
     794             :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
     795             :         x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
     796             :         q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
     797             :         x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
     798             : 
     799             : #endif
     800             : 
     801             :         /////////////////////////////////////////////////////
      802             :         // Rank-2 update of Q [20 x nb+1]
     803             :         /////////////////////////////////////////////////////
     804             : 
     805     3194880 :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
     806     6389760 :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
     807     3194880 :         __AVX_DATATYPE vs = _AVX_BROADCAST(&s);
     808             : 
     809     3194880 :         h1 = _AVX_XOR(tau1, sign);
     810     3194880 :         x1 = _AVX_MUL(x1, h1);
     811     3194880 :         x2 = _AVX_MUL(x2, h1);
     812     3194880 :         x3 = _AVX_MUL(x3, h1);
     813     3194880 :         x4 = _AVX_MUL(x4, h1);
     814     3194880 :         x5 = _AVX_MUL(x5, h1);
     815             : 
     816     3194880 :         h1 = _AVX_XOR(tau2, sign);
     817     3194880 :         h2 = _AVX_MUL(h1, vs);
     818             : #ifdef __ELPA_USE_FMA__
     819     6389760 :         y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
     820     6389760 :         y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
     821     6389760 :         y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
     822     6389760 :         y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
     823     6389760 :         y5 = _AVX_FMA(y5, h1, _AVX_MUL(x5,h2));
     824             : 
     825             : #else
     826             :         y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
     827             :         y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
     828             :         y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
     829             :         y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
     830             :         y5 = _AVX_ADD(_AVX_MUL(y5,h1), _AVX_MUL(x5,h2));
     831             : 
     832             : #endif
     833             : 
     834     3194880 :         q1 = _AVX_LOAD(q);
     835     3194880 :         q1 = _AVX_ADD(q1, y1);
     836             :         _AVX_STORE(q,q1);
     837     6389760 :         q2 = _AVX_LOAD(&q[offset]);
     838     3194880 :         q2 = _AVX_ADD(q2, y2);
     839     3194880 :         _AVX_STORE(&q[offset],q2);
     840     6389760 :         q3 = _AVX_LOAD(&q[2*offset]);
     841     3194880 :         q3 = _AVX_ADD(q3, y3);
     842     3194880 :         _AVX_STORE(&q[2*offset],q3);
     843     6389760 :         q4 = _AVX_LOAD(&q[3*offset]);
     844     3194880 :         q4 = _AVX_ADD(q4, y4);
     845     3194880 :         _AVX_STORE(&q[3*offset],q4);
     846     6389760 :         q5 = _AVX_LOAD(&q[4*offset]);
     847     3194880 :         q5 = _AVX_ADD(q5, y5);
     848     3194880 :         _AVX_STORE(&q[4*offset],q5);
     849             : 
     850     6389760 :         h2 = _AVX_BROADCAST(&hh[ldh+1]);
     851             : #ifdef __ELPA_USE_FMA__
     852     6389760 :         q1 = _AVX_LOAD(&q[ldq]);
     853     6389760 :         q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
     854     3194880 :         _AVX_STORE(&q[ldq],q1);
     855     6389760 :         q2 = _AVX_LOAD(&q[ldq+offset]);
     856     6389760 :         q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
     857     3194880 :         _AVX_STORE(&q[ldq+offset],q2);
     858     6389760 :         q3 = _AVX_LOAD(&q[ldq+2*offset]);
     859     6389760 :         q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
     860     3194880 :         _AVX_STORE(&q[ldq+2*offset],q3);
     861     6389760 :         q4 = _AVX_LOAD(&q[ldq+3*offset]);
     862     6389760 :         q4 = _AVX_ADD(q4, _AVX_FMA(y4, h2, x4));
     863     3194880 :         _AVX_STORE(&q[ldq+3*offset],q4);
     864     6389760 :         q5 = _AVX_LOAD(&q[ldq+4*offset]);
     865     6389760 :         q5 = _AVX_ADD(q5, _AVX_FMA(y5, h2, x5));
     866     3194880 :         _AVX_STORE(&q[ldq+4*offset],q5);
     867             : 
     868             : #else
     869             :         q1 = _AVX_LOAD(&q[ldq]);
     870             :         q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
     871             :         _AVX_STORE(&q[ldq],q1);
     872             :         q2 = _AVX_LOAD(&q[ldq+offset]);
     873             :         q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
     874             :         _AVX_STORE(&q[ldq+offset],q2);
     875             :         q3 = _AVX_LOAD(&q[ldq+2*offset]);
     876             :         q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
     877             :         _AVX_STORE(&q[ldq+2*offset],q3);
     878             :         q4 = _AVX_LOAD(&q[ldq+3*offset]);
     879             :         q4 = _AVX_ADD(q4, _AVX_ADD(x4, _AVX_MUL(y4, h2)));
     880             :         _AVX_STORE(&q[ldq+3*offset],q4);
     881             :         q5 = _AVX_LOAD(&q[ldq+4*offset]);
     882             :         q5 = _AVX_ADD(q5, _AVX_ADD(x5, _AVX_MUL(y5, h2)));
     883             :         _AVX_STORE(&q[ldq+4*offset],q5);
     884             : 
     885             : #endif
     886             : 
     887   201277440 :         for (i = 2; i < nb; i++)
     888             :         {
     889   396165120 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
     890   396165120 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
     891             : #ifdef __ELPA_USE_FMA__
     892   396165120 :                 q1 = _AVX_LOAD(&q[i*ldq]);
     893   198082560 :                 q1 = _AVX_FMA(x1, h1, q1);
     894   198082560 :                 q1 = _AVX_FMA(y1, h2, q1);
     895   198082560 :                 _AVX_STORE(&q[i*ldq],q1);
     896   396165120 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     897   198082560 :                 q2 = _AVX_FMA(x2, h1, q2);
     898   198082560 :                 q2 = _AVX_FMA(y2, h2, q2);
     899   198082560 :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
     900   396165120 :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
     901   198082560 :                 q3 = _AVX_FMA(x3, h1, q3);
     902   198082560 :                 q3 = _AVX_FMA(y3, h2, q3);
     903   198082560 :                 _AVX_STORE(&q[(i*ldq)+2*offset],q3);
     904   396165120 :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
     905   198082560 :                 q4 = _AVX_FMA(x4, h1, q4);
     906   198082560 :                 q4 = _AVX_FMA(y4, h2, q4);
     907   198082560 :                 _AVX_STORE(&q[(i*ldq)+3*offset],q4);
     908   396165120 :                 q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
     909   198082560 :                 q5 = _AVX_FMA(x5, h1, q5);
     910   198082560 :                 q5 = _AVX_FMA(y5, h2, q5);
     911   198082560 :                 _AVX_STORE(&q[(i*ldq)+4*offset],q5);
     912             : 
     913             : #else
     914             :                 q1 = _AVX_LOAD(&q[i*ldq]);
     915             :                 q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
     916             :                 _AVX_STORE(&q[i*ldq],q1);
     917             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     918             :                 q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
     919             :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
     920             :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
     921             :                 q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
     922             :                 _AVX_STORE(&q[(i*ldq)+2*offset],q3);
     923             :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
     924             :                 q4 = _AVX_ADD(q4, _AVX_ADD(_AVX_MUL(x4,h1), _AVX_MUL(y4, h2)));
     925             :                 _AVX_STORE(&q[(i*ldq)+3*offset],q4);
     926             :                 q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
     927             :                 q5 = _AVX_ADD(q5, _AVX_ADD(_AVX_MUL(x5,h1), _AVX_MUL(y5, h2)));
     928             :                 _AVX_STORE(&q[(i*ldq)+4*offset],q5);
     929             : 
     930             : #endif
     931             :         }
     932             : 
     933     6389760 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
     934             : #ifdef __ELPA_USE_FMA__
     935     6389760 :         q1 = _AVX_LOAD(&q[nb*ldq]);
     936     3194880 :         q1 = _AVX_FMA(x1, h1, q1);
     937     3194880 :         _AVX_STORE(&q[nb*ldq],q1);
     938     6389760 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     939     3194880 :         q2 = _AVX_FMA(x2, h1, q2);
     940     3194880 :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
     941     6389760 :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
     942     3194880 :         q3 = _AVX_FMA(x3, h1, q3);
     943     3194880 :         _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
     944     6389760 :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
     945     3194880 :         q4 = _AVX_FMA(x4, h1, q4);
     946     3194880 :         _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
     947     6389760 :         q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
     948     3194880 :         q5 = _AVX_FMA(x5, h1, q5);
     949     3194880 :         _AVX_STORE(&q[(nb*ldq)+4*offset],q5);
     950             : 
     951             : #else
     952             :         q1 = _AVX_LOAD(&q[nb*ldq]);
     953             :         q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
     954             :         _AVX_STORE(&q[nb*ldq],q1);
     955             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     956             :         q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
     957             :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
     958             :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
     959             :         q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
     960             :         _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
     961             :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
     962             :         q4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
     963             :         _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
     964             :         q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
     965             :         q5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
     966             :         _AVX_STORE(&q[(nb*ldq)+4*offset],q5);
     967             : 
     968             : #endif
     969             : }
     970             : 
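For reference, the arithmetic that each of the unrolled kernels in this file performs can be written down per column. The following plain-C sketch is illustrative only: the function name and the explicit column index j are not part of this file, while q, hh, nb, ldq, ldh and s have the same meaning as in the intrinsic kernels. It applies the same two phases, first accumulating x = (Q^T v1)_j and y = (Q^T v2)_j and then the rank-2 update, to a single column j of the q block, mirroring the FMA and non-FMA bodies above and below.

/* Minimal scalar sketch of one kernel invocation, restricted to a single
 * column j of the q block (illustrative, not part of this template). */
static void hh_trafo_scalar_column(double* q, double* hh, int nb,
                                   int ldq, int ldh, double s, int j)
{
        int i;
        /* phase 1: x = (Q^T v1)_j and y = (Q^T v2)_j, with the implicit
         * leading 1 of each Householder vector handled explicitly */
        double x = q[ldq + j];
        double y = q[j] + q[ldq + j] * hh[ldh + 1];
        for (i = 2; i < nb; i++)
        {
                x += q[i*ldq + j] * hh[i-1];
                y += q[i*ldq + j] * hh[ldh + i];
        }
        x += q[nb*ldq + j] * hh[nb-1];

        /* phase 2: scale with tau1 = hh[0], tau2 = hh[ldh] and the
         * precomputed dot product s; the XOR with the sign mask in the
         * vector code performs exactly this negation */
        x = -hh[0] * x;
        y = -hh[ldh] * (y + x * s);

        /* phase 3: rank-2 update of column j of Q */
        q[j]       += y;
        q[ldq + j] += x + y * hh[ldh + 1];
        for (i = 2; i < nb; i++)
        {
                q[i*ldq + j] += x * hh[i-1] + y * hh[ldh + i];
        }
        q[nb*ldq + j] += x * hh[nb-1];
}
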
     971             : /**
     972             :  * Unrolled kernel that computes
     973             : #ifdef DOUBLE_PRECISION_REAL
     974             :  * 16 rows of Q simultaneously:
     975             : #endif
     976             : #ifdef SINGLE_PRECISION_REAL
     977             :  * 32 rows of Q simultaneously:
     978             : #endif
     979             :  * a matrix-vector product with two Householder
     980             :  * vectors plus a rank-2 update is performed.
     981             :  */
     982             : #ifdef DOUBLE_PRECISION_REAL
     983             :  __forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
     984             : #endif
     985             : #ifdef SINGLE_PRECISION_REAL
     986             :  __forceinline void hh_trafo_kernel_32_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
     987             : #endif
     988             : {
     989             :         /////////////////////////////////////////////////////
     990             :         // Matrix Vector Multiplication, Q [16 x nb+1] * hh
     991             :         // hh contains two householder vectors, with offset 1
     992             :         /////////////////////////////////////////////////////
     993             :         int i;
     994             :         // Needed bit mask for floating point sign flip
     995             : #ifdef DOUBLE_PRECISION_REAL
     996     2007808 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
     997             : #endif
     998             : #ifdef SINGLE_PRECISION_REAL
     999      401088 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
    1000             : #endif
    1001             : 
    1002     4817792 :         __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
    1003     4817792 :         __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
    1004     4817792 :         __AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
    1005     4817792 :         __AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
    1006             : 
    1007     4817792 :         __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
    1008             :         __AVX_DATATYPE h2;
    1009             : 
    1010             : #ifdef __ELPA_USE_FMA__
    1011     2408896 :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
    1012     2408896 :         __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
    1013     4817792 :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
    1014     2408896 :         __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
    1015     4817792 :         __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
    1016     2408896 :         __AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
    1017     4817792 :         __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
    1018     2408896 :         __AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
    1019             : 
    1020             : #else
    1021             :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
    1022             :         __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
    1023             :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
    1024             :         __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
    1025             :         __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
    1026             :         __AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
    1027             :         __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
    1028             :         __AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
    1029             : 
    1030             : #endif
    1031             : 
    1032   139917888 :         for(i = 2; i < nb; i++)
    1033             :         {
    1034   275017984 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
    1035   275017984 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
    1036             : #ifdef __ELPA_USE_FMA__
    1037   275017984 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1038   137508992 :                 x1 = _AVX_FMA(q1, h1, x1);
    1039   137508992 :                 y1 = _AVX_FMA(q1, h2, y1);
    1040   275017984 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1041   137508992 :                 x2 = _AVX_FMA(q2, h1, x2);
    1042   137508992 :                 y2 = _AVX_FMA(q2, h2, y2);
    1043   275017984 :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    1044   137508992 :                 x3 = _AVX_FMA(q3, h1, x3);
    1045   137508992 :                 y3 = _AVX_FMA(q3, h2, y3);
    1046   275017984 :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    1047   137508992 :                 x4 = _AVX_FMA(q4, h1, x4);
    1048   137508992 :                 y4 = _AVX_FMA(q4, h2, y4);
    1049             : 
    1050             : #else
    1051             :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1052             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1053             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1054             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1055             :                 x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    1056             :                 y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
    1057             :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    1058             :                 x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
    1059             :                 y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
    1060             :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    1061             :                 x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
    1062             :                 y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
    1063             : 
    1064             : #endif
    1065             :         }
    1066             : 
    1067     4817792 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1068             : #ifdef __ELPA_USE_FMA__
    1069     4817792 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1070     2408896 :         x1 = _AVX_FMA(q1, h1, x1);
    1071     4817792 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1072     2408896 :         x2 = _AVX_FMA(q2, h1, x2);
    1073     4817792 :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
    1074     2408896 :         x3 = _AVX_FMA(q3, h1, x3);
    1075     4817792 :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
    1076     2408896 :         x4 = _AVX_FMA(q4, h1, x4);
    1077             : 
    1078             : #else
    1079             :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1080             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1081             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1082             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    1083             :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
    1084             :         x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
    1085             :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
    1086             :         x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
    1087             : 
    1088             : #endif
    1089             : 
    1090             :         /////////////////////////////////////////////////////
    1091             :         // Rank-2 update of Q [16 x nb+1]
    1092             :         /////////////////////////////////////////////////////
    1093             : 
    1094     2408896 :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
    1095     4817792 :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
    1096     2408896 :         __AVX_DATATYPE vs = _AVX_BROADCAST(&s);
    1097             : 
    1098     2408896 :         h1 = _AVX_XOR(tau1, sign);
    1099     2408896 :         x1 = _AVX_MUL(x1, h1);
    1100     2408896 :         x2 = _AVX_MUL(x2, h1);
    1101     2408896 :         x3 = _AVX_MUL(x3, h1);
    1102     2408896 :         x4 = _AVX_MUL(x4, h1);
    1103     2408896 :         h1 = _AVX_XOR(tau2, sign);
    1104     2408896 :         h2 = _AVX_MUL(h1, vs);
    1105             : #ifdef __ELPA_USE_FMA__
    1106     4817792 :         y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
    1107     4817792 :         y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
    1108     4817792 :         y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
    1109     4817792 :         y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
    1110             : 
    1111             : #else
    1112             :         y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
    1113             :         y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
    1114             :         y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
    1115             :         y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
    1116             : 
    1117             : #endif
    1118             : 
    1119     2408896 :         q1 = _AVX_LOAD(q);
    1120     2408896 :         q1 = _AVX_ADD(q1, y1);
    1121             :         _AVX_STORE(q,q1);
    1122     4817792 :         q2 = _AVX_LOAD(&q[offset]);
    1123     2408896 :         q2 = _AVX_ADD(q2, y2);
    1124     2408896 :         _AVX_STORE(&q[offset],q2);
    1125     4817792 :         q3 = _AVX_LOAD(&q[2*offset]);
    1126     2408896 :         q3 = _AVX_ADD(q3, y3);
    1127     2408896 :         _AVX_STORE(&q[2*offset],q3);
    1128     4817792 :         q4 = _AVX_LOAD(&q[3*offset]);
    1129     2408896 :         q4 = _AVX_ADD(q4, y4);
    1130     2408896 :         _AVX_STORE(&q[3*offset],q4);
    1131             : 
    1132     4817792 :         h2 = _AVX_BROADCAST(&hh[ldh+1]);
    1133             : #ifdef __ELPA_USE_FMA__
    1134     4817792 :         q1 = _AVX_LOAD(&q[ldq]);
    1135     4817792 :         q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
    1136     2408896 :         _AVX_STORE(&q[ldq],q1);
    1137     4817792 :         q2 = _AVX_LOAD(&q[ldq+offset]);
    1138     4817792 :         q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
    1139     2408896 :         _AVX_STORE(&q[ldq+offset],q2);
    1140     4817792 :         q3 = _AVX_LOAD(&q[ldq+2*offset]);
    1141     4817792 :         q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
    1142     2408896 :         _AVX_STORE(&q[ldq+2*offset],q3);
    1143     4817792 :         q4 = _AVX_LOAD(&q[ldq+3*offset]);
    1144     4817792 :         q4 = _AVX_ADD(q4, _AVX_FMA(y4, h2, x4));
    1145     2408896 :         _AVX_STORE(&q[ldq+3*offset],q4);
    1146             : 
    1147             : #else
    1148             :         q1 = _AVX_LOAD(&q[ldq]);
    1149             :         q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
    1150             :         _AVX_STORE(&q[ldq],q1);
    1151             :         q2 = _AVX_LOAD(&q[ldq+offset]);
    1152             :         q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
    1153             :         _AVX_STORE(&q[ldq+offset],q2);
    1154             :         q3 = _AVX_LOAD(&q[ldq+2*offset]);
    1155             :         q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
    1156             :         _AVX_STORE(&q[ldq+2*offset],q3);
    1157             :         q4 = _AVX_LOAD(&q[ldq+3*offset]);
    1158             :         q4 = _AVX_ADD(q4, _AVX_ADD(x4, _AVX_MUL(y4, h2)));
    1159             :         _AVX_STORE(&q[ldq+3*offset],q4);
    1160             : 
    1161             : #endif
    1162             : 
    1163   139917888 :         for (i = 2; i < nb; i++)
    1164             :         {
    1165   275017984 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
    1166   275017984 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
    1167             : #ifdef __ELPA_USE_FMA__
    1168   275017984 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1169   137508992 :                 q1 = _AVX_FMA(x1, h1, q1);
    1170   137508992 :                 q1 = _AVX_FMA(y1, h2, q1);
    1171   137508992 :                 _AVX_STORE(&q[i*ldq],q1);
    1172   275017984 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1173   137508992 :                 q2 = _AVX_FMA(x2, h1, q2);
    1174   137508992 :                 q2 = _AVX_FMA(y2, h2, q2);
    1175   137508992 :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
    1176   275017984 :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    1177   137508992 :                 q3 = _AVX_FMA(x3, h1, q3);
    1178   137508992 :                 q3 = _AVX_FMA(y3, h2, q3);
    1179   137508992 :                 _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    1180   275017984 :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    1181   137508992 :                 q4 = _AVX_FMA(x4, h1, q4);
    1182   137508992 :                 q4 = _AVX_FMA(y4, h2, q4);
    1183   137508992 :                 _AVX_STORE(&q[(i*ldq)+3*offset],q4);
    1184             : 
    1185             : #else
    1186             :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1187             :                 q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
    1188             :                 _AVX_STORE(&q[i*ldq],q1);
    1189             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1190             :                 q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
    1191             :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
    1192             :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    1193             :                 q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
    1194             :                 _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    1195             :                 q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    1196             :                 q4 = _AVX_ADD(q4, _AVX_ADD(_AVX_MUL(x4,h1), _AVX_MUL(y4, h2)));
    1197             :                 _AVX_STORE(&q[(i*ldq)+3*offset],q4);
    1198             : 
    1199             : #endif
    1200             :         }
    1201             : 
    1202     4817792 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1203             : #ifdef __ELPA_USE_FMA__
    1204     4817792 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1205     2408896 :         q1 = _AVX_FMA(x1, h1, q1);
    1206     2408896 :         _AVX_STORE(&q[nb*ldq],q1);
    1207     4817792 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1208     2408896 :         q2 = _AVX_FMA(x2, h1, q2);
    1209     2408896 :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
    1210     4817792 :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
    1211     2408896 :         q3 = _AVX_FMA(x3, h1, q3);
    1212     2408896 :         _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
    1213     4817792 :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
    1214     2408896 :         q4 = _AVX_FMA(x4, h1, q4);
    1215     2408896 :         _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
    1216             : 
    1217             : #else
    1218             :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1219             :         q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
    1220             :         _AVX_STORE(&q[nb*ldq],q1);
    1221             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1222             :         q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
    1223             :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
    1224             :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
    1225             :         q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
    1226             :         _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
    1227             :         q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
    1228             :         q4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
    1229             :         _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
    1230             : 
    1231             : #endif
    1232             : }
    1233             : 
    1234             : 
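A note on the sign mask set up at the top of each kernel: tau1 and tau2 are negated by XOR-ing the IEEE sign bit instead of multiplying by -1.0. A minimal double-precision sketch with plain AVX intrinsics follows; the cast through _mm256_castsi256_pd is an assumption about how the C-style vector cast used above is realised, and the helper name is illustrative.

#include <immintrin.h>

/* Flip the sign bit of every lane: equivalent to multiplying by -1.0.
 * This is how h1 = -tau1 and h1 = -tau2 are formed in the kernels. */
static inline __m256d negate_pd(__m256d v)
{
        const __m256d sign =
                _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000));
        return _mm256_xor_pd(v, sign);
}
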
    1235             : /**
    1236             :  * Unrolled kernel that computes
    1237             : #ifdef DOUBLE_PRECISION_REAL
    1238             :  * 12 rows of Q simultaneously:
    1239             : #endif
    1240             : #ifdef SINGLE_PRECISION_REAL
    1241             :  * 24 rows of Q simultaneously:
    1242             : #endif
    1243             :  * a matrix-vector product with two Householder
    1244             :  * vectors plus a rank-2 update is performed.
    1245             :  */
    1246             : #ifdef DOUBLE_PRECISION_REAL
    1247             :  __forceinline void hh_trafo_kernel_12_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
    1248             : #endif
    1249             : #ifdef SINGLE_PRECISION_REAL
    1250             :  __forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
    1251             : #endif
    1252             : {
    1253             :         /////////////////////////////////////////////////////
    1254             :         // Matrix Vector Multiplication, Q [12 x nb+1] * hh
    1255             :         // hh contains two householder vectors, with offset 1
    1256             :         /////////////////////////////////////////////////////
    1257             :         int i;
    1258             :         // Needed bit mask for floating point sign flip
    1259             : #ifdef DOUBLE_PRECISION_REAL
    1260           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
    1261             : #endif
    1262             : #ifdef SINGLE_PRECISION_REAL
    1263           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
    1264             : #endif
    1265             : 
    1266           0 :         __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
    1267           0 :         __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
    1268           0 :         __AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
    1269             : 
    1270           0 :         __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
    1271             :         __AVX_DATATYPE h2;
    1272             : 
    1273             : #ifdef __ELPA_USE_FMA__
    1274           0 :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
    1275           0 :         __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
    1276           0 :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
    1277           0 :         __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
    1278           0 :         __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
    1279           0 :         __AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
    1280             : 
    1281             : #else
    1282             :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
    1283             :         __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
    1284             :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
    1285             :         __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
    1286             :         __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
    1287             :         __AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
    1288             : #endif
    1289             : 
    1290           0 :         for(i = 2; i < nb; i++)
    1291             :         {
    1292           0 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
    1293           0 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
    1294             : #ifdef __ELPA_USE_FMA__
    1295           0 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1296           0 :                 x1 = _AVX_FMA(q1, h1, x1);
    1297           0 :                 y1 = _AVX_FMA(q1, h2, y1);
    1298           0 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1299           0 :                 x2 = _AVX_FMA(q2, h1, x2);
    1300           0 :                 y2 = _AVX_FMA(q2, h2, y2);
    1301           0 :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    1302           0 :                 x3 = _AVX_FMA(q3, h1, x3);
    1303           0 :                 y3 = _AVX_FMA(q3, h2, y3);
    1304             : 
    1305             : #else
    1306             :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1307             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1308             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1309             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1310             :                 x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    1311             :                 y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
    1312             :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    1313             :                 x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
    1314             :                 y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
    1315             : 
    1316             : #endif
    1317             :         }
    1318             : 
    1319           0 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1320             : #ifdef __ELPA_USE_FMA__
    1321           0 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1322           0 :         x1 = _AVX_FMA(q1, h1, x1);
    1323           0 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1324           0 :         x2 = _AVX_FMA(q2, h1, x2);
    1325           0 :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
    1326           0 :         x3 = _AVX_FMA(q3, h1, x3);
    1327             : 
    1328             : #else
    1329             :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1330             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1331             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1332             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    1333             :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
    1334             :         x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
    1335             : 
    1336             : #endif
    1337             : 
    1338             :         /////////////////////////////////////////////////////
    1339             :         // Rank-2 update of Q [12 x nb+1]
    1340             :         /////////////////////////////////////////////////////
    1341             : 
    1342           0 :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
    1343           0 :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
    1344           0 :         __AVX_DATATYPE vs = _AVX_BROADCAST(&s);
    1345             : 
    1346           0 :         h1 = _AVX_XOR(tau1, sign);
    1347           0 :         x1 = _AVX_MUL(x1, h1);
    1348           0 :         x2 = _AVX_MUL(x2, h1);
    1349           0 :         x3 = _AVX_MUL(x3, h1);
    1350           0 :         h1 = _AVX_XOR(tau2, sign);
    1351           0 :         h2 = _AVX_MUL(h1, vs);
    1352             : #ifdef __ELPA_USE_FMA__
    1353           0 :         y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
    1354           0 :         y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
    1355           0 :         y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
    1356             : 
    1357             : #else
    1358             :         y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
    1359             :         y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
    1360             :         y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
    1361             : 
    1362             : #endif
    1363             : 
    1364           0 :         q1 = _AVX_LOAD(q);
    1365           0 :         q1 = _AVX_ADD(q1, y1);
    1366             :         _AVX_STORE(q,q1);
    1367           0 :         q2 = _AVX_LOAD(&q[offset]);
    1368           0 :         q2 = _AVX_ADD(q2, y2);
    1369           0 :         _AVX_STORE(&q[offset],q2);
    1370           0 :         q3 = _AVX_LOAD(&q[2*offset]);
    1371           0 :         q3 = _AVX_ADD(q3, y3);
    1372           0 :         _AVX_STORE(&q[2*offset],q3);
    1373             : 
    1374           0 :         h2 = _AVX_BROADCAST(&hh[ldh+1]);
    1375             : #ifdef __ELPA_USE_FMA__
    1376           0 :         q1 = _AVX_LOAD(&q[ldq]);
    1377           0 :         q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
    1378           0 :         _AVX_STORE(&q[ldq],q1);
    1379           0 :         q2 = _AVX_LOAD(&q[ldq+offset]);
    1380           0 :         q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
    1381           0 :         _AVX_STORE(&q[ldq+offset],q2);
    1382           0 :         q3 = _AVX_LOAD(&q[ldq+2*offset]);
    1383           0 :         q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
    1384           0 :         _AVX_STORE(&q[ldq+2*offset],q3);
    1385             : 
    1386             : #else
    1387             :         q1 = _AVX_LOAD(&q[ldq]);
    1388             :         q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
    1389             :         _AVX_STORE(&q[ldq],q1);
    1390             :         q2 = _AVX_LOAD(&q[ldq+offset]);
    1391             :         q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
    1392             :         _AVX_STORE(&q[ldq+offset],q2);
    1393             :         q3 = _AVX_LOAD(&q[ldq+2*offset]);
    1394             :         q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
    1395             :         _AVX_STORE(&q[ldq+2*offset],q3);
    1396             : 
    1397             : #endif
    1398             : 
    1399           0 :         for (i = 2; i < nb; i++)
    1400             :         {
    1401           0 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
    1402           0 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
    1403             : #ifdef __ELPA_USE_FMA__
    1404           0 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1405           0 :                 q1 = _AVX_FMA(x1, h1, q1);
    1406           0 :                 q1 = _AVX_FMA(y1, h2, q1);
    1407           0 :                 _AVX_STORE(&q[i*ldq],q1);
    1408           0 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1409           0 :                 q2 = _AVX_FMA(x2, h1, q2);
    1410           0 :                 q2 = _AVX_FMA(y2, h2, q2);
    1411           0 :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
    1412           0 :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    1413           0 :                 q3 = _AVX_FMA(x3, h1, q3);
    1414           0 :                 q3 = _AVX_FMA(y3, h2, q3);
    1415           0 :                 _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    1416             : 
    1417             : #else
    1418             :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1419             :                 q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
    1420             :                 _AVX_STORE(&q[i*ldq],q1);
    1421             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1422             :                 q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
    1423             :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
    1424             :                 q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    1425             :                 q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
    1426             :                 _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    1427             : 
    1428             : #endif
    1429             :         }
    1430             : 
    1431           0 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1432             : #ifdef __ELPA_USE_FMA__
    1433           0 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1434           0 :         q1 = _AVX_FMA(x1, h1, q1);
    1435           0 :         _AVX_STORE(&q[nb*ldq],q1);
    1436           0 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1437           0 :         q2 = _AVX_FMA(x2, h1, q2);
    1438           0 :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
    1439           0 :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
    1440           0 :         q3 = _AVX_FMA(x3, h1, q3);
    1441           0 :         _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
    1442             : 
    1443             : #else
    1444             :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1445             :         q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
    1446             :         _AVX_STORE(&q[nb*ldq],q1);
    1447             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1448             :         q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
    1449             :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
    1450             :         q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
    1451             :         q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
    1452             :         _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
    1453             : 
    1454             : #endif
    1455             : }
    1456             : 
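Both preprocessor branches in these kernels compute the same values: with __ELPA_USE_FMA__ defined, _AVX_FMA(a, b, c) yields a*b + c as one fused multiply-add, while the fallback builds the same expression from _AVX_MUL and _AVX_ADD (two roundings instead of one). A plain-intrinsics sketch of the fused form of the inner-loop store phase, assuming the double-precision AVX2+FMA build of this template; the helper name is illustrative.

#include <immintrin.h>

/* One register of the store phase: q <- q + x*h1 + y*h2, written with
 * fused multiply-adds as in the __ELPA_USE_FMA__ branch. */
static inline __m256d rank2_store_pd(__m256d q, __m256d x, __m256d h1,
                                     __m256d y, __m256d h2)
{
        q = _mm256_fmadd_pd(x, h1, q);  /* q += x*h1 */
        q = _mm256_fmadd_pd(y, h2, q);  /* q += y*h2 */
        return q;
}
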
    1457             : /**
    1458             :  * Unrolled kernel that computes
    1459             : #ifdef DOUBLE_PRECISION_REAL
    1460             :  * 8 rows of Q simultaneously:
    1461             : #endif
    1462             : #ifdef SINGLE_PRECISION_REAL
    1463             :  * 16 rows of Q simultaneously:
    1464             : #endif
    1465             :  * a matrix-vector product with two Householder
    1466             :  * vectors plus a rank-2 update is performed.
    1467             :  */
    1468             : #ifdef DOUBLE_PRECISION_REAL
    1469             :  __forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
    1470             : #endif
    1471             : #ifdef SINGLE_PRECISION_REAL
    1472             :  __forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
    1473             : #endif
    1474             : {
    1475             :         /////////////////////////////////////////////////////
    1476             :         // Matrix Vector Multiplication, Q [8 x nb+1] * hh
    1477             :         // hh contains two householder vectors, with offset 1
    1478             :         /////////////////////////////////////////////////////
    1479             :         int i;
    1480             :         // Needed bit mask for floating point sign flip
    1481             : #ifdef DOUBLE_PRECISION_REAL
    1482           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
    1483             : #endif
    1484             : #ifdef SINGLE_PRECISION_REAL
    1485           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
    1486             : #endif
    1487             : 
    1488           0 :         __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
    1489           0 :         __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
    1490           0 :         __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
    1491             :         __AVX_DATATYPE h2;
    1492             : 
    1493             : #ifdef __ELPA_USE_FMA__
    1494           0 :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
    1495           0 :         __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
    1496           0 :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
    1497           0 :         __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
    1498             : 
    1499             : #else
    1500             :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
    1501             :         __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
    1502             :         __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
    1503             :         __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
    1504             : 
    1505             : #endif
    1506             : 
    1507           0 :         for(i = 2; i < nb; i++)
    1508             :         {
    1509           0 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
    1510           0 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
    1511             : #ifdef __ELPA_USE_FMA__
    1512           0 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1513           0 :                 x1 = _AVX_FMA(q1, h1, x1);
    1514           0 :                 y1 = _AVX_FMA(q1, h2, y1);
    1515           0 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1516           0 :                 x2 = _AVX_FMA(q2, h1, x2);
    1517           0 :                 y2 = _AVX_FMA(q2, h2, y2);
    1518             : 
    1519             : #else
    1520             :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1521             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1522             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1523             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1524             :                 x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    1525             :                 y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
    1526             : 
    1527             : #endif
    1528             :         }
    1529             : 
    1530           0 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1531             : #ifdef __ELPA_USE_FMA__
    1532           0 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1533           0 :         x1 = _AVX_FMA(q1, h1, x1);
    1534           0 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1535           0 :         x2 = _AVX_FMA(q2, h1, x2);
    1536             : 
    1537             : #else
    1538             :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1539             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1540             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1541             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    1542             : 
    1543             : #endif
    1544             : 
    1545             :         /////////////////////////////////////////////////////
    1546             :         // Rank-2 update of Q [8 x nb+1]
    1547             :         /////////////////////////////////////////////////////
    1548             : 
    1549           0 :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
    1550           0 :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
    1551           0 :         __AVX_DATATYPE vs = _AVX_BROADCAST(&s);
    1552             : 
    1553           0 :         h1 = _AVX_XOR(tau1, sign);
    1554           0 :         x1 = _AVX_MUL(x1, h1);
    1555           0 :         x2 = _AVX_MUL(x2, h1);
    1556           0 :         h1 = _AVX_XOR(tau2, sign);
    1557           0 :         h2 = _AVX_MUL(h1, vs);
    1558             : #ifdef __ELPA_USE_FMA__
    1559           0 :         y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
    1560           0 :         y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
    1561             : 
    1562             : #else
    1563             :         y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
    1564             :         y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
    1565             : 
    1566             : #endif
    1567             : 
    1568           0 :         q1 = _AVX_LOAD(q);
    1569           0 :         q1 = _AVX_ADD(q1, y1);
    1570             :         _AVX_STORE(q,q1);
    1571           0 :         q2 = _AVX_LOAD(&q[offset]);
    1572           0 :         q2 = _AVX_ADD(q2, y2);
    1573           0 :         _AVX_STORE(&q[offset],q2);
    1574             : 
    1575           0 :         h2 = _AVX_BROADCAST(&hh[ldh+1]);
    1576             : #ifdef __ELPA_USE_FMA__
    1577           0 :         q1 = _AVX_LOAD(&q[ldq]);
    1578           0 :         q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
    1579           0 :         _AVX_STORE(&q[ldq],q1);
    1580           0 :         q2 = _AVX_LOAD(&q[ldq+offset]);
    1581           0 :         q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
    1582           0 :         _AVX_STORE(&q[ldq+offset],q2);
    1583             : 
    1584             : #else
    1585             :         q1 = _AVX_LOAD(&q[ldq]);
    1586             :         q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
    1587             :         _AVX_STORE(&q[ldq],q1);
    1588             :         q2 = _AVX_LOAD(&q[ldq+offset]);
    1589             :         q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
    1590             :         _AVX_STORE(&q[ldq+offset],q2);
    1591             : 
    1592             : #endif
    1593             : 
    1594           0 :         for (i = 2; i < nb; i++)
    1595             :         {
    1596           0 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
    1597           0 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
    1598             : #ifdef __ELPA_USE_FMA__
    1599           0 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1600           0 :                 q1 = _AVX_FMA(x1, h1, q1);
    1601           0 :                 q1 = _AVX_FMA(y1, h2, q1);
    1602           0 :                 _AVX_STORE(&q[i*ldq],q1);
    1603           0 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1604           0 :                 q2 = _AVX_FMA(x2, h1, q2);
    1605           0 :                 q2 = _AVX_FMA(y2, h2, q2);
    1606           0 :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
    1607             : 
    1608             : #else
    1609             :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1610             :                 q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
    1611             :                 _AVX_STORE(&q[i*ldq],q1);
    1612             :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    1613             :                 q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
    1614             :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
    1615             : 
    1616             : #endif
    1617             :         }
    1618             : 
    1619           0 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1620             : #ifdef __ELPA_USE_FMA__
    1621           0 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1622           0 :         q1 = _AVX_FMA(x1, h1, q1);
    1623           0 :         _AVX_STORE(&q[nb*ldq],q1);
    1624           0 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1625           0 :         q2 = _AVX_FMA(x2, h1, q2);
    1626           0 :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
    1627             : 
    1628             : #else
    1629             :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1630             :         q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
    1631             :         _AVX_STORE(&q[nb*ldq],q1);
    1632             :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
    1633             :         q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
    1634             :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
    1635             : 
    1636             : #endif
    1637             : }
    1638             : 
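The stride offset used in every load and store above is the number of elements held by one 256-bit __AVX_DATATYPE register: 4 in double precision and 8 in single precision, which is why each kernel handles twice as many rows of Q in the single-precision build. Its actual definition lives earlier in this template and is not shown in this excerpt; a plausible form is:

#ifdef DOUBLE_PRECISION_REAL
#define offset 4   /* one 256-bit AVX register holds 4 doubles */
#endif
#ifdef SINGLE_PRECISION_REAL
#define offset 8   /* one 256-bit AVX register holds 8 floats  */
#endif
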
    1639             : /**
    1640             :  * Unrolled kernel that computes
    1641             : #ifdef DOUBLE_PRECISION_REAL
    1642             :  * 4 rows of Q simultaneously:
    1643             : #endif
    1644             : #ifdef SINGLE_PRECISION_REAL
    1645             :  * 8 rows of Q simultaneously:
    1646             : #endif
    1647             :  * a matrix-vector product with two Householder
    1648             :  * vectors plus a rank-2 update is performed.
    1649             :  */
    1650             : #ifdef DOUBLE_PRECISION_REAL
    1651             :  __forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
    1652             : #endif
    1653             : #ifdef SINGLE_PRECISION_REAL
    1654             :  __forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
    1655             : #endif
    1656             : {
    1657             :         /////////////////////////////////////////////////////
    1658             :         // Matrix Vector Multiplication, Q [4 x nb+1] * hh
    1659             :         // hh contains two householder vectors, with offset 1
    1660             :         /////////////////////////////////////////////////////
    1661             :         int i;
    1662             :         // Needed bit mask for floating point sign flip
    1663             : #ifdef DOUBLE_PRECISION_REAL
    1664           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
    1665             : #endif
    1666             : #ifdef SINGLE_PRECISION_REAL
    1667           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
    1668             : #endif
    1669             : 
    1670           0 :         __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
    1671           0 :         __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
    1672             :         __AVX_DATATYPE h2;
    1673             : 
    1674             : #ifdef __ELPA_USE_FMA__
    1675             : 
    1676           0 :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
    1677           0 :         __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
    1678             : #else
    1679             : 
    1680             :         __AVX_DATATYPE q1 = _AVX_LOAD(q);
    1681             :         __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
    1682             : #endif
    1683             : 
    1684           0 :         for(i = 2; i < nb; i++)
    1685             :         {
    1686           0 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
    1687           0 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
    1688             : #ifdef __ELPA_USE_FMA__
    1689             : 
    1690           0 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1691           0 :                 x1 = _AVX_FMA(q1, h1, x1);
    1692           0 :                 y1 = _AVX_FMA(q1, h2, y1);
    1693             : #else
    1694             : 
    1695             :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1696             : 
    1697             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1698             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1699             : #endif
    1700             :         }
    1701             : 
    1702           0 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1703             : #ifdef __ELPA_USE_FMA__
    1704             : 
    1705           0 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1706           0 :         x1 = _AVX_FMA(q1, h1, x1);
    1707             : #else
    1708             : 
    1709             :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1710             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1711             : #endif
    1712             : 
    1713             :         /////////////////////////////////////////////////////
    1714             :         // Rank-2 update of Q [4 x nb+1]
    1715             :         /////////////////////////////////////////////////////
    1716             : 
    1717           0 :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
    1718           0 :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
    1719           0 :         __AVX_DATATYPE vs = _AVX_BROADCAST(&s);
    1720             : 
    1721           0 :         h1 = _AVX_XOR(tau1, sign);
    1722           0 :         x1 = _AVX_MUL(x1, h1);
    1723           0 :         h1 = _AVX_XOR(tau2, sign);
    1724           0 :         h2 = _AVX_MUL(h1, vs);
    1725             : #ifdef __ELPA_USE_FMA__
    1726           0 :         y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
    1727             : #else
    1728             :         y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
    1729             : #endif
    1730             : 
    1731           0 :         q1 = _AVX_LOAD(q);
    1732           0 :         q1 = _AVX_ADD(q1, y1);
    1733             :         _AVX_STORE(q,q1);
    1734           0 :         h2 = _AVX_BROADCAST(&hh[ldh+1]);
    1735             : #ifdef __ELPA_USE_FMA__
    1736             : 
    1737           0 :         q1 = _AVX_LOAD(&q[ldq]);
    1738           0 :         q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
    1739             : 
    1740           0 :         _AVX_STORE(&q[ldq],q1);
    1741             : 
    1742             : #else
    1743             : 
    1744             :         q1 = _AVX_LOAD(&q[ldq]);
    1745             :         q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
    1746             :         _AVX_STORE(&q[ldq],q1);
    1747             : 
    1748             : #endif
    1749             : 
    1750           0 :         for (i = 2; i < nb; i++)
    1751             :         {
    1752           0 :                 h1 = _AVX_BROADCAST(&hh[i-1]);
    1753           0 :                 h2 = _AVX_BROADCAST(&hh[ldh+i]);
    1754             : #ifdef __ELPA_USE_FMA__
    1755             : 
    1756           0 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1757           0 :                 q1 = _AVX_FMA(x1, h1, q1);
    1758           0 :                 q1 = _AVX_FMA(y1, h2, q1);
    1759           0 :                 _AVX_STORE(&q[i*ldq],q1);
    1760             : 
    1761             : #else
    1762             :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1763             : 
    1764             :                 q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
    1765             :                 _AVX_STORE(&q[i*ldq],q1);
    1766             : 
    1767             : #endif
    1768             :         }
    1769             : 
    1770           0 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1771             : #ifdef __ELPA_USE_FMA__
    1772             : 
    1773           0 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1774           0 :         q1 = _AVX_FMA(x1, h1, q1);
    1775           0 :         _AVX_STORE(&q[nb*ldq],q1);
    1776             : 
    1777             : #else
    1778             : 
    1779             :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1780             :         q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
    1781             :         _AVX_STORE(&q[nb*ldq],q1);
    1782             : #endif
    1783             : }
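
Taken together, the kernel widths in this listing (16/12/8/4 rows in double precision, 32/24/16/8 in single precision) let a caller cover a block of columns with the widest kernel that still fits. The sketch below is a hypothetical double-precision dispatch, not the driver routine of this template, which lies outside this excerpt, computes the dot product s itself and presumably also uses the wider five-register kernel whose tail appears at the top of this section.

/* Illustrative dispatch over the double-precision kernels above; the
 * remainder nq % 4 != 0 is not handled here. */
static void apply_2hv_block_double(double* q, double* hh, int nq, int nb,
                                   int ldq, int ldh, double s)
{
        int i = 0;
        for (; i + 16 <= nq; i += 16)
                hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
        if (i + 12 <= nq) { hh_trafo_kernel_12_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); i += 12; }
        if (i +  8 <= nq) { hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);  i += 8;  }
        if (i +  4 <= nq) { hh_trafo_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);  i += 4;  }
}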

Generated by: LCOV version 1.12