LCOV - code coverage report
Current view: top level - src/elpa2/kernels - real_avx-avx2_6hv_template.c (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53

             Hit   Total   Coverage
Lines:       437     673     64.9 %
Functions:     2       2    100.0 %

          Line data    Source code
       1             : //    This file is part of ELPA.
       2             : //
       3             : //    The ELPA library was originally created by the ELPA consortium,
       4             : //    consisting of the following organizations:
       5             : //
       6             : //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : //      Informatik,
      10             : //    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : //      Schwerpunkt Wissenschaftliches Rechnen,
      12             : //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : //    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
      14             : //      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      15             : //      and
      16             : //    - IBM Deutschland GmbH
      17             : //
      18             : //    This particular source code file contains additions, changes and
      19             : //    enhancements authored by Intel Corporation, which is not part of
      20             : //    the ELPA consortium.
      21             : //
      22             : //    More information can be found here:
      23             : //    http://elpa.mpcdf.mpg.de/
      24             : //
      25             : //    ELPA is free software: you can redistribute it and/or modify
      26             : //    it under the terms of the version 3 of the license of the
      27             : //    GNU Lesser General Public License as published by the Free
      28             : //    Software Foundation.
      29             : //
      30             : //    ELPA is distributed in the hope that it will be useful,
      31             : //    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : //    GNU Lesser General Public License for more details.
      34             : //
      35             : //    You should have received a copy of the GNU Lesser General Public License
      36             : //    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : //
      38             : //    ELPA reflects a substantial effort on the part of the original
      39             : //    ELPA consortium, and we ask you to respect the spirit of the
      40             : //    license that we chose: i.e., please contribute any changes you
      41             : //    may have back to the original ELPA library distribution, and keep
      42             : //    any derivatives of ELPA under the same license that we chose for
      43             : //    the original distribution, the GNU Lesser General Public License.
      44             : //
      45             : //
      46             : // --------------------------------------------------------------------------------------------------
      47             : //
      48             : // This file contains the compute intensive kernels for the Householder transformations.
      49             : // It should be compiled with the highest possible optimization level.
      50             : //
      51             : // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
      52             : // On Intel Sandy Bridge use -O3 -mavx
      53             : //
      54             : // Copyright of the original code rests with the authors inside the ELPA
      55             : // consortium. The copyright of any additional modifications shall rest
      56             : // with their original authors, but shall adhere to the licensing terms
      57             : // distributed along with the original code in the file "COPYING".
      58             : //
      59             : // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
      60             : // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
      61             : // --------------------------------------------------------------------------------------------------
      62             : 
      63             : #include "config-f90.h"
      64             : 
      65             : #include <x86intrin.h>
      66             : #include <stdio.h>
      67             : #include <stdlib.h>
      68             : 
      69             : #define __forceinline __attribute__((always_inline)) static
      70             : 
      71             : #ifdef DOUBLE_PRECISION_REAL
      72             : #define offset 4
      73             : #define __AVX_DATATYPE __m256d
      74             : #define _AVX_LOAD _mm256_load_pd
      75             : #define _AVX_STORE _mm256_store_pd
      76             : #define _AVX_ADD _mm256_add_pd
      77             : #define _AVX_SUB _mm256_sub_pd
      78             : #define _AVX_MUL _mm256_mul_pd
      79             : #define _AVX_BROADCAST _mm256_broadcast_sd
      80             : 
      81             : #ifdef HAVE_AVX2
      82             : 
      83             : #ifdef __FMA4__
      84             : #define __ELPA_USE_FMA__
      85             : #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
      86             : #define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c)
       87             : #define _mm256_FMSUB_pd(a,b,c) _mm256_msub_pd(a,b,c)
      88             : #endif
      89             : 
      90             : #ifdef __AVX2__
      91             : #define __ELPA_USE_FMA__
      92             : #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
      93             : #define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c)
      94             : #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
      95             : #endif
      96             : 
      97             : #endif
      98             : 
      99             : #define _AVX_FMA _mm256_FMA_pd
     100             : #define _AVX_NFMA _mm256_NFMA_pd
     101             : #define _AVX_FMSUB _mm256_FMSUB_pd
     102             : #endif /* DOUBLE_PRECISION_REAL */
     103             : 
     104             : #ifdef SINGLE_PRECISION_REAL
     105             : #define offset 8
     106             : #define __AVX_DATATYPE __m256
     107             : #define _AVX_LOAD _mm256_load_ps
     108             : #define _AVX_STORE _mm256_store_ps
     109             : #define _AVX_ADD _mm256_add_ps
     110             : #define _AVX_SUB _mm256_sub_ps
     111             : #define _AVX_MUL _mm256_mul_ps
     112             : #define _AVX_BROADCAST _mm256_broadcast_ss
     113             : 
     114             : #ifdef HAVE_AVX2
     115             : 
     116             : #ifdef __FMA4__
     117             : #define __ELPA_USE_FMA__
     118             : #define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
     119             : #define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
      120             : #define _mm256_FMSUB_ps(a,b,c) _mm256_msub_ps(a,b,c)
     121             : #endif
     122             : 
     123             : #ifdef __AVX2__
     124             : #define __ELPA_USE_FMA__
     125             : #define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
     126             : #define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
     127             : #define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
     128             : #endif
     129             : 
     130             : #endif
     131             : 
     132             : #define _AVX_FMA _mm256_FMA_ps
     133             : #define _AVX_NFMA _mm256_NFMA_ps
     134             : #define _AVX_FMSUB _mm256_FMSUB_ps
     135             : #endif /* SINGLE_PRECISION_REAL */
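                      : // Note on semantics: with either FMA flavour selected above, the wrappers
                      : // compute _AVX_FMA(a,b,c) = a*b + c, _AVX_NFMA(a,b,c) = c - a*b and
                      : // _AVX_FMSUB(a,b,c) = a*b - c; without __ELPA_USE_FMA__ the kernels below
                      : // fall back to the separate _AVX_MUL/_AVX_ADD/_AVX_SUB paths.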
     136             : 
     137             : #ifdef DOUBLE_PRECISION_REAL
      138             : // Forward declarations
     139             : static void hh_trafo_kernel_4_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
     140             : static void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
     141             : 
     142             : void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
     143             : #endif
     144             : 
     145             : #ifdef SINGLE_PRECISION_REAL
      146             : // Forward declarations
     147             : static void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
     148             : static void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
     149             : 
      150             : void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
     151             : 
     152             : #endif
     153             : 
     154             : #ifdef DOUBLE_PRECISION_REAL
     155             : /*
     156             : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
     157             : !f> interface
     158             : !f>   subroutine hexa_hh_trafo_real_avx_avx2_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
     159             : !f>                             bind(C, name="hexa_hh_trafo_real_avx_avx2_6hv_double")
     160             : !f>     use, intrinsic :: iso_c_binding
     161             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     162             : !f>     type(c_ptr), value      :: q
     163             : !f>     real(kind=c_double)     :: hh(pnb,6)
     164             : !f>   end subroutine
     165             : !f> end interface
     166             : !f>#endif
     167             : */
     168             : #endif
     169             : #ifdef SINGLE_PRECISION_REAL
     170             : /*
     171             : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
     172             : !f> interface
     173             : !f>   subroutine hexa_hh_trafo_real_avx_avx2_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
     174             : !f>                             bind(C, name="hexa_hh_trafo_real_avx_avx2_6hv_single")
     175             : !f>     use, intrinsic :: iso_c_binding
     176             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     177             : !f>     type(c_ptr), value      :: q
     178             : !f>     real(kind=c_float)      :: hh(pnb,6)
     179             : !f>   end subroutine
     180             : !f> end interface
     181             : !f>#endif
     182             : */
     183             : #endif
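                      : // The "!f>" lines above are machine-readable: the ELPA build strips the
                      : // prefix and collects them into the Fortran interfaces through which these
                      : // C kernels are called.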
     184             : 
     185             : #ifdef DOUBLE_PRECISION_REAL
     186      327680 : void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     187             : #endif
     188             : #ifdef SINGLE_PRECISION_REAL
     189       61440 : void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     190             : #endif
     191             : {
     192             :         int i;
     193      389120 :         int nb = *pnb;
      194      389120 :         int nq = *pnq;
     195      389120 :         int ldq = *pldq;
     196      389120 :         int ldh = *pldh;
     197             :         int worked_on;
     198             : 
     199      389120 :         worked_on = 0;
     200             : 
      201             :         // calculating the scalar products needed to apply
      202             :         // all 6 householder vectors simultaneously
     203             : #ifdef DOUBLE_PRECISION_REAL
     204             :         double scalarprods[15];
     205             : #endif
     206             : #ifdef SINGLE_PRECISION_REAL
     207             :         float scalarprods[15];
     208             : #endif
     209             : 
     210      389120 :         scalarprods[0] = hh[(ldh+1)];
     211      389120 :         scalarprods[1] = hh[(ldh*2)+2];
     212      389120 :         scalarprods[2] = hh[(ldh*2)+1];
     213      389120 :         scalarprods[3] = hh[(ldh*3)+3];
     214      389120 :         scalarprods[4] = hh[(ldh*3)+2];
     215      389120 :         scalarprods[5] = hh[(ldh*3)+1];
     216      389120 :         scalarprods[6] = hh[(ldh*4)+4];
     217      389120 :         scalarprods[7] = hh[(ldh*4)+3];
     218      389120 :         scalarprods[8] = hh[(ldh*4)+2];
     219      389120 :         scalarprods[9] = hh[(ldh*4)+1];
     220      389120 :         scalarprods[10] = hh[(ldh*5)+5];
     221      389120 :         scalarprods[11] = hh[(ldh*5)+4];
     222      389120 :         scalarprods[12] = hh[(ldh*5)+3];
     223      389120 :         scalarprods[13] = hh[(ldh*5)+2];
     224      389120 :         scalarprods[14] = hh[(ldh*5)+1];
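                      : // Layout note: scalarprods packs the 15 pairwise products s(a,b) of the six
                      : // householder vectors (1 <= a < b <= 6); the block for vector b starts at
                      : // index (b-1)*(b-2)/2, with a ascending: [0]=s(1,2); [1..2]=s(1,3),s(2,3);
                      : // [3..5]=s(*,4); [6..9]=s(*,5); [10..14]=s(*,6). The assignments above seed
                      : // each product with the term contributed by v_a's implicit leading 1.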
     225             : 
      226             :         // accumulate the remaining terms of all pairwise scalar products
     227             :         // loop counter = 2
     228      389120 :         scalarprods[0] += hh[1] * hh[(2+ldh)];
     229      389120 :         scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
     230      389120 :         scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
     231      389120 :         scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
     232      389120 :         scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];
     233             : 
     234             :         // loop counter = 3
     235      389120 :         scalarprods[0] += hh[2] * hh[(3+ldh)];
     236      389120 :         scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
     237      389120 :         scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
     238      389120 :         scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
     239      389120 :         scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
     240             : 
     241      389120 :         scalarprods[1] += hh[1] * hh[3+(ldh*2)];
     242      389120 :         scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
     243      389120 :         scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
     244      389120 :         scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];
     245             : 
     246             :         // loop counter = 4
     247      389120 :         scalarprods[0] += hh[3] * hh[(4+ldh)];
     248      389120 :         scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
     249      389120 :         scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
     250      389120 :         scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
     251      389120 :         scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];
     252             : 
     253      389120 :         scalarprods[1] += hh[2] * hh[4+(ldh*2)];
     254      389120 :         scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
     255      389120 :         scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
     256      389120 :         scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];
     257             : 
     258      389120 :         scalarprods[3] += hh[1] * hh[4+(ldh*3)];
     259      389120 :         scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
     260      389120 :         scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];
     261             : 
     262             :         // loop counter = 5
     263      389120 :         scalarprods[0] += hh[4] * hh[(5+ldh)];
     264      389120 :         scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
     265      389120 :         scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
     266      389120 :         scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
     267      389120 :         scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];
     268             : 
     269      389120 :         scalarprods[1] += hh[3] * hh[5+(ldh*2)];
     270      389120 :         scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
     271      389120 :         scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
     272      389120 :         scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];
     273             : 
     274      389120 :         scalarprods[3] += hh[2] * hh[5+(ldh*3)];
     275      389120 :         scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
     276      389120 :         scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];
     277             : 
     278      389120 :         scalarprods[6] += hh[1] * hh[5+(ldh*4)];
     279      389120 :         scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];
     280             : 
     281             :         #pragma ivdep
     282    22958080 :         for (i = 6; i < nb; i++)
     283             :         {
     284    22568960 :                 scalarprods[0] += hh[i-1] * hh[(i+ldh)];
     285    22568960 :                 scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
     286    22568960 :                 scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
     287    22568960 :                 scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
     288    22568960 :                 scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];
     289             : 
     290    22568960 :                 scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
     291    22568960 :                 scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
     292    22568960 :                 scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
     293    22568960 :                 scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];
     294             : 
     295    22568960 :                 scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
     296    22568960 :                 scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
     297    22568960 :                 scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];
     298             : 
     299    22568960 :                 scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
     300    22568960 :                 scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];
     301             : 
     302    22568960 :                 scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
     303             :         }
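                      : // A sketch of the generic form that the unrolled "loop counter = 2..5"
                      : // blocks plus the loop above implement (loop names a and b are ours,
                      : // not ELPA's; kept under #if 0 as illustration only):
                      : #if 0
                      :         for (i = 2; i < nb; i++)
                      :                 for (int b = 2; b <= 6; b++)
                      :                         for (int a = 1; a < b; a++)
                      :                                 if (i - (b-a) >= 1) // i == b-a is the seed term above
                      :                                         scalarprods[(b-1)*(b-2)/2 + (a-1)] +=
                      :                                                 hh[(a-1)*ldh + i-(b-a)] * hh[(b-1)*ldh + i];
                      : #endif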
     304             : 
     305             :         // Production level kernel calls with padding
     306             : #ifdef DOUBLE_PRECISION_REAL
     307     1966080 :         for (i = 0; i < nq-4; i+=8)
     308             :         {
     309     1638400 :                 hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
     310     1638400 :                 worked_on += 8;
     311             :         }
     312             : #endif
     313             : #ifdef SINGLE_PRECISION_REAL
     314      368640 :         for (i = 0; i < nq-8; i+=16)
     315             :         {
     316      307200 :                 hh_trafo_kernel_16_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
     317      307200 :                 worked_on += 16;
     318             :         }
     319             : #endif
     320      389120 :         if (nq == i)
     321             :         {
     322      389120 :                 return;
     323             :         }
     324             : #ifdef DOUBLE_PRECISION_REAL
     325           0 :         if (nq-i == 4)
     326             :         {
     327           0 :                 hh_trafo_kernel_4_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
     328           0 :                 worked_on += 4;
     329             :         }
     330             : #endif
     331             : #ifdef SINGLE_PRECISION_REAL
     332           0 :         if (nq-i == 8)
     333             :         {
     334           0 :                 hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
     335           0 :                 worked_on += 8;
     336             :         }
     337             : #endif
     338             : #ifdef WITH_DEBUG
     339             :         if (worked_on != nq)
     340             :         {
      341             :                 printf("Error in real AVX/AVX2 BLOCK6 kernel\n");
      342             :                 abort();
      343             :         }
     344             : #endif
     345             : }
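                      : // Illustrative call only (a sketch; in ELPA the driver is reached from
                      : // Fortran via the interface blocks above, all sizes passed by reference):
                      : #if 0
                      :         int nb = 64, nq = 1024, ldq = 1024, ldh = 64; // example sizes, not ELPA's
                      :         hexa_hh_trafo_real_avx_avx2_6hv_double(q, hh, &nb, &nq, &ldq, &ldh);
                      : #endif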
     346             : 
     347             : /**
     348             :  * Unrolled kernel that computes
     349             : #ifdef DOUBLE_PRECISION_REAL
     350             :  * 8 rows of Q simultaneously, a
     351             : #endif
     352             : #ifdef SINGLE_PRECISION_REAL
     353             :  * 16 rows of Q simultaneously, a
     354             : #endif
      355             :  * matrix vector product with six householder
      356             :  * vectors + a rank 6 update is performed
     357             :  */
     358             : #ifdef DOUBLE_PRECISION_REAL
     359             : __forceinline void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
     360             : #endif
     361             : #ifdef SINGLE_PRECISION_REAL
     362             : __forceinline void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
     363             : #endif
     364             : {
     365             :         /////////////////////////////////////////////////////
      366             :         // Matrix Vector Multiplication, Q [8 x nb+5] * hh
      367             :         // hh contains six householder vectors
     368             :         /////////////////////////////////////////////////////
     369             :         int i;
     370             : 
     371     3891200 :         __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*5]);
     372     3891200 :         __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*4]);
     373     3891200 :         __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq*3]);
     374     3891200 :         __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[ldq*2]);
     375     3891200 :         __AVX_DATATYPE a5_1 = _AVX_LOAD(&q[ldq]);
     376     1945600 :         __AVX_DATATYPE a6_1 = _AVX_LOAD(&q[0]);
     377             : 
     378     3891200 :         __AVX_DATATYPE h_6_5 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
     379     3891200 :         __AVX_DATATYPE h_6_4 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
     380     3891200 :         __AVX_DATATYPE h_6_3 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
     381     3891200 :         __AVX_DATATYPE h_6_2 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
     382     3891200 :         __AVX_DATATYPE h_6_1 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
     383             : #ifdef __ELPA_USE_FMA__
     384     1945600 :         register __AVX_DATATYPE t1 = _AVX_FMA(a5_1, h_6_5, a6_1);
     385     1945600 :         t1 = _AVX_FMA(a4_1, h_6_4, t1);
     386     1945600 :         t1 = _AVX_FMA(a3_1, h_6_3, t1);
     387     1945600 :         t1 = _AVX_FMA(a2_1, h_6_2, t1);
     388     1945600 :         t1 = _AVX_FMA(a1_1, h_6_1, t1);
     389             : #else
     390             :         register __AVX_DATATYPE t1 = _AVX_ADD(a6_1, _AVX_MUL(a5_1, h_6_5));
     391             :         t1 = _AVX_ADD(t1, _AVX_MUL(a4_1, h_6_4));
     392             :         t1 = _AVX_ADD(t1, _AVX_MUL(a3_1, h_6_3));
     393             :         t1 = _AVX_ADD(t1, _AVX_MUL(a2_1, h_6_2));
     394             :         t1 = _AVX_ADD(t1, _AVX_MUL(a1_1, h_6_1));
     395             : #endif
     396     3891200 :         __AVX_DATATYPE h_5_4 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
     397     3891200 :         __AVX_DATATYPE h_5_3 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
     398     3891200 :         __AVX_DATATYPE h_5_2 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
     399     3891200 :         __AVX_DATATYPE h_5_1 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
     400             : #ifdef __ELPA_USE_FMA__
     401     1945600 :         register __AVX_DATATYPE v1 = _AVX_FMA(a4_1, h_5_4, a5_1);
     402     1945600 :         v1 = _AVX_FMA(a3_1, h_5_3, v1);
     403     1945600 :         v1 = _AVX_FMA(a2_1, h_5_2, v1);
     404     1945600 :         v1 = _AVX_FMA(a1_1, h_5_1, v1);
     405             : #else
     406             :         register __AVX_DATATYPE v1 = _AVX_ADD(a5_1, _AVX_MUL(a4_1, h_5_4));
     407             :         v1 = _AVX_ADD(v1, _AVX_MUL(a3_1, h_5_3));
     408             :         v1 = _AVX_ADD(v1, _AVX_MUL(a2_1, h_5_2));
     409             :         v1 = _AVX_ADD(v1, _AVX_MUL(a1_1, h_5_1));
     410             : #endif
     411     3891200 :         __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
     412     3891200 :         __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
     413     3891200 :         __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
     414             : #ifdef __ELPA_USE_FMA__
     415     1945600 :         register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
     416     1945600 :         w1 = _AVX_FMA(a2_1, h_4_2, w1);
     417     1945600 :         w1 = _AVX_FMA(a1_1, h_4_1, w1);
     418             : #else
     419             :         register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
     420             :         w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
     421             :         w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
     422             : #endif
     423     3891200 :         __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
     424     3891200 :         __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
     425     3891200 :         __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
     426             : #ifdef __ELPA_USE_FMA__
     427     1945600 :         register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
     428     1945600 :         z1 = _AVX_FMA(a1_1, h_3_1, z1);
     429     1945600 :         register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
     430             : #else
     431             :         register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
     432             :         z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
     433             :         register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
     434             : #endif
     435     1945600 :         register __AVX_DATATYPE x1 = a1_1;
     436             : 
     437     3891200 :         __AVX_DATATYPE a1_2 = _AVX_LOAD(&q[(ldq*5)+offset]);
     438     3891200 :         __AVX_DATATYPE a2_2 = _AVX_LOAD(&q[(ldq*4)+offset]);
     439     3891200 :         __AVX_DATATYPE a3_2 = _AVX_LOAD(&q[(ldq*3)+offset]);
     440     3891200 :         __AVX_DATATYPE a4_2 = _AVX_LOAD(&q[(ldq*2)+offset]);
     441     3891200 :         __AVX_DATATYPE a5_2 = _AVX_LOAD(&q[(ldq)+offset]);
     442     3891200 :         __AVX_DATATYPE a6_2 = _AVX_LOAD(&q[offset]);
     443             : 
     444             : #ifdef __ELPA_USE_FMA__
     445     1945600 :         register __AVX_DATATYPE t2 = _AVX_FMA(a5_2, h_6_5, a6_2);
     446     1945600 :         t2 = _AVX_FMA(a4_2, h_6_4, t2);
     447     1945600 :         t2 = _AVX_FMA(a3_2, h_6_3, t2);
     448     1945600 :         t2 = _AVX_FMA(a2_2, h_6_2, t2);
     449     1945600 :         t2 = _AVX_FMA(a1_2, h_6_1, t2);
     450     1945600 :         register __AVX_DATATYPE v2 = _AVX_FMA(a4_2, h_5_4, a5_2);
     451     1945600 :         v2 = _AVX_FMA(a3_2, h_5_3, v2);
     452     1945600 :         v2 = _AVX_FMA(a2_2, h_5_2, v2);
     453     1945600 :         v2 = _AVX_FMA(a1_2, h_5_1, v2);
     454     1945600 :         register __AVX_DATATYPE w2 = _AVX_FMA(a3_2, h_4_3, a4_2);
     455     1945600 :         w2 = _AVX_FMA(a2_2, h_4_2, w2);
     456     1945600 :         w2 = _AVX_FMA(a1_2, h_4_1, w2);
     457     1945600 :         register __AVX_DATATYPE z2 = _AVX_FMA(a2_2, h_3_2, a3_2);
     458     1945600 :         z2 = _AVX_FMA(a1_2, h_3_1, z2);
     459     1945600 :         register __AVX_DATATYPE y2 = _AVX_FMA(a1_2, h_2_1, a2_2);
     460             : #else
     461             :         register __AVX_DATATYPE t2 = _AVX_ADD(a6_2, _AVX_MUL(a5_2, h_6_5));
     462             :         t2 = _AVX_ADD(t2, _AVX_MUL(a4_2, h_6_4));
     463             :         t2 = _AVX_ADD(t2, _AVX_MUL(a3_2, h_6_3));
     464             :         t2 = _AVX_ADD(t2, _AVX_MUL(a2_2, h_6_2));
     465             :         t2 = _AVX_ADD(t2, _AVX_MUL(a1_2, h_6_1));
     466             :         register __AVX_DATATYPE v2 = _AVX_ADD(a5_2, _AVX_MUL(a4_2, h_5_4));
     467             :         v2 = _AVX_ADD(v2, _AVX_MUL(a3_2, h_5_3));
     468             :         v2 = _AVX_ADD(v2, _AVX_MUL(a2_2, h_5_2));
     469             :         v2 = _AVX_ADD(v2, _AVX_MUL(a1_2, h_5_1));
     470             :         register __AVX_DATATYPE w2 = _AVX_ADD(a4_2, _AVX_MUL(a3_2, h_4_3));
     471             :         w2 = _AVX_ADD(w2, _AVX_MUL(a2_2, h_4_2));
     472             :         w2 = _AVX_ADD(w2, _AVX_MUL(a1_2, h_4_1));
     473             :         register __AVX_DATATYPE z2 = _AVX_ADD(a3_2, _AVX_MUL(a2_2, h_3_2));
     474             :         z2 = _AVX_ADD(z2, _AVX_MUL(a1_2, h_3_1));
     475             :         register __AVX_DATATYPE y2 = _AVX_ADD(a2_2, _AVX_MUL(a1_2, h_2_1));
     476             : #endif
     477     1945600 :         register __AVX_DATATYPE x2 = a1_2;
     478             : 
     479             :         __AVX_DATATYPE q1;
     480             :         __AVX_DATATYPE q2;
     481             : 
     482             :         __AVX_DATATYPE h1;
     483             :         __AVX_DATATYPE h2;
     484             :         __AVX_DATATYPE h3;
     485             :         __AVX_DATATYPE h4;
     486             :         __AVX_DATATYPE h5;
     487             :         __AVX_DATATYPE h6;
     488             : 
      489   114790400 :         for (i = 6; i < nb; i++)
     490             :         {
     491   225689600 :                 h1 = _AVX_BROADCAST(&hh[i-5]);
     492   225689600 :                 q1 = _AVX_LOAD(&q[i*ldq]);
     493   225689600 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     494             : #ifdef __ELPA_USE_FMA__
     495   112844800 :                 x1 = _AVX_FMA(q1, h1, x1);
     496   112844800 :                 x2 = _AVX_FMA(q2, h1, x2);
     497             : #else
     498             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     499             :                 x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     500             : #endif
     501   225689600 :                 h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
     502             : #ifdef __ELPA_USE_FMA__
     503   112844800 :                 y1 = _AVX_FMA(q1, h2, y1);
     504   112844800 :                 y2 = _AVX_FMA(q2, h2, y2);
     505             : #else
     506             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
     507             :                 y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
     508             : #endif
     509   225689600 :                 h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
     510             : #ifdef __ELPA_USE_FMA__
     511   112844800 :                 z1 = _AVX_FMA(q1, h3, z1);
     512   112844800 :                 z2 = _AVX_FMA(q2, h3, z2);
     513             : #else
     514             :                 z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
     515             :                 z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
     516             : #endif
     517   225689600 :                 h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
     518             : #ifdef __ELPA_USE_FMA__
     519   112844800 :                 w1 = _AVX_FMA(q1, h4, w1);
     520   112844800 :                 w2 = _AVX_FMA(q2, h4, w2);
     521             : #else
     522             :                 w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
     523             :                 w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
     524             : #endif
     525   225689600 :                 h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
     526             : #ifdef __ELPA_USE_FMA__
     527   112844800 :                 v1 = _AVX_FMA(q1, h5, v1);
     528   112844800 :                 v2 = _AVX_FMA(q2, h5, v2);
     529             : #else
     530             :                 v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
     531             :                 v2 = _AVX_ADD(v2, _AVX_MUL(q2,h5));
     532             : #endif
     533   225689600 :                 h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
     534             : #ifdef __ELPA_USE_FMA__
     535   112844800 :                 t1 = _AVX_FMA(q1, h6, t1);
     536   112844800 :                 t2 = _AVX_FMA(q2, h6, t2);
     537             : #else
     538             :                 t1 = _AVX_ADD(t1, _AVX_MUL(q1,h6));
     539             :                 t2 = _AVX_ADD(t2, _AVX_MUL(q2,h6));
     540             : #endif
     541             :         }
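                      : // At this point x, y, z, w, v and t hold, per SIMD lane, the running dot
                      : // products of q with householder vectors 1..6 (vector k pairs with
                      : // hh[(k-1)*ldh + i-(6-k)]); the tail rows nb..nb+4 below add the remaining
                      : // terms, with one vector fewer overlapping at each successive row.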
     542             : 
     543     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-5]);
     544     3891200 :         q1 = _AVX_LOAD(&q[nb*ldq]);
     545     3891200 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     546             : #ifdef __ELPA_USE_FMA__
     547     1945600 :         x1 = _AVX_FMA(q1, h1, x1);
     548     1945600 :         x2 = _AVX_FMA(q2, h1, x2);
     549             : #else
     550             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     551             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     552             : #endif
     553     3891200 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
     554             : #ifdef __ELPA_USE_FMA__
     555     1945600 :         y1 = _AVX_FMA(q1, h2, y1);
     556     1945600 :         y2 = _AVX_FMA(q2, h2, y2);
     557             : #else
     558             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
     559             :         y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
     560             : #endif
     561     3891200 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
     562             : #ifdef __ELPA_USE_FMA__
     563     1945600 :         z1 = _AVX_FMA(q1, h3, z1);
     564     1945600 :         z2 = _AVX_FMA(q2, h3, z2);
     565             : #else
     566             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
     567             :         z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
     568             : #endif
     569     3891200 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
     570             : #ifdef __ELPA_USE_FMA__
     571     1945600 :         w1 = _AVX_FMA(q1, h4, w1);
     572     1945600 :         w2 = _AVX_FMA(q2, h4, w2);
     573             : #else
     574             :         w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
     575             :         w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
     576             : #endif
     577     3891200 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
     578             : #ifdef __ELPA_USE_FMA__
     579     1945600 :         v1 = _AVX_FMA(q1, h5, v1);
     580     1945600 :         v2 = _AVX_FMA(q2, h5, v2);
     581             : #else
     582             :         v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
     583             :         v2 = _AVX_ADD(v2, _AVX_MUL(q2,h5));
     584             : #endif
     585             : 
     586     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-4]);
     587     3891200 :         q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
     588     3891200 :         q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
     589             : #ifdef __ELPA_USE_FMA__
     590     1945600 :         x1 = _AVX_FMA(q1, h1, x1);
     591     1945600 :         x2 = _AVX_FMA(q2, h1, x2);
     592             : #else
     593             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     594             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     595             : #endif
     596     3891200 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
     597             : #ifdef __ELPA_USE_FMA__
     598     1945600 :         y1 = _AVX_FMA(q1, h2, y1);
     599     1945600 :         y2 = _AVX_FMA(q2, h2, y2);
     600             : #else
     601             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
     602             :         y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
     603             : #endif
     604     3891200 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
     605             : #ifdef __ELPA_USE_FMA__
     606     1945600 :         z1 = _AVX_FMA(q1, h3, z1);
     607     1945600 :         z2 = _AVX_FMA(q2, h3, z2);
     608             : #else
     609             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
     610             :         z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
     611             : #endif
     612     3891200 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
     613             : #ifdef __ELPA_USE_FMA__
     614     1945600 :         w1 = _AVX_FMA(q1, h4, w1);
     615     1945600 :         w2 = _AVX_FMA(q2, h4, w2);
     616             : #else
     617             :         w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
     618             :         w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
     619             : #endif
     620             : 
     621     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-3]);
     622     3891200 :         q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
     623     3891200 :         q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
     624             : #ifdef __ELPA_USE_FMA__
     625     1945600 :         x1 = _AVX_FMA(q1, h1, x1);
     626     1945600 :         x2 = _AVX_FMA(q2, h1, x2);
     627             : #else
     628             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     629             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     630             : #endif
     631     3891200 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
     632             : #ifdef __ELPA_USE_FMA__
     633     1945600 :         y1 = _AVX_FMA(q1, h2, y1);
     634     1945600 :         y2 = _AVX_FMA(q2, h2, y2);
     635             : #else
     636             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
     637             :         y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
     638             : #endif
     639     3891200 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
     640             : #ifdef __ELPA_USE_FMA__
     641     1945600 :         z1 = _AVX_FMA(q1, h3, z1);
     642     1945600 :         z2 = _AVX_FMA(q2, h3, z2);
     643             : #else
     644             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
     645             :         z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
     646             : #endif
     647             : 
     648     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-2]);
     649     3891200 :         q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
     650     3891200 :         q2 = _AVX_LOAD(&q[((nb+3)*ldq)+offset]);
     651             : #ifdef __ELPA_USE_FMA__
     652     1945600 :         x1 = _AVX_FMA(q1, h1, x1);
     653     1945600 :         x2 = _AVX_FMA(q2, h1, x2);
     654             : #else
     655             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     656             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     657             : #endif
     658     3891200 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
     659             : #ifdef __ELPA_USE_FMA__
     660     1945600 :         y1 = _AVX_FMA(q1, h2, y1);
     661     1945600 :         y2 = _AVX_FMA(q2, h2, y2);
     662             : #else
     663             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
     664             :         y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
     665             : #endif
     666             : 
     667     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
     668     3891200 :         q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
     669     3891200 :         q2 = _AVX_LOAD(&q[((nb+4)*ldq)+offset]);
     670             : #ifdef __ELPA_USE_FMA__
     671     1945600 :         x1 = _AVX_FMA(q1, h1, x1);
     672     1945600 :         x2 = _AVX_FMA(q2, h1, x2);
     673             : #else
     674             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
     675             :         x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
     676             : #endif
     677             : 
     678             :         /////////////////////////////////////////////////////
     679             :         // Apply tau, correct wrong calculation using pre-calculated scalar products
     680             :         /////////////////////////////////////////////////////
     681             : 
     682     1945600 :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
     683     1945600 :         x1 = _AVX_MUL(x1, tau1);
     684     1945600 :         x2 = _AVX_MUL(x2, tau1);
     685             : 
     686     3891200 :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
     687     1945600 :         __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&scalarprods[0]);
     688     1945600 :         h2 = _AVX_MUL(tau2, vs_1_2);
     689             : #ifdef __ELPA_USE_FMA__
     690     3891200 :         y1 = _AVX_FMSUB(y1, tau2, _AVX_MUL(x1,h2));
     691     3891200 :         y2 = _AVX_FMSUB(y2, tau2, _AVX_MUL(x2,h2));
     692             : #else
     693             :         y1 = _AVX_SUB(_AVX_MUL(y1,tau2), _AVX_MUL(x1,h2));
     694             :         y2 = _AVX_SUB(_AVX_MUL(y2,tau2), _AVX_MUL(x2,h2));
     695             : #endif
     696             : 
     697     3891200 :         __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
     698     3891200 :         __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&scalarprods[1]);
     699     3891200 :         __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&scalarprods[2]);
     700     1945600 :         h2 = _AVX_MUL(tau3, vs_1_3);
     701     1945600 :         h3 = _AVX_MUL(tau3, vs_2_3);
     702             : #ifdef __ELPA_USE_FMA__
     703     5836800 :         z1 = _AVX_FMSUB(z1, tau3, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
     704     5836800 :         z2 = _AVX_FMSUB(z2, tau3, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)));
     705             : #else
     706             :         z1 = _AVX_SUB(_AVX_MUL(z1,tau3), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
     707             :         z2 = _AVX_SUB(_AVX_MUL(z2,tau3), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)));
     708             : #endif
     709             : 
     710     3891200 :         __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
     711     3891200 :         __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&scalarprods[3]);
     712     3891200 :         __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&scalarprods[4]);
     713     1945600 :         h2 = _AVX_MUL(tau4, vs_1_4);
     714     1945600 :         h3 = _AVX_MUL(tau4, vs_2_4);
     715     3891200 :         __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&scalarprods[5]);
     716     1945600 :         h4 = _AVX_MUL(tau4, vs_3_4);
     717             : #ifdef __ELPA_USE_FMA__
     718     7782400 :         w1 = _AVX_FMSUB(w1, tau4, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
     719     7782400 :         w2 = _AVX_FMSUB(w2, tau4, _AVX_FMA(z2, h4, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
     720             : #else
     721             :         w1 = _AVX_SUB(_AVX_MUL(w1,tau4), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
     722             :         w2 = _AVX_SUB(_AVX_MUL(w2,tau4), _AVX_ADD(_AVX_MUL(z2,h4), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
     723             : #endif
     724             : 
     725     3891200 :         __AVX_DATATYPE tau5 = _AVX_BROADCAST(&hh[ldh*4]);
     726     3891200 :         __AVX_DATATYPE vs_1_5 = _AVX_BROADCAST(&scalarprods[6]);
     727     3891200 :         __AVX_DATATYPE vs_2_5 = _AVX_BROADCAST(&scalarprods[7]);
     728     1945600 :         h2 = _AVX_MUL(tau5, vs_1_5);
     729     1945600 :         h3 = _AVX_MUL(tau5, vs_2_5);
     730     3891200 :         __AVX_DATATYPE vs_3_5 = _AVX_BROADCAST(&scalarprods[8]);
     731     3891200 :         __AVX_DATATYPE vs_4_5 = _AVX_BROADCAST(&scalarprods[9]);
     732     1945600 :         h4 = _AVX_MUL(tau5, vs_3_5);
     733     1945600 :         h5 = _AVX_MUL(tau5, vs_4_5);
     734             : #ifdef __ELPA_USE_FMA__
     735    11673600 :         v1 = _AVX_FMSUB(v1, tau5, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
     736    11673600 :         v2 = _AVX_FMSUB(v2, tau5, _AVX_ADD(_AVX_FMA(w2, h5, _AVX_MUL(z2,h4)), _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
     737             : #else
     738             :         v1 = _AVX_SUB(_AVX_MUL(v1,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
     739             :         v2 = _AVX_SUB(_AVX_MUL(v2,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w2,h5), _AVX_MUL(z2,h4)), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
     740             : #endif
     741             : 
     742     3891200 :         __AVX_DATATYPE tau6 = _AVX_BROADCAST(&hh[ldh*5]);
     743     3891200 :         __AVX_DATATYPE vs_1_6 = _AVX_BROADCAST(&scalarprods[10]);
     744     3891200 :         __AVX_DATATYPE vs_2_6 = _AVX_BROADCAST(&scalarprods[11]);
     745     1945600 :         h2 = _AVX_MUL(tau6, vs_1_6);
     746     1945600 :         h3 = _AVX_MUL(tau6, vs_2_6);
     747     3891200 :         __AVX_DATATYPE vs_3_6 = _AVX_BROADCAST(&scalarprods[12]);
     748     3891200 :         __AVX_DATATYPE vs_4_6 = _AVX_BROADCAST(&scalarprods[13]);
     749     3891200 :         __AVX_DATATYPE vs_5_6 = _AVX_BROADCAST(&scalarprods[14]);
     750     1945600 :         h4 = _AVX_MUL(tau6, vs_3_6);
     751     1945600 :         h5 = _AVX_MUL(tau6, vs_4_6);
     752     1945600 :         h6 = _AVX_MUL(tau6, vs_5_6);
     753             : #ifdef __ELPA_USE_FMA__
     754    13619200 :         t1 = _AVX_FMSUB(t1, tau6, _AVX_FMA(v1, h6, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)))));
     755    13619200 :         t2 = _AVX_FMSUB(t2, tau6, _AVX_FMA(v2, h6, _AVX_ADD(_AVX_FMA(w2, h5, _AVX_MUL(z2,h4)), _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)))));
     756             : #else
     757             :         t1 = _AVX_SUB(_AVX_MUL(t1,tau6), _AVX_ADD( _AVX_MUL(v1,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)))));
     758             :         t2 = _AVX_SUB(_AVX_MUL(t2,tau6), _AVX_ADD( _AVX_MUL(v2,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w2,h5), _AVX_MUL(z2,h4)), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)))));
     759             : #endif
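                      : // The tau/vs corrections above fold the triangular coupling between the six
                      : // vectors into the projections: projection k becomes tau_k * (raw value
                      : // minus the s(j,k)-weighted, already-corrected projections for j < k) --
                      : // essentially the T factor of a compact-WY representation -- so the rank-6
                      : // update below applies all six reflectors at once.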
     760             : 
     761             :         /////////////////////////////////////////////////////
      762             :         // Rank-6 update of Q [8 x nb+5]
     763             :         /////////////////////////////////////////////////////
     764             : 
     765     1945600 :         q1 = _AVX_LOAD(&q[0]);
     766     3891200 :         q2 = _AVX_LOAD(&q[offset]);
     767     1945600 :         q1 = _AVX_SUB(q1, t1);
     768     1945600 :         q2 = _AVX_SUB(q2, t2);
     769             :         _AVX_STORE(&q[0],q1);
     770     1945600 :         _AVX_STORE(&q[offset],q2);
     771             : 
     772     3891200 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
     773     3891200 :         q1 = _AVX_LOAD(&q[ldq]);
     774     3891200 :         q2 = _AVX_LOAD(&q[(ldq+offset)]);
     775     1945600 :         q1 = _AVX_SUB(q1, v1);
     776     1945600 :         q2 = _AVX_SUB(q2, v2);
     777             : #ifdef __ELPA_USE_FMA__
     778     1945600 :         q1 = _AVX_NFMA(t1, h6, q1);
     779     1945600 :         q2 = _AVX_NFMA(t2, h6, q2);
     780             : #else
     781             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
     782             :         q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
     783             : #endif
     784     1945600 :         _AVX_STORE(&q[ldq],q1);
     785     1945600 :         _AVX_STORE(&q[(ldq+offset)],q2);
     786             : 
     787     3891200 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
     788     3891200 :         q1 = _AVX_LOAD(&q[ldq*2]);
     789     3891200 :         q2 = _AVX_LOAD(&q[(ldq*2)+offset]);
     790     1945600 :         q1 = _AVX_SUB(q1, w1);
     791     1945600 :         q2 = _AVX_SUB(q2, w2);
     792             : #ifdef __ELPA_USE_FMA__
     793     1945600 :         q1 = _AVX_NFMA(v1, h5, q1);
     794     1945600 :         q2 = _AVX_NFMA(v2, h5, q2);
     795             : #else
     796             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
     797             :         q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
     798             : #endif
     799     3891200 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
     800             : #ifdef __ELPA_USE_FMA__
     801     1945600 :         q1 = _AVX_NFMA(t1, h6, q1);
     802     1945600 :         q2 = _AVX_NFMA(t2, h6, q2);
     803             : #else
     804             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
     805             :         q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
     806             : #endif
     807     1945600 :         _AVX_STORE(&q[ldq*2],q1);
     808     1945600 :         _AVX_STORE(&q[(ldq*2)+offset],q2);
     809             : 
     810     3891200 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
     811     3891200 :         q1 = _AVX_LOAD(&q[ldq*3]);
     812     3891200 :         q2 = _AVX_LOAD(&q[(ldq*3)+offset]);
     813     1945600 :         q1 = _AVX_SUB(q1, z1);
     814     1945600 :         q2 = _AVX_SUB(q2, z2);
     815             : #ifdef __ELPA_USE_FMA__
     816     1945600 :         q1 = _AVX_NFMA(w1, h4, q1);
     817     1945600 :         q2 = _AVX_NFMA(w2, h4, q2);
     818             : #else
     819             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
     820             :         q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
     821             : #endif
     822     3891200 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
     823             : #ifdef __ELPA_USE_FMA__
     824     1945600 :         q1 = _AVX_NFMA(v1, h5, q1);
     825     1945600 :         q2 = _AVX_NFMA(v2, h5, q2);
     826             : #else
     827             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
     828             :         q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
     829             : #endif
     830     3891200 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
     831             : #ifdef __ELPA_USE_FMA__
     832     1945600 :         q1 = _AVX_NFMA(t1, h6, q1);
     833     1945600 :         q2 = _AVX_NFMA(t2, h6, q2);
     834             : #else
     835             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
     836             :         q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
     837             : #endif
     838     1945600 :         _AVX_STORE(&q[ldq*3],q1);
     839     1945600 :         _AVX_STORE(&q[(ldq*3)+offset],q2);
     840             : 
     841     3891200 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
     842     3891200 :         q1 = _AVX_LOAD(&q[ldq*4]);
     843     3891200 :         q2 = _AVX_LOAD(&q[(ldq*4)+offset]);
     844     1945600 :         q1 = _AVX_SUB(q1, y1);
     845     1945600 :         q2 = _AVX_SUB(q2, y2);
     846             : #ifdef __ELPA_USE_FMA__
     847     1945600 :         q1 = _AVX_NFMA(z1, h3, q1);
     848     1945600 :         q2 = _AVX_NFMA(z2, h3, q2);
     849             : #else
     850             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
     851             :         q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
     852             : #endif
     853     3891200 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
     854             : #ifdef __ELPA_USE_FMA__
     855     1945600 :         q1 = _AVX_NFMA(w1, h4, q1);
     856     1945600 :         q2 = _AVX_NFMA(w2, h4, q2);
     857             : #else
     858             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
     859             :         q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
     860             : #endif
     861     3891200 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
     862             : #ifdef __ELPA_USE_FMA__
     863     1945600 :         q1 = _AVX_NFMA(v1, h5, q1);
     864     1945600 :         q2 = _AVX_NFMA(v2, h5, q2);
     865             : #else
     866             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
     867             :         q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
     868             : #endif
     869     3891200 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
     870             : #ifdef __ELPA_USE_FMA__
     871     1945600 :         q1 = _AVX_NFMA(t1, h6, q1);
     872     1945600 :         q2 = _AVX_NFMA(t2, h6, q2);
     873             : #else
     874             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
     875             :         q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
     876             : #endif
     877     1945600 :         _AVX_STORE(&q[ldq*4],q1);
     878     1945600 :         _AVX_STORE(&q[(ldq*4)+offset],q2);
     879             : 
     880     3891200 :         h2 = _AVX_BROADCAST(&hh[(ldh)+1]);
     881     3891200 :         q1 = _AVX_LOAD(&q[ldq*5]);
     882     3891200 :         q2 = _AVX_LOAD(&q[(ldq*5)+offset]);
     883     1945600 :         q1 = _AVX_SUB(q1, x1);
     884     1945600 :         q2 = _AVX_SUB(q2, x2);
     885             : #ifdef __ELPA_USE_FMA__
     886     1945600 :         q1 = _AVX_NFMA(y1, h2, q1);
     887     1945600 :         q2 = _AVX_NFMA(y2, h2, q2);
     888             : #else
     889             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
     890             :         q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
     891             : #endif
     892     3891200 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
     893             : #ifdef __ELPA_USE_FMA__
     894     1945600 :         q1 = _AVX_NFMA(z1, h3, q1);
     895     1945600 :         q2 = _AVX_NFMA(z2, h3, q2);
     896             : #else
     897             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
     898             :         q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
     899             : #endif
     900     3891200 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
     901             : #ifdef __ELPA_USE_FMA__
     902     1945600 :         q1 = _AVX_NFMA(w1, h4, q1);
     903     1945600 :         q2 = _AVX_NFMA(w2, h4, q2);
     904             : #else
     905             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
     906             :         q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
     907             : #endif
     908     3891200 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
     909             : #ifdef __ELPA_USE_FMA__
     910     1945600 :         q1 = _AVX_NFMA(v1, h5, q1);
     911     1945600 :         q2 = _AVX_NFMA(v2, h5, q2);
     912             : #else
     913             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
     914             :         q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
     915             : #endif
     916     3891200 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
     917             : #ifdef __ELPA_USE_FMA__
     918     1945600 :         q1 = _AVX_NFMA(t1, h6, q1);
     919     1945600 :         q2 = _AVX_NFMA(t2, h6, q2);
     920             : #else
     921             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
     922             :         q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
     923             : #endif
     924     1945600 :         _AVX_STORE(&q[ldq*5],q1);
     925     1945600 :         _AVX_STORE(&q[(ldq*5)+offset],q2);
     926             : 
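                      : // Steady state: each remaining row i of q receives all six contributions,
                      : // q(i,:) -= x*hh[i-5] + y*hh[ldh+i-4] + z*hh[2*ldh+i-3]
                      : //           + w*hh[3*ldh+i-2] + v*hh[4*ldh+i-1] + t*hh[5*ldh+i].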
     927   114790400 :         for (i = 6; i < nb; i++)
     928             :         {
     929   225689600 :                 q1 = _AVX_LOAD(&q[i*ldq]);
     930   225689600 :                 q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
     931   225689600 :                 h1 = _AVX_BROADCAST(&hh[i-5]);
     932             : #ifdef __ELPA_USE_FMA__
     933   112844800 :                 q1 = _AVX_NFMA(x1, h1, q1);
     934   112844800 :                 q2 = _AVX_NFMA(x2, h1, q2);
     935             : #else
     936             :                 q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
     937             :                 q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
     938             : #endif
     939   225689600 :                 h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
     940             : #ifdef __ELPA_USE_FMA__
     941   112844800 :                 q1 = _AVX_NFMA(y1, h2, q1);
     942   112844800 :                 q2 = _AVX_NFMA(y2, h2, q2);
     943             : #else
     944             :                 q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
     945             :                 q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
     946             : #endif
     947   225689600 :                 h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
     948             : #ifdef __ELPA_USE_FMA__
     949   112844800 :                 q1 = _AVX_NFMA(z1, h3, q1);
     950   112844800 :                 q2 = _AVX_NFMA(z2, h3, q2);
     951             : #else
     952             :                 q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
     953             :                 q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
     954             : #endif
     955   225689600 :                 h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
     956             : #ifdef __ELPA_USE_FMA__
     957   112844800 :                 q1 = _AVX_NFMA(w1, h4, q1);
     958   112844800 :                 q2 = _AVX_NFMA(w2, h4, q2);
     959             : #else
     960             :                 q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
     961             :                 q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
     962             : #endif
     963   225689600 :                 h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
     964             : #ifdef __ELPA_USE_FMA__
     965   112844800 :                 q1 = _AVX_NFMA(v1, h5, q1);
     966   112844800 :                 q2 = _AVX_NFMA(v2, h5, q2);
     967             : #else
     968             :                 q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
     969             :                 q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
     970             : #endif
     971   225689600 :                 h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
     972             : #ifdef __ELPA_USE_FMA__
     973   112844800 :                 q1 = _AVX_NFMA(t1, h6, q1);
     974   112844800 :                 q2 = _AVX_NFMA(t2, h6, q2);
     975             : #else
     976             :                 q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
     977             :                 q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
     978             : #endif
     979   112844800 :                 _AVX_STORE(&q[i*ldq],q1);
     980   112844800 :                 _AVX_STORE(&q[(i*ldq)+offset],q2);
     981             :         }
     982             : 
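/* Editor's note: a minimal scalar sketch (double shown; single precision is
 * analogous) of what one iteration of the loop above computes. nw is the
 * number of lanes covered by the q1/q2 register pair (8 doubles or 16 singles
 * in this kernel); the helper name and signature are illustrative only and
 * not part of ELPA. Kept inside #if 0 on purpose, as a reference. */
#if 0
static void update_column_scalar(double *q, const double *hh,
                                 const double *x, const double *y,
                                 const double *z, const double *w,
                                 const double *v, const double *t,
                                 int i, int ldq, int ldh, int nw)
{
        /* subtract all six scaled update vectors from column i of q */
        for (int j = 0; j < nw; j++)
                q[(i*ldq)+j] -= x[j]*hh[i-5]
                              + y[j]*hh[ldh+i-4]
                              + z[j]*hh[(ldh*2)+i-3]
                              + w[j]*hh[(ldh*3)+i-2]
                              + v[j]*hh[(ldh*4)+i-1]
                              + t[j]*hh[(ldh*5)+i];
}
#endif
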
     983     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-5]);
     984     3891200 :         q1 = _AVX_LOAD(&q[nb*ldq]);
     985     3891200 :         q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
     986             : #ifdef __ELPA_USE_FMA__
     987     1945600 :         q1 = _AVX_NFMA(x1, h1, q1);
     988     1945600 :         q2 = _AVX_NFMA(x2, h1, q2);
     989             : #else
     990             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
     991             :         q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
     992             : #endif
     993     3891200 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
     994             : #ifdef __ELPA_USE_FMA__
     995     1945600 :         q1 = _AVX_NFMA(y1, h2, q1);
     996     1945600 :         q2 = _AVX_NFMA(y2, h2, q2);
     997             : #else
     998             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
     999             :         q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
    1000             : #endif
    1001     3891200 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
    1002             : #ifdef __ELPA_USE_FMA__
    1003     1945600 :         q1 = _AVX_NFMA(z1, h3, q1);
    1004     1945600 :         q2 = _AVX_NFMA(z2, h3, q2);
    1005             : #else
    1006             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1007             :         q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
    1008             : #endif
    1009     3891200 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
    1010             : #ifdef __ELPA_USE_FMA__
    1011     1945600 :         q1 = _AVX_NFMA(w1, h4, q1);
    1012     1945600 :         q2 = _AVX_NFMA(w2, h4, q2);
    1013             : #else
    1014             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    1015             :         q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
    1016             : #endif
    1017     3891200 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
    1018             : #ifdef __ELPA_USE_FMA__
    1019     1945600 :         q1 = _AVX_NFMA(v1, h5, q1);
    1020     1945600 :         q2 = _AVX_NFMA(v2, h5, q2);
    1021             : #else
    1022             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    1023             :         q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
    1024             : #endif
    1025     1945600 :         _AVX_STORE(&q[nb*ldq],q1);
    1026     1945600 :         _AVX_STORE(&q[(nb*ldq)+offset],q2);
    1027             : 
    1028     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-4]);
    1029     3891200 :         q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
    1030     3891200 :         q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
    1031             : #ifdef __ELPA_USE_FMA__
    1032     1945600 :         q1 = _AVX_NFMA(x1, h1, q1);
    1033     1945600 :         q2 = _AVX_NFMA(x2, h1, q2);
    1034             : #else
    1035             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1036             :         q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
    1037             : #endif
    1038     3891200 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
    1039             : #ifdef __ELPA_USE_FMA__
    1040     1945600 :         q1 = _AVX_NFMA(y1, h2, q1);
    1041     1945600 :         q2 = _AVX_NFMA(y2, h2, q2);
    1042             : #else
    1043             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1044             :         q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
    1045             : #endif
    1046     3891200 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
    1047             : #ifdef __ELPA_USE_FMA__
    1048     1945600 :         q1 = _AVX_NFMA(z1, h3, q1);
    1049     1945600 :         q2 = _AVX_NFMA(z2, h3, q2);
    1050             : #else
    1051             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1052             :         q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
    1053             : #endif
    1054     3891200 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
    1055             : #ifdef __ELPA_USE_FMA__
    1056     1945600 :         q1 = _AVX_NFMA(w1, h4, q1);
    1057     1945600 :         q2 = _AVX_NFMA(w2, h4, q2);
    1058             : #else
    1059             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    1060             :         q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
    1061             : #endif
    1062     1945600 :         _AVX_STORE(&q[(nb+1)*ldq],q1);
    1063     1945600 :         _AVX_STORE(&q[((nb+1)*ldq)+offset],q2);
    1064             : 
    1065     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-3]);
    1066     3891200 :         q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
    1067     3891200 :         q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
    1068             : #ifdef __ELPA_USE_FMA__
    1069     1945600 :         q1 = _AVX_NFMA(x1, h1, q1);
    1070     1945600 :         q2 = _AVX_NFMA(x2, h1, q2);
    1071             : #else
    1072             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1073             :         q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
    1074             : #endif
    1075     3891200 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
    1076             : #ifdef __ELPA_USE_FMA__
    1077     1945600 :         q1 = _AVX_NFMA(y1, h2, q1);
    1078     1945600 :         q2 = _AVX_NFMA(y2, h2, q2);
    1079             : #else
    1080             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1081             :         q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
    1082             : #endif
    1083     3891200 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
    1084             : #ifdef __ELPA_USE_FMA__
    1085     1945600 :         q1 = _AVX_NFMA(z1, h3, q1);
    1086     1945600 :         q2 = _AVX_NFMA(z2, h3, q2);
    1087             : #else
    1088             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1089             :         q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
    1090             : #endif
    1091     1945600 :         _AVX_STORE(&q[(nb+2)*ldq],q1);
    1092     1945600 :         _AVX_STORE(&q[((nb+2)*ldq)+offset],q2);
    1093             : 
    1094     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-2]);
    1095     3891200 :         q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
    1096     3891200 :         q2 = _AVX_LOAD(&q[((nb+3)*ldq)+offset]);
    1097             : #ifdef __ELPA_USE_FMA__
    1098     1945600 :         q1 = _AVX_NFMA(x1, h1, q1);
    1099     1945600 :         q2 = _AVX_NFMA(x2, h1, q2);
    1100             : #else
    1101             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1102             :         q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
    1103             : #endif
    1104     3891200 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
    1105             : #ifdef __ELPA_USE_FMA__
    1106     1945600 :         q1 = _AVX_NFMA(y1, h2, q1);
    1107     1945600 :         q2 = _AVX_NFMA(y2, h2, q2);
    1108             : #else
    1109             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1110             :         q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
    1111             : #endif
    1112     1945600 :         _AVX_STORE(&q[(nb+3)*ldq],q1);
    1113     1945600 :         _AVX_STORE(&q[((nb+3)*ldq)+offset],q2);
    1114             : 
    1115     3891200 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1116     3891200 :         q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
    1117     3891200 :         q2 = _AVX_LOAD(&q[((nb+4)*ldq)+offset]);
    1118             : #ifdef __ELPA_USE_FMA__
    1119     1945600 :         q1 = _AVX_NFMA(x1, h1, q1);
    1120     1945600 :         q2 = _AVX_NFMA(x2, h1, q2);
    1121             : #else
    1122             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1123             :         q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
    1124             : #endif
    1125     1945600 :         _AVX_STORE(&q[(nb+4)*ldq],q1);
    1126     1945600 :         _AVX_STORE(&q[((nb+4)*ldq)+offset],q2);
    1127             : }
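
/* Note the shape of the epilogue above: the six Householder vectors are
 * staggered by one row each, so columns nb .. nb+4 of q lie outside the
 * support of one more vector at every step. Column nb is still updated with
 * five vectors (x..v), nb+1 with four (x..w), and so on down to nb+4, which
 * sees x alone. */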
    1128             : 
    1129             : /**
    1130             :  * Unrolled kernel that computes
    1131             : #ifdef DOUBLE_PRECISION_REAL
    1132             :  * 4 rows of Q simultaneously, a
    1133             : #endif
    1134             : #ifdef SINGLE_PRECISION_REAL
    1135             :  * 8 rows of Q simultaneously, a
    1136             : #endif
    1137             : 
     1138             :  * matrix-vector product with six Householder
     1139             :  * vectors plus a rank-1 update is performed
    1140             :  */
    1141             : #ifdef DOUBLE_PRECISION_REAL
    1142             : __forceinline void hh_trafo_kernel_4_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
    1143             : #endif
    1144             : #ifdef SINGLE_PRECISION_REAL
    1145             : __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
    1146             : #endif
    1147             : {
    1148             :         /////////////////////////////////////////////////////
     1149             :         // Matrix-vector multiplication, Q [4/8 x nb+5] * hh
     1150             :         // hh contains six householder vectors
    1151             :         /////////////////////////////////////////////////////
    1152             :         int i;
    1153             : 
    1154           0 :         __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*5]);
    1155           0 :         __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*4]);
    1156           0 :         __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq*3]);
    1157           0 :         __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[ldq*2]);
    1158           0 :         __AVX_DATATYPE a5_1 = _AVX_LOAD(&q[ldq]);
    1159           0 :         __AVX_DATATYPE a6_1 = _AVX_LOAD(&q[0]);
    1160             : 
    1161           0 :         __AVX_DATATYPE h_6_5 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
    1162           0 :         __AVX_DATATYPE h_6_4 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
    1163           0 :         __AVX_DATATYPE h_6_3 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
    1164           0 :         __AVX_DATATYPE h_6_2 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
    1165           0 :         __AVX_DATATYPE h_6_1 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
    1166             : #ifdef __ELPA_USE_FMA__
    1167           0 :         register __AVX_DATATYPE t1 = _AVX_FMA(a5_1, h_6_5, a6_1);
    1168           0 :         t1 = _AVX_FMA(a4_1, h_6_4, t1);
    1169           0 :         t1 = _AVX_FMA(a3_1, h_6_3, t1);
    1170           0 :         t1 = _AVX_FMA(a2_1, h_6_2, t1);
    1171           0 :         t1 = _AVX_FMA(a1_1, h_6_1, t1);
    1172             : #else
    1173             :         register __AVX_DATATYPE t1 = _AVX_ADD(a6_1, _AVX_MUL(a5_1, h_6_5));
    1174             :         t1 = _AVX_ADD(t1, _AVX_MUL(a4_1, h_6_4));
    1175             :         t1 = _AVX_ADD(t1, _AVX_MUL(a3_1, h_6_3));
    1176             :         t1 = _AVX_ADD(t1, _AVX_MUL(a2_1, h_6_2));
    1177             :         t1 = _AVX_ADD(t1, _AVX_MUL(a1_1, h_6_1));
    1178             : #endif
    1179           0 :         __AVX_DATATYPE h_5_4 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
    1180           0 :         __AVX_DATATYPE h_5_3 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
    1181           0 :         __AVX_DATATYPE h_5_2 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
    1182           0 :         __AVX_DATATYPE h_5_1 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
    1183             : #ifdef __ELPA_USE_FMA__
    1184           0 :         register __AVX_DATATYPE v1 = _AVX_FMA(a4_1, h_5_4, a5_1);
    1185           0 :         v1 = _AVX_FMA(a3_1, h_5_3, v1);
    1186           0 :         v1 = _AVX_FMA(a2_1, h_5_2, v1);
    1187           0 :         v1 = _AVX_FMA(a1_1, h_5_1, v1);
    1188             : #else
    1189             :         register __AVX_DATATYPE v1 = _AVX_ADD(a5_1, _AVX_MUL(a4_1, h_5_4));
    1190             :         v1 = _AVX_ADD(v1, _AVX_MUL(a3_1, h_5_3));
    1191             :         v1 = _AVX_ADD(v1, _AVX_MUL(a2_1, h_5_2));
    1192             :         v1 = _AVX_ADD(v1, _AVX_MUL(a1_1, h_5_1));
    1193             : #endif
    1194           0 :         __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
    1195           0 :         __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
    1196           0 :         __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
    1197             : #ifdef __ELPA_USE_FMA__
    1198           0 :         register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
    1199           0 :         w1 = _AVX_FMA(a2_1, h_4_2, w1);
    1200           0 :         w1 = _AVX_FMA(a1_1, h_4_1, w1);
    1201             : #else
    1202             :         register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
    1203             :         w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
    1204             :         w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
    1205             : #endif
    1206           0 :         __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
    1207           0 :         __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
    1208           0 :         __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
    1209             : #ifdef __ELPA_USE_FMA__
    1210           0 :         register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
    1211           0 :         z1 = _AVX_FMA(a1_1, h_3_1, z1);
    1212           0 :         register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
    1213             : #else
    1214             :         register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
    1215             :         z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
    1216             :         register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
    1217             : #endif
    1218           0 :         register __AVX_DATATYPE x1 = a1_1;
    1219             : 
    1220             :         __AVX_DATATYPE q1;
    1221             : 
    1222             :         __AVX_DATATYPE h1;
    1223             :         __AVX_DATATYPE h2;
    1224             :         __AVX_DATATYPE h3;
    1225             :         __AVX_DATATYPE h4;
    1226             :         __AVX_DATATYPE h5;
    1227             :         __AVX_DATATYPE h6;
    1228             : 
    1229           0 :         for(i = 6; i < nb; i++)
    1230             :         {
    1231           0 :                 h1 = _AVX_BROADCAST(&hh[i-5]);
    1232           0 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1233             : #ifdef __ELPA_USE_FMA__
    1234           0 :                 x1 = _AVX_FMA(q1, h1, x1);
    1235             : #else
    1236             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1237             : #endif
    1238           0 :                 h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
    1239             : #ifdef __ELPA_USE_FMA__
    1240           0 :                 y1 = _AVX_FMA(q1, h2, y1);
    1241             : #else
    1242             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1243             : #endif
    1244           0 :                 h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
    1245             : #ifdef __ELPA_USE_FMA__
    1246           0 :                 z1 = _AVX_FMA(q1, h3, z1);
    1247             : #else
    1248             :                 z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
    1249             : #endif
    1250           0 :                 h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
    1251             : #ifdef __ELPA_USE_FMA__
    1252           0 :                 w1 = _AVX_FMA(q1, h4, w1);
    1253             : #else
    1254             :                 w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
    1255             : #endif
    1256           0 :                 h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
    1257             : #ifdef __ELPA_USE_FMA__
    1258           0 :                 v1 = _AVX_FMA(q1, h5, v1);
    1259             : #else
    1260             :                 v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
    1261             : #endif
    1262           0 :                 h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
    1263             : #ifdef __ELPA_USE_FMA__
    1264           0 :                 t1 = _AVX_FMA(q1, h6, t1);
    1265             : #else
    1266             :                 t1 = _AVX_ADD(t1, _AVX_MUL(q1,h6));
    1267             : #endif
    1268             :         }
    1269             : 
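/* The loop above accumulates, per lane, the six inner products of column i of
 * q with the six Householder vectors: x1 += q(:,i)*hh[i-5],
 * y1 += q(:,i)*hh[ldh+i-4], ..., t1 += q(:,i)*hh[(ldh*5)+i]. Vector k
 * (k = 1..6) starts at row 6-k, so its leading entries are handled before the
 * loop (the a1_1..a6_1 block) and its trailing entries after it (the five
 * peeled blocks below). */
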
    1270           0 :         h1 = _AVX_BROADCAST(&hh[nb-5]);
    1271           0 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1272             : #ifdef __ELPA_USE_FMA__
    1273           0 :         x1 = _AVX_FMA(q1, h1, x1);
    1274             : #else
    1275             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1276             : #endif
    1277           0 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
    1278             : #ifdef __ELPA_USE_FMA__
    1279           0 :         y1 = _AVX_FMA(q1, h2, y1);
    1280             : #else
    1281             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1282             : #endif
    1283           0 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
    1284             : #ifdef __ELPA_USE_FMA__
    1285           0 :         z1 = _AVX_FMA(q1, h3, z1);
    1286             : #else
    1287             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
    1288             : #endif
    1289           0 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
    1290             : #ifdef __ELPA_USE_FMA__
    1291           0 :         w1 = _AVX_FMA(q1, h4, w1);
    1292             : #else
    1293             :         w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
    1294             : #endif
    1295           0 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
    1296             : #ifdef __ELPA_USE_FMA__
    1297           0 :         v1 = _AVX_FMA(q1, h5, v1);
    1298             : #else
    1299             :         v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
    1300             : #endif
    1301             : 
    1302           0 :         h1 = _AVX_BROADCAST(&hh[nb-4]);
    1303           0 :         q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
    1304             : #ifdef __ELPA_USE_FMA__
    1305           0 :         x1 = _AVX_FMA(q1, h1, x1);
    1306             : #else
    1307             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1308             : #endif
    1309           0 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
    1310             : #ifdef __ELPA_USE_FMA__
    1311           0 :         y1 = _AVX_FMA(q1, h2, y1);
    1312             : #else
    1313             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1314             : #endif
    1315           0 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
    1316             : #ifdef __ELPA_USE_FMA__
    1317           0 :         z1 = _AVX_FMA(q1, h3, z1);
    1318             : #else
    1319             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
    1320             : #endif
    1321           0 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
    1322             : #ifdef __ELPA_USE_FMA__
    1323           0 :         w1 = _AVX_FMA(q1, h4, w1);
    1324             : #else
    1325             :         w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
    1326             : #endif
    1327             : 
    1328           0 :         h1 = _AVX_BROADCAST(&hh[nb-3]);
    1329           0 :         q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
    1330             : #ifdef __ELPA_USE_FMA__
    1331           0 :         x1 = _AVX_FMA(q1, h1, x1);
    1332             : #else
    1333             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1334             : #endif
    1335           0 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
    1336             : #ifdef __ELPA_USE_FMA__
    1337           0 :         y1 = _AVX_FMA(q1, h2, y1);
    1338             : #else
    1339             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1340             : #endif
    1341           0 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
    1342             : #ifdef __ELPA_USE_FMA__
    1343           0 :         z1 = _AVX_FMA(q1, h3, z1);
    1344             : #else
    1345             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
    1346             : #endif
    1347             : 
    1348           0 :         h1 = _AVX_BROADCAST(&hh[nb-2]);
    1349           0 :         q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
    1350             : #ifdef __ELPA_USE_FMA__
    1351           0 :         x1 = _AVX_FMA(q1, h1, x1);
    1352             : #else
    1353             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1354             : #endif
    1355           0 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
    1356             : #ifdef __ELPA_USE_FMA__
    1357           0 :         y1 = _AVX_FMA(q1, h2, y1);
    1358             : #else
    1359             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1360             : #endif
    1361             : 
    1362           0 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1363           0 :         q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
    1364             : #ifdef __ELPA_USE_FMA__
    1365           0 :         x1 = _AVX_FMA(q1, h1, x1);
    1366             : #else
    1367             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1368             : #endif
    1369             : 
    1370             :         /////////////////////////////////////////////////////
     1371             :         // Apply tau and correct for the overlap of the Householder vectors using pre-computed scalar products
    1372             :         /////////////////////////////////////////////////////
    1373             : 
    1374           0 :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
    1375           0 :         x1 = _AVX_MUL(x1, tau1);
    1376             : 
    1377           0 :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
    1378           0 :         __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&scalarprods[0]);
    1379           0 :         h2 = _AVX_MUL(tau2, vs_1_2);
    1380             : #ifdef __ELPA_USE_FMA__
    1381           0 :         y1 = _AVX_FMSUB(y1, tau2, _AVX_MUL(x1,h2));
    1382             : #else
    1383             :         y1 = _AVX_SUB(_AVX_MUL(y1,tau2), _AVX_MUL(x1,h2));
    1384             : #endif
    1385             : 
    1386           0 :         __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
    1387           0 :         __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&scalarprods[1]);
    1388           0 :         __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&scalarprods[2]);
    1389           0 :         h2 = _AVX_MUL(tau3, vs_1_3);
    1390           0 :         h3 = _AVX_MUL(tau3, vs_2_3);
    1391             : #ifdef __ELPA_USE_FMA__
    1392           0 :         z1 = _AVX_FMSUB(z1, tau3, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
    1393             : #else
    1394             :         z1 = _AVX_SUB(_AVX_MUL(z1,tau3), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
    1395             : #endif
    1396             : 
    1397           0 :         __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
    1398           0 :         __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&scalarprods[3]);
    1399           0 :         __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&scalarprods[4]);
    1400           0 :         h2 = _AVX_MUL(tau4, vs_1_4);
    1401           0 :         h3 = _AVX_MUL(tau4, vs_2_4);
    1402           0 :         __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&scalarprods[5]);
    1403           0 :         h4 = _AVX_MUL(tau4, vs_3_4);
    1404             : #ifdef __ELPA_USE_FMA__
    1405           0 :         w1 = _AVX_FMSUB(w1, tau4, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
    1406             : #else
    1407             :         w1 = _AVX_SUB(_AVX_MUL(w1,tau4), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
    1408             : #endif
    1409             : 
    1410           0 :         __AVX_DATATYPE tau5 = _AVX_BROADCAST(&hh[ldh*4]);
    1411           0 :         __AVX_DATATYPE vs_1_5 = _AVX_BROADCAST(&scalarprods[6]);
    1412           0 :         __AVX_DATATYPE vs_2_5 = _AVX_BROADCAST(&scalarprods[7]);
    1413           0 :         h2 = _AVX_MUL(tau5, vs_1_5);
    1414           0 :         h3 = _AVX_MUL(tau5, vs_2_5);
    1415           0 :         __AVX_DATATYPE vs_3_5 = _AVX_BROADCAST(&scalarprods[8]);
    1416           0 :         __AVX_DATATYPE vs_4_5 = _AVX_BROADCAST(&scalarprods[9]);
    1417           0 :         h4 = _AVX_MUL(tau5, vs_3_5);
    1418           0 :         h5 = _AVX_MUL(tau5, vs_4_5);
    1419             : #ifdef __ELPA_USE_FMA__
    1420           0 :         v1 = _AVX_FMSUB(v1, tau5, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
    1421             : #else
    1422             :         v1 = _AVX_SUB(_AVX_MUL(v1,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
    1423             : #endif
    1424             : 
    1425           0 :         __AVX_DATATYPE tau6 = _AVX_BROADCAST(&hh[ldh*5]);
    1426           0 :         __AVX_DATATYPE vs_1_6 = _AVX_BROADCAST(&scalarprods[10]);
    1427           0 :         __AVX_DATATYPE vs_2_6 = _AVX_BROADCAST(&scalarprods[11]);
    1428           0 :         h2 = _AVX_MUL(tau6, vs_1_6);
    1429           0 :         h3 = _AVX_MUL(tau6, vs_2_6);
    1430           0 :         __AVX_DATATYPE vs_3_6 = _AVX_BROADCAST(&scalarprods[12]);
    1431           0 :         __AVX_DATATYPE vs_4_6 = _AVX_BROADCAST(&scalarprods[13]);
    1432           0 :         __AVX_DATATYPE vs_5_6 = _AVX_BROADCAST(&scalarprods[14]);
    1433           0 :         h4 = _AVX_MUL(tau6, vs_3_6);
    1434           0 :         h5 = _AVX_MUL(tau6, vs_4_6);
    1435           0 :         h6 = _AVX_MUL(tau6, vs_5_6);
    1436             : #ifdef __ELPA_USE_FMA__
    1437           0 :         t1 = _AVX_FMSUB(t1, tau6, _AVX_FMA(v1, h6, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)))));
    1438             : #else
    1439             :         t1 = _AVX_SUB(_AVX_MUL(t1,tau6), _AVX_ADD( _AVX_MUL(v1,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)))));
    1440             : #endif
    1441             : 
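/* The tau block above is a forward substitution. Writing s_kl for the
 * pre-computed inner product of Householder vectors k and l (scalarprods
 * stores them column by column: s_12, s_13, s_23, s_14, s_24, s_34, ...),
 * the corrected coefficients are
 *   x' = tau1*x
 *   y' = tau2*(y - s_12*x')
 *   z' = tau3*(z - s_13*x' - s_23*y')
 *   ...
 *   t' = tau6*(t - s_16*x' - s_26*y' - s_36*z' - s_46*w' - s_56*v')
 * which removes the overlap between the six reflections so that the updates
 * below can be applied as independent rank-1 terms. */
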
    1442             :         /////////////////////////////////////////////////////
     1443             :         // Rank-1 update of Q [4/8 x nb+5]
    1444             :         /////////////////////////////////////////////////////
    1445             : 
    1446           0 :         q1 = _AVX_LOAD(&q[0]);
    1447           0 :         q1 = _AVX_SUB(q1, t1);
    1448             :         _AVX_STORE(&q[0],q1);
    1449             : 
    1450           0 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
    1451           0 :         q1 = _AVX_LOAD(&q[ldq]);
    1452           0 :         q1 = _AVX_SUB(q1, v1);
    1453             : #ifdef __ELPA_USE_FMA__
    1454           0 :         q1 = _AVX_NFMA(t1, h6, q1);
    1455             : #else
    1456             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    1457             : #endif
    1458           0 :         _AVX_STORE(&q[ldq],q1);
    1459             : 
    1460           0 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
    1461           0 :         q1 = _AVX_LOAD(&q[ldq*2]);
    1462           0 :         q1 = _AVX_SUB(q1, w1);
    1463             : #ifdef __ELPA_USE_FMA__
    1464           0 :         q1 = _AVX_NFMA(v1, h5, q1);
    1465             : #else
    1466             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    1467             : #endif
    1468           0 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
    1469             : #ifdef __ELPA_USE_FMA__
    1470           0 :         q1 = _AVX_NFMA(t1, h6, q1);
    1471             : #else
    1472             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    1473             : #endif
    1474           0 :         _AVX_STORE(&q[ldq*2],q1);
    1475             : 
    1476           0 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
    1477           0 :         q1 = _AVX_LOAD(&q[ldq*3]);
    1478           0 :         q1 = _AVX_SUB(q1, z1);
    1479             : #ifdef __ELPA_USE_FMA__
    1480           0 :         q1 = _AVX_NFMA(w1, h4, q1);
    1481             : #else
    1482             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    1483             : #endif
    1484           0 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
    1485             : #ifdef __ELPA_USE_FMA__
    1486           0 :         q1 = _AVX_NFMA(v1, h5, q1);
    1487             : #else
    1488             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    1489             : #endif
    1490           0 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
    1491             : #ifdef __ELPA_USE_FMA__
    1492           0 :         q1 = _AVX_NFMA(t1, h6, q1);
    1493             : #else
    1494             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    1495             : #endif
    1496           0 :         _AVX_STORE(&q[ldq*3],q1);
    1497             : 
    1498           0 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
    1499           0 :         q1 = _AVX_LOAD(&q[ldq*4]);
    1500           0 :         q1 = _AVX_SUB(q1, y1);
    1501             : #ifdef __ELPA_USE_FMA__
    1502           0 :         q1 = _AVX_NFMA(z1, h3, q1);
    1503             : #else
    1504             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1505             : #endif
    1506           0 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
    1507             : #ifdef __ELPA_USE_FMA__
    1508           0 :         q1 = _AVX_NFMA(w1, h4, q1);
    1509             : #else
    1510             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    1511             : #endif
    1512           0 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
    1513             : #ifdef __ELPA_USE_FMA__
    1514           0 :         q1 = _AVX_NFMA(v1, h5, q1);
    1515             : #else
    1516             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    1517             : #endif
    1518           0 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
    1519             : #ifdef __ELPA_USE_FMA__
    1520           0 :         q1 = _AVX_NFMA(t1, h6, q1);
    1521             : #else
    1522             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    1523             : #endif
    1524           0 :         _AVX_STORE(&q[ldq*4],q1);
    1525             : 
    1526           0 :         h2 = _AVX_BROADCAST(&hh[(ldh)+1]);
    1527           0 :         q1 = _AVX_LOAD(&q[ldq*5]);
    1528           0 :         q1 = _AVX_SUB(q1, x1);
    1529             : #ifdef __ELPA_USE_FMA__
    1530           0 :         q1 = _AVX_NFMA(y1, h2, q1);
    1531             : #else
    1532             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1533             : #endif
    1534           0 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
    1535             : #ifdef __ELPA_USE_FMA__
    1536           0 :         q1 = _AVX_NFMA(z1, h3, q1);
    1537             : #else
    1538             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1539             : #endif
    1540           0 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
    1541             : #ifdef __ELPA_USE_FMA__
    1542           0 :         q1 = _AVX_NFMA(w1, h4, q1);
    1543             : #else
    1544             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    1545             : #endif
    1546           0 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
    1547             : #ifdef __ELPA_USE_FMA__
    1548           0 :         q1 = _AVX_NFMA(v1, h5, q1);
    1549             : #else
    1550             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    1551             : #endif
    1552           0 :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
    1553             : #ifdef __ELPA_USE_FMA__
    1554           0 :         q1 = _AVX_NFMA(t1, h6, q1);
    1555             : #else
    1556             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    1557             : #endif
    1558           0 :         _AVX_STORE(&q[ldq*5],q1);
    1559             : 
    1560           0 :         for (i = 6; i < nb; i++)
    1561             :         {
    1562           0 :                 q1 = _AVX_LOAD(&q[i*ldq]);
    1563           0 :                 h1 = _AVX_BROADCAST(&hh[i-5]);
    1564             : #ifdef __ELPA_USE_FMA__
    1565           0 :                 q1 = _AVX_NFMA(x1, h1, q1);
    1566             : #else
    1567             :                 q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1568             : #endif
    1569           0 :                 h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
    1570             : #ifdef __ELPA_USE_FMA__
    1571           0 :                 q1 = _AVX_NFMA(y1, h2, q1);
    1572             : #else
    1573             :                 q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1574             : #endif
    1575           0 :                 h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
    1576             : #ifdef __ELPA_USE_FMA__
    1577           0 :                 q1 = _AVX_NFMA(z1, h3, q1);
    1578             : #else
    1579             :                 q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1580             : #endif
    1581           0 :                 h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
    1582             : #ifdef __ELPA_USE_FMA__
    1583           0 :                 q1 = _AVX_NFMA(w1, h4, q1);
    1584             : #else
    1585             :                 q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    1586             : #endif
    1587           0 :                 h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
    1588             : #ifdef __ELPA_USE_FMA__
    1589           0 :                 q1 = _AVX_NFMA(v1, h5, q1);
    1590             : #else
    1591             :                 q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    1592             : #endif
    1593           0 :                 h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
    1594             : #ifdef __ELPA_USE_FMA__
    1595           0 :                 q1 = _AVX_NFMA(t1, h6, q1);
    1596             : #else
    1597             :                 q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    1598             : #endif
    1599           0 :                 _AVX_STORE(&q[i*ldq],q1);
    1600             :         }
    1601             : 
    1602           0 :         h1 = _AVX_BROADCAST(&hh[nb-5]);
    1603           0 :         q1 = _AVX_LOAD(&q[nb*ldq]);
    1604             : #ifdef __ELPA_USE_FMA__
    1605           0 :         q1 = _AVX_NFMA(x1, h1, q1);
    1606             : #else
    1607             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1608             : #endif
    1609           0 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
    1610             : #ifdef __ELPA_USE_FMA__
    1611           0 :         q1 = _AVX_NFMA(y1, h2, q1);
    1612             : #else
    1613             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1614             : #endif
    1615           0 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
    1616             : #ifdef __ELPA_USE_FMA__
    1617           0 :         q1 = _AVX_NFMA(z1, h3, q1);
    1618             : #else
    1619             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1620             : #endif
    1621           0 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
    1622             : #ifdef __ELPA_USE_FMA__
    1623           0 :         q1 = _AVX_NFMA(w1, h4, q1);
    1624             : #else
    1625             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    1626             : #endif
    1627           0 :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
    1628             : #ifdef __ELPA_USE_FMA__
    1629           0 :         q1 = _AVX_NFMA(v1, h5, q1);
    1630             : #else
    1631             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    1632             : #endif
    1633           0 :         _AVX_STORE(&q[nb*ldq],q1);
    1634             : 
    1635           0 :         h1 = _AVX_BROADCAST(&hh[nb-4]);
    1636           0 :         q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
    1637             : #ifdef __ELPA_USE_FMA__
    1638           0 :         q1 = _AVX_NFMA(x1, h1, q1);
    1639             : #else
    1640             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1641             : #endif
    1642           0 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
    1643             : #ifdef __ELPA_USE_FMA__
    1644           0 :         q1 = _AVX_NFMA(y1, h2, q1);
    1645             : #else
    1646             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1647             : #endif
    1648           0 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
    1649             : #ifdef __ELPA_USE_FMA__
    1650           0 :         q1 = _AVX_NFMA(z1, h3, q1);
    1651             : #else
    1652             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1653             : #endif
    1654           0 :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
    1655             : #ifdef __ELPA_USE_FMA__
    1656           0 :         q1 = _AVX_NFMA(w1, h4, q1);
    1657             : #else
    1658             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    1659             : #endif
    1660           0 :         _AVX_STORE(&q[(nb+1)*ldq],q1);
    1661             : 
    1662           0 :         h1 = _AVX_BROADCAST(&hh[nb-3]);
    1663           0 :         q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
    1664             : #ifdef __ELPA_USE_FMA__
    1665           0 :         q1 = _AVX_NFMA(x1, h1, q1);
    1666             : #else
    1667             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1668             : #endif
    1669           0 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
    1670             : #ifdef __ELPA_USE_FMA__
    1671           0 :         q1 = _AVX_NFMA(y1, h2, q1);
    1672             : #else
    1673             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1674             : #endif
    1675           0 :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
    1676             : #ifdef __ELPA_USE_FMA__
    1677           0 :         q1 = _AVX_NFMA(z1, h3, q1);
    1678             : #else
    1679             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    1680             : #endif
    1681           0 :         _AVX_STORE(&q[(nb+2)*ldq],q1);
    1682             : 
    1683           0 :         h1 = _AVX_BROADCAST(&hh[nb-2]);
    1684           0 :         q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
    1685             : #ifdef __ELPA_USE_FMA__
    1686           0 :         q1 = _AVX_NFMA(x1, h1, q1);
    1687             : #else
    1688             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1689             : #endif
    1690           0 :         h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
    1691             : #ifdef __ELPA_USE_FMA__
    1692           0 :         q1 = _AVX_NFMA(y1, h2, q1);
    1693             : #else
    1694             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    1695             : #endif
    1696           0 :         _AVX_STORE(&q[(nb+3)*ldq],q1);
    1697             : 
    1698           0 :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1699           0 :         q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
    1700             : #ifdef __ELPA_USE_FMA__
    1701           0 :         q1 = _AVX_NFMA(x1, h1, q1);
    1702             : #else
    1703             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    1704             : #endif
    1705           0 :         _AVX_STORE(&q[(nb+4)*ldq],q1);
    1706             : }
    1707             : 
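/* Editor's sketch: a plain-C reference (double shown) for the update the
 * kernels above apply to a single lane of q, assuming the same layout (q with
 * leading dimension ldq, hh holding six Householder vectors with leading
 * dimension ldh and an implicit leading 1 each) and coefficients already
 * corrected as in the tau block. The name and signature are hypothetical,
 * not part of the ELPA API; kept inside #if 0 as a reference only. */
#if 0
static void hh_trafo_reference_lane(double *q, const double *hh,
                                    const double *coef, /* x',y',z',w',v',t' */
                                    int nb, int ldq, int ldh, int lane)
{
        for (int k = 0; k < 6; k++)             /* six Householder vectors */
        {
                int first = 5 - k;              /* vector k+1 starts at row 5-k */
                q[(first*ldq)+lane] -= coef[k]; /* implicit leading 1 */
                for (int i = first+1; i <= first+nb-1; i++)
                        q[(i*ldq)+lane] -= coef[k] * hh[(k*ldh)+(i-first)];
        }
}
#endif
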
    1708             : #if 0
    1709             : #ifdef SINGLE_PRECISION_REAL
    1710             : /**
    1711             :  * Unrolled kernel that computes
    1712             :  * 4 rows of Q simultaneously, a
     1713             :  * matrix-vector product with six Householder
     1714             :  * vectors plus a rank-1 update is performed
    1715             :  */
    1716             : __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
    1717             : {
    1718             :         /////////////////////////////////////////////////////
     1719             :         // Matrix-vector multiplication, Q [4 x nb+5] * hh
     1720             :         // hh contains six householder vectors
    1721             :         /////////////////////////////////////////////////////
    1722             :         int i;
    1723             : 
    1724             :         __AVX_DATATYPE a1_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*5]));
    1725             :         __AVX_DATATYPE a2_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*4]));
    1726             :         __AVX_DATATYPE a3_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*3]));
    1727             :         __AVX_DATATYPE a4_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*2]));
    1728             :         __AVX_DATATYPE a5_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq]));
     1729             :         // we only want to load 4 floats, not 8
    1730             :         __AVX_DATATYPE a6_1 = _mm256_castps128_ps256(_mm_load_ps(&q[0]));      // q(1,1) | q(2,1) | q(3,1) | q(4,1)
    1731             : 
    1732             :         __AVX_DATATYPE h_6_5 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
    1733             :         __AVX_DATATYPE h_6_4 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
    1734             :         __AVX_DATATYPE h_6_3 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
    1735             :         __AVX_DATATYPE h_6_2 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
    1736             :         __AVX_DATATYPE h_6_1 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
    1737             : #ifdef __ELPA_USE_FMA__
    1738             :         register __AVX_DATATYPE t1 = _AVX_FMA(a5_1, h_6_5, a6_1);
    1739             :         t1 = _AVX_FMA(a4_1, h_6_4, t1);
    1740             :         t1 = _AVX_FMA(a3_1, h_6_3, t1);
    1741             :         t1 = _AVX_FMA(a2_1, h_6_2, t1);
    1742             :         t1 = _AVX_FMA(a1_1, h_6_1, t1);
    1743             : #else
    1744             :         register __AVX_DATATYPE t1 = _AVX_ADD(a6_1, _AVX_MUL(a5_1, h_6_5));
    1745             :         t1 = _AVX_ADD(t1, _AVX_MUL(a4_1, h_6_4));
    1746             :         t1 = _AVX_ADD(t1, _AVX_MUL(a3_1, h_6_3));
    1747             :         t1 = _AVX_ADD(t1, _AVX_MUL(a2_1, h_6_2));
    1748             :         t1 = _AVX_ADD(t1, _AVX_MUL(a1_1, h_6_1));
    1749             : #endif
    1750             :         __AVX_DATATYPE h_5_4 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
    1751             :         __AVX_DATATYPE h_5_3 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
    1752             :         __AVX_DATATYPE h_5_2 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
    1753             :         __AVX_DATATYPE h_5_1 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
    1754             : #ifdef __ELPA_USE_FMA__
    1755             :         register __AVX_DATATYPE v1 = _AVX_FMA(a4_1, h_5_4, a5_1);
    1756             :         v1 = _AVX_FMA(a3_1, h_5_3, v1);
    1757             :         v1 = _AVX_FMA(a2_1, h_5_2, v1);
    1758             :         v1 = _AVX_FMA(a1_1, h_5_1, v1);
    1759             : #else
    1760             :         register __AVX_DATATYPE v1 = _AVX_ADD(a5_1, _AVX_MUL(a4_1, h_5_4));
    1761             :         v1 = _AVX_ADD(v1, _AVX_MUL(a3_1, h_5_3));
    1762             :         v1 = _AVX_ADD(v1, _AVX_MUL(a2_1, h_5_2));
    1763             :         v1 = _AVX_ADD(v1, _AVX_MUL(a1_1, h_5_1));
    1764             : #endif
    1765             :         __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
    1766             :         __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
    1767             :         __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
    1768             : #ifdef __ELPA_USE_FMA__
    1769             :         register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
    1770             :         w1 = _AVX_FMA(a2_1, h_4_2, w1);
    1771             :         w1 = _AVX_FMA(a1_1, h_4_1, w1);
    1772             : #else
    1773             :         register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
    1774             :         w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
    1775             :         w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
    1776             : #endif
    1777             :         __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
    1778             :         __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
    1779             :         __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
    1780             : #ifdef __ELPA_USE_FMA__
    1781             :         register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
    1782             :         z1 = _AVX_FMA(a1_1, h_3_1, z1);
    1783             :         register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
    1784             : #else
    1785             :         register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
    1786             :         z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
    1787             :         register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
    1788             : #endif
    1789             :         register __AVX_DATATYPE x1 = a1_1;
    1790             : 
    1791             :         __AVX_DATATYPE q1;
    1792             : 
    1793             :         __AVX_DATATYPE h1;
    1794             :         __AVX_DATATYPE h2;
    1795             :         __AVX_DATATYPE h3;
    1796             :         __AVX_DATATYPE h4;
    1797             :         __AVX_DATATYPE h5;
    1798             :         __AVX_DATATYPE h6;
    1799             : 
    1800             :         for(i = 6; i < nb; i++)
    1801             :         {
    1802             :                 h1 = _AVX_BROADCAST(&hh[i-5]);
    1803             :                 q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq]));        // q(1,i) | q(2,i) ... | q(4,i)
    1804             : #ifdef __ELPA_USE_FMA__
    1805             :                 x1 = _AVX_FMA(q1, h1, x1);
    1806             : #else
    1807             :                 x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1808             : #endif
    1809             :                 h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
    1810             : #ifdef __ELPA_USE_FMA__
    1811             :                 y1 = _AVX_FMA(q1, h2, y1);
    1812             : #else
    1813             :                 y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1814             : #endif
    1815             :                 h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
    1816             : #ifdef __ELPA_USE_FMA__
    1817             :                 z1 = _AVX_FMA(q1, h3, z1);
    1818             : #else
    1819             :                 z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
    1820             : #endif
    1821             :                 h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
    1822             : #ifdef __ELPA_USE_FMA__
    1823             :                 w1 = _AVX_FMA(q1, h4, w1);
    1824             : #else
    1825             :                 w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
    1826             : #endif
    1827             :                 h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
    1828             : #ifdef __ELPA_USE_FMA__
    1829             :                 v1 = _AVX_FMA(q1, h5, v1);
    1830             : #else
    1831             :                 v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
    1832             : #endif
    1833             :                 h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
    1834             : #ifdef __ELPA_USE_FMA__
    1835             :                 t1 = _AVX_FMA(q1, h6, t1);
    1836             : #else
    1837             :                 t1 = _AVX_ADD(t1, _AVX_MUL(q1,h6));
    1838             : #endif
    1839             :         }
    1840             : 
    1841             :         h1 = _AVX_BROADCAST(&hh[nb-5]);
    1842             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq]));
    1843             : #ifdef __ELPA_USE_FMA__
    1844             :         x1 = _AVX_FMA(q1, h1, x1);
    1845             : #else
    1846             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1847             : #endif
    1848             :         h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
    1849             : #ifdef __ELPA_USE_FMA__
    1850             :         y1 = _AVX_FMA(q1, h2, y1);
    1851             : #else
    1852             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1853             : #endif
    1854             :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
    1855             : #ifdef __ELPA_USE_FMA__
    1856             :         z1 = _AVX_FMA(q1, h3, z1);
    1857             : #else
    1858             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
    1859             : #endif
    1860             :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
    1861             : #ifdef __ELPA_USE_FMA__
    1862             :         w1 = _AVX_FMA(q1, h4, w1);
    1863             : #else
    1864             :         w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
    1865             : #endif
    1866             :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
    1867             : #ifdef __ELPA_USE_FMA__
    1868             :         v1 = _AVX_FMA(q1, h5, v1);
    1869             : #else
    1870             :         v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
    1871             : #endif
    1872             : 
    1873             :         h1 = _AVX_BROADCAST(&hh[nb-4]);
    1874             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+1)*ldq]));
    1875             : #ifdef __ELPA_USE_FMA__
    1876             :         x1 = _AVX_FMA(q1, h1, x1);
    1877             : #else
    1878             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1879             : #endif
    1880             :         h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
    1881             : #ifdef __ELPA_USE_FMA__
    1882             :         y1 = _AVX_FMA(q1, h2, y1);
    1883             : #else
    1884             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1885             : #endif
    1886             :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
    1887             : #ifdef __ELPA_USE_FMA__
    1888             :         z1 = _AVX_FMA(q1, h3, z1);
    1889             : #else
    1890             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
    1891             : #endif
    1892             :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
    1893             : #ifdef __ELPA_USE_FMA__
    1894             :         w1 = _AVX_FMA(q1, h4, w1);
    1895             : #else
    1896             :         w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
    1897             : #endif
    1898             : 
    1899             :         h1 = _AVX_BROADCAST(&hh[nb-3]);
    1900             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+2)*ldq]));
    1901             : #ifdef __ELPA_USE_FMA__
    1902             :         x1 = _AVX_FMA(q1, h1, x1);
    1903             : #else
    1904             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1905             : #endif
    1906             :         h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
    1907             : #ifdef __ELPA_USE_FMA__
    1908             :         y1 = _AVX_FMA(q1, h2, y1);
    1909             : #else
    1910             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1911             : #endif
    1912             :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
    1913             : #ifdef __ELPA_USE_FMA__
    1914             :         z1 = _AVX_FMA(q1, h3, z1);
    1915             : #else
    1916             :         z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
    1917             : #endif
    1918             : 
    1919             :         h1 = _AVX_BROADCAST(&hh[nb-2]);
    1920             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+3)*ldq]));
    1921             : #ifdef __ELPA_USE_FMA__
    1922             :         x1 = _AVX_FMA(q1, h1, x1);
    1923             : #else
    1924             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1925             : #endif
    1926             :         h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
    1927             : #ifdef __ELPA_USE_FMA__
    1928             :         y1 = _AVX_FMA(q1, h2, y1);
    1929             : #else
    1930             :         y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    1931             : #endif
    1932             : 
    1933             :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    1934             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+4)*ldq]));
    1935             : #ifdef __ELPA_USE_FMA__
    1936             :         x1 = _AVX_FMA(q1, h1, x1);
    1937             : #else
    1938             :         x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    1939             : #endif
    1940             : 
    1941             :         /////////////////////////////////////////////////////
    1942             :         // Apply tau, correct wrong calculation using pre-calculated scalar products
    1943             :         /////////////////////////////////////////////////////
    1944             : 
    1945             :         __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
    1946             :         x1 = _AVX_MUL(x1, tau1);
    1947             : 
    1948             :         __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
    1949             :         __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&scalarprods[0]);
    1950             :         h2 = _AVX_MUL(tau2, vs_1_2);
    1951             : #ifdef __ELPA_USE_FMA__
    1952             :         y1 = _AVX_FMSUB(y1, tau2, _AVX_MUL(x1,h2));
    1953             : #else
    1954             :         y1 = _AVX_SUB(_AVX_MUL(y1,tau2), _AVX_MUL(x1,h2));
    1955             : #endif
    1956             : 
    1957             :         __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
    1958             :         __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&scalarprods[1]);
    1959             :         __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&scalarprods[2]);
    1960             :         h2 = _AVX_MUL(tau3, vs_1_3);
    1961             :         h3 = _AVX_MUL(tau3, vs_2_3);
    1962             : #ifdef __ELPA_USE_FMA__
    1963             :         z1 = _AVX_FMSUB(z1, tau3, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
    1964             : #else
    1965             :         z1 = _AVX_SUB(_AVX_MUL(z1,tau3), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
    1966             : #endif
    1967             : 
    1968             :         __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
    1969             :         __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&scalarprods[3]);
    1970             :         __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&scalarprods[4]);
    1971             :         h2 = _AVX_MUL(tau4, vs_1_4);
    1972             :         h3 = _AVX_MUL(tau4, vs_2_4);
    1973             :         __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&scalarprods[5]);
    1974             :         h4 = _AVX_MUL(tau4, vs_3_4);
    1975             : #ifdef __ELPA_USE_FMA__
    1976             :         w1 = _AVX_FMSUB(w1, tau4, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
    1977             : #else
    1978             :         w1 = _AVX_SUB(_AVX_MUL(w1,tau4), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
    1979             : #endif
    1980             : 
    1981             :         __AVX_DATATYPE tau5 = _AVX_BROADCAST(&hh[ldh*4]);
    1982             :         __AVX_DATATYPE vs_1_5 = _AVX_BROADCAST(&scalarprods[6]);
    1983             :         __AVX_DATATYPE vs_2_5 = _AVX_BROADCAST(&scalarprods[7]);
    1984             :         h2 = _AVX_MUL(tau5, vs_1_5);
    1985             :         h3 = _AVX_MUL(tau5, vs_2_5);
    1986             :         __AVX_DATATYPE vs_3_5 = _AVX_BROADCAST(&scalarprods[8]);
    1987             :         __AVX_DATATYPE vs_4_5 = _AVX_BROADCAST(&scalarprods[9]);
    1988             :         h4 = _AVX_MUL(tau5, vs_3_5);
    1989             :         h5 = _AVX_MUL(tau5, vs_4_5);
    1990             : #ifdef __ELPA_USE_FMA__
    1991             :         v1 = _AVX_FMSUB(v1, tau5, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
    1992             : #else
    1993             :         v1 = _AVX_SUB(_AVX_MUL(v1,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
    1994             : #endif
    1995             : 
    1996             :         __AVX_DATATYPE tau6 = _AVX_BROADCAST(&hh[ldh*5]);
    1997             :         __AVX_DATATYPE vs_1_6 = _AVX_BROADCAST(&scalarprods[10]);
    1998             :         __AVX_DATATYPE vs_2_6 = _AVX_BROADCAST(&scalarprods[11]);
    1999             :         h2 = _AVX_MUL(tau6, vs_1_6);
    2000             :         h3 = _AVX_MUL(tau6, vs_2_6);
    2001             :         __AVX_DATATYPE vs_3_6 = _AVX_BROADCAST(&scalarprods[12]);
    2002             :         __AVX_DATATYPE vs_4_6 = _AVX_BROADCAST(&scalarprods[13]);
    2003             :         __AVX_DATATYPE vs_5_6 = _AVX_BROADCAST(&scalarprods[14]);
    2004             :         h4 = _AVX_MUL(tau6, vs_3_6);
    2005             :         h5 = _AVX_MUL(tau6, vs_4_6);
    2006             :         h6 = _AVX_MUL(tau6, vs_5_6);
    2007             : #ifdef __ELPA_USE_FMA__
    2008             :         t1 = _AVX_FMSUB(t1, tau6, _AVX_FMA(v1, h6, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)))));
    2009             : #else
    2010             :         t1 = _AVX_SUB(_AVX_MUL(t1,tau6), _AVX_ADD( _AVX_MUL(v1,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)))));
    2011             : #endif
    2012             : 
    2013             :         /////////////////////////////////////////////////////
    2014             :         // Rank-1 update of Q [4 x nb+5]
    2015             :         /////////////////////////////////////////////////////
    2016             : 
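                     :         // A minimal scalar sketch of the whole update, per lane j;
                     :         // proj[] = {x, y, z, w, v, t} maps the projections to the six
                     :         // hh columns, and coeff is a hypothetical helper variable
                     :         // (illustration only, not part of this kernel):
                     :         //
                     :         //   for (i = 0; i < nb+5; i++)
                     :         //     for (k = 0; k < 6; k++)
                     :         //     {
                     :         //       if (i < 5-k || i > nb+4-k) continue;   // rows touched by reflector k
                     :         //       // hh[k*ldh] stores tau_k; the leading coefficient is an implicit 1
                     :         //       coeff = (i == 5-k) ? 1.0f : hh[k*ldh + i - (5-k)];
                     :         //       q[i*ldq+j] -= proj[k] * coeff;
                     :         //     }
                     : 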
    2017             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[0]));
    2018             :         q1 = _AVX_SUB(q1, t1);
    2019             :         _mm_store_ps(&q[0], _mm256_castps256_ps128(q1));
    2020             : 
    2021             :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
    2022             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq]));
    2023             :         q1 = _AVX_SUB(q1, v1);
    2024             : #ifdef __ELPA_USE_FMA__
    2025             :         q1 = _AVX_NFMA(t1, h6, q1);
    2026             : #else
    2027             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    2028             : #endif
    2029             :         _mm_store_ps(&q[ldq], _mm256_castps256_ps128(q1));
    2030             : 
    2031             :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
    2032             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*2]));
    2033             :         q1 = _AVX_SUB(q1, w1);
    2034             : #ifdef __ELPA_USE_FMA__
    2035             :         q1 = _AVX_NFMA(v1, h5, q1);
    2036             : #else
    2037             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    2038             : #endif
    2039             :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
    2040             : #ifdef __ELPA_USE_FMA__
    2041             :         q1 = _AVX_NFMA(t1, h6, q1);
    2042             : #else
    2043             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    2044             : #endif
    2045             :         _mm_store_ps(&q[ldq*2], _mm256_castps256_ps128(q1));
    2046             : 
    2047             :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
    2048             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*3]));
    2049             :         q1 = _AVX_SUB(q1, z1);
    2050             : #ifdef __ELPA_USE_FMA__
    2051             :         q1 = _AVX_NFMA(w1, h4, q1);
    2052             : #else
    2053             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    2054             : #endif
    2055             :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
    2056             : #ifdef __ELPA_USE_FMA__
    2057             :         q1 = _AVX_NFMA(v1, h5, q1);
    2058             : #else
    2059             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    2060             : #endif
    2061             :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
    2062             : #ifdef __ELPA_USE_FMA__
    2063             :         q1 = _AVX_NFMA(t1, h6, q1);
    2064             : #else
    2065             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    2066             : #endif
    2067             :         _mm_store_ps(&q[ldq*3], _mm256_castps256_ps128(q1));
    2068             : 
    2069             :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
    2070             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*4]));
    2071             :         q1 = _AVX_SUB(q1, y1);
    2072             : #ifdef __ELPA_USE_FMA__
    2073             :         q1 = _AVX_NFMA(z1, h3, q1);
    2074             : #else
    2075             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    2076             : #endif
    2077             :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
    2078             : #ifdef __ELPA_USE_FMA__
    2079             :         q1 = _AVX_NFMA(w1, h4, q1);
    2080             : #else
    2081             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    2082             : #endif
    2083             :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
    2084             : #ifdef __ELPA_USE_FMA__
    2085             :         q1 = _AVX_NFMA(v1, h5, q1);
    2086             : #else
    2087             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    2088             : #endif
    2089             :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
    2090             : #ifdef __ELPA_USE_FMA__
    2091             :         q1 = _AVX_NFMA(t1, h6, q1);
    2092             : #else
    2093             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    2094             : #endif
    2095             :         _mm_store_ps(&q[ldq*4], _mm256_castps256_ps128(q1));
    2096             : 
    2097             :         h2 = _AVX_BROADCAST(&hh[(ldh)+1]);
    2098             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*5]));
    2099             :         q1 = _AVX_SUB(q1, x1);
    2100             : #ifdef __ELPA_USE_FMA__
    2101             :         q1 = _AVX_NFMA(y1, h2, q1);
    2102             : #else
    2103             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    2104             : #endif
    2105             :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
    2106             : #ifdef __ELPA_USE_FMA__
    2107             :         q1 = _AVX_NFMA(z1, h3, q1);
    2108             : #else
    2109             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    2110             : #endif
    2111             :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
    2112             : #ifdef __ELPA_USE_FMA__
    2113             :         q1 = _AVX_NFMA(w1, h4, q1);
    2114             : #else
    2115             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    2116             : #endif
    2117             :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
    2118             : #ifdef __ELPA_USE_FMA__
    2119             :         q1 = _AVX_NFMA(v1, h5, q1);
    2120             : #else
    2121             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    2122             : #endif
    2123             :         h6 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
    2124             : #ifdef __ELPA_USE_FMA__
    2125             :         q1 = _AVX_NFMA(t1, h6, q1);
    2126             : #else
    2127             :         q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    2128             : #endif
    2129             :         _mm_store_ps(&q[ldq*5], _mm256_castps256_ps128(q1));
    2130             : 
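                     :         // Steady state: rows 6 .. nb-1 of q are touched by all six
                     :         // reflectors at once.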
    2131             :         for (i = 6; i < nb; i++)
    2132             :         {
    2133             :                 q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq]));
    2134             :                 h1 = _AVX_BROADCAST(&hh[i-5]);
    2135             : #ifdef __ELPA_USE_FMA__
    2136             :                 q1 = _AVX_NFMA(x1, h1, q1);
    2137             : #else
    2138             :                 q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    2139             : #endif
    2140             :                 h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
    2141             : #ifdef __ELPA_USE_FMA__
    2142             :                 q1 = _AVX_NFMA(y1, h2, q1);
    2143             : #else
    2144             :                 q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    2145             : #endif
    2146             :                 h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
    2147             : #ifdef __ELPA_USE_FMA__
    2148             :                 q1 = _AVX_NFMA(z1, h3, q1);
    2149             : #else
    2150             :                 q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    2151             : #endif
    2152             :                 h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
    2153             : #ifdef __ELPA_USE_FMA__
    2154             :                 q1 = _AVX_NFMA(w1, h4, q1);
    2155             : #else
    2156             :                 q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    2157             : #endif
    2158             :                 h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
    2159             : #ifdef __ELPA_USE_FMA__
    2160             :                 q1 = _AVX_NFMA(v1, h5, q1);
    2161             : #else
    2162             :                 q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    2163             : #endif
    2164             :                 h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
    2165             : #ifdef __ELPA_USE_FMA__
    2166             :                 q1 = _AVX_NFMA(t1, h6, q1);
    2167             : #else
    2168             :                 q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
    2169             : #endif
    2170             :                 _mm_store_ps(&q[i*ldq], _mm256_castps256_ps128(q1));
    2171             :         }
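                     : 
                     :         // Tail of the update: from row nb on, the reflectors run out
                     :         // one per row, so each of the five remaining rows is touched
                     :         // by one reflector fewer than the one before it.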
    2172             : 
    2173             :         h1 = _AVX_BROADCAST(&hh[nb-5]);
    2174             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq]));
    2175             : #ifdef __ELPA_USE_FMA__
    2176             :         q1 = _AVX_NFMA(x1, h1, q1);
    2177             : #else
    2178             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    2179             : #endif
    2180             :         h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
    2181             : #ifdef __ELPA_USE_FMA__
    2182             :         q1 = _AVX_NFMA(y1, h2, q1);
    2183             : #else
    2184             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    2185             : #endif
    2186             :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
    2187             : #ifdef __ELPA_USE_FMA__
    2188             :         q1 = _AVX_NFMA(z1, h3, q1);
    2189             : #else
    2190             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    2191             : #endif
    2192             :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
    2193             : #ifdef __ELPA_USE_FMA__
    2194             :         q1 = _AVX_NFMA(w1, h4, q1);
    2195             : #else
    2196             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    2197             : #endif
    2198             :         h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
    2199             : #ifdef __ELPA_USE_FMA__
    2200             :         q1 = _AVX_NFMA(v1, h5, q1);
    2201             : #else
    2202             :         q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
    2203             : #endif
    2204             :         _mm_store_ps(&q[nb*ldq], _mm256_castps256_ps128(q1));
    2205             : 
    2206             :         h1 = _AVX_BROADCAST(&hh[nb-4]);
    2207             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+1)*ldq]));
    2208             : #ifdef __ELPA_USE_FMA__
    2209             :         q1 = _AVX_NFMA(x1, h1, q1);
    2210             : #else
    2211             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    2212             : #endif
    2213             :         h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
    2214             : #ifdef __ELPA_USE_FMA__
    2215             :         q1 = _AVX_NFMA(y1, h2, q1);
    2216             : #else
    2217             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    2218             : #endif
    2219             :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
    2220             : #ifdef __ELPA_USE_FMA__
    2221             :         q1 = _AVX_NFMA(z1, h3, q1);
    2222             : #else
    2223             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    2224             : #endif
    2225             :         h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
    2226             : #ifdef __ELPA_USE_FMA__
    2227             :         q1 = _AVX_NFMA(w1, h4, q1);
    2228             : #else
    2229             :         q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
    2230             : #endif
    2231             :         _mm_store_ps(&q[(nb+1)*ldq], _mm256_castps256_ps128(q1));
    2232             : 
    2233             :         h1 = _AVX_BROADCAST(&hh[nb-3]);
    2234             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+2)*ldq]));
    2235             : #ifdef __ELPA_USE_FMA__
    2236             :         q1 = _AVX_NFMA(x1, h1, q1);
    2237             : #else
    2238             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    2239             : #endif
    2240             :         h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
    2241             : #ifdef __ELPA_USE_FMA__
    2242             :         q1 = _AVX_NFMA(y1, h2, q1);
    2243             : #else
    2244             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    2245             : #endif
    2246             :         h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
    2247             : #ifdef __ELPA_USE_FMA__
    2248             :         q1 = _AVX_NFMA(z1, h3, q1);
    2249             : #else
    2250             :         q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
    2251             : #endif
    2252             :         _mm_store_ps(&q[(nb+2)*ldq], _mm256_castps256_ps128(q1));
    2253             : 
    2254             :         h1 = _AVX_BROADCAST(&hh[nb-2]);
    2255             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+3)*ldq]));
    2256             : #ifdef __ELPA_USE_FMA__
    2257             :         q1 = _AVX_NFMA(x1, h1, q1);
    2258             : #else
    2259             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    2260             : #endif
    2261             :         h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
    2262             : #ifdef __ELPA_USE_FMA__
    2263             :         q1 = _AVX_NFMA(y1, h2, q1);
    2264             : #else
    2265             :         q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
    2266             : #endif
    2267             :         _mm_store_ps(&q[(nb+3)*ldq], _mm256_castps256_ps128(q1));
    2268             : 
    2269             :         h1 = _AVX_BROADCAST(&hh[nb-1]);
    2270             :         q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+4)*ldq]));
    2271             : #ifdef __ELPA_USE_FMA__
    2272             :         q1 = _AVX_NFMA(x1, h1, q1);
    2273             : #else
    2274             :         q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
    2275             : #endif
    2276             :         _mm_store_ps(&q[(nb+4)*ldq], _mm256_castps256_ps128(q1));
    2277             : }
    2278             : #endif
    2279             : #endif

Generated by: LCOV version 1.12