LCOV - code coverage report
Current view: top level - src/elpa2/kernels - real_avx512_6hv_template.c (source / functions) Hit Total Coverage
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info Lines: 918 1750 52.5 %
Date: 2018-01-10 09:29:53 Functions: 2 2 100.0 %

          Line data    Source code
       1             : //    This file is part of ELPA.
       2             : //
       3             : //    The ELPA library was originally created by the ELPA consortium,
       4             : //    consisting of the following organizations:
       5             : //
       6             : //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : //      Informatik,
      10             : //    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : //      Schwerpunkt Wissenschaftliches Rechnen ,
      12             : //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : //    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
      14             : //      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      15             : //      and
      16             : //    - IBM Deutschland GmbH
      17             : //
      18             : //    This particular source code file contains additions, changes and
      19             : //    enhancements authored by Intel Corporation which is not part of
      20             : //    the ELPA consortium.
      21             : //
      22             : //    More information can be found here:
      23             : //    http://elpa.mpcdf.mpg.de/
      24             : //
      25             : //    ELPA is free software: you can redistribute it and/or modify
      26             : //    it under the terms of the version 3 of the license of the
      27             : //    GNU Lesser General Public License as published by the Free
      28             : //    Software Foundation.
      29             : //
      30             : //    ELPA is distributed in the hope that it will be useful,
      31             : //    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : //    GNU Lesser General Public License for more details.
      34             : //
      35             : //    You should have received a copy of the GNU Lesser General Public License
      36             : //    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : //
      38             : //    ELPA reflects a substantial effort on the part of the original
      39             : //    ELPA consortium, and we ask you to respect the spirit of the
      40             : //    license that we chose: i.e., please contribute any changes you
      41             : //    may have back to the original ELPA library distribution, and keep
      42             : //    any derivatives of ELPA under the same license that we chose for
      43             : //    the original distribution, the GNU Lesser General Public License.
      44             : //
      45             : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
      46             : // --------------------------------------------------------------------------------------------------
      47             : 
      48             : #include "config-f90.h"
      49             : #include <x86intrin.h>
      50             : #include <stdio.h>
      51             : #include <stdlib.h>
      52             : 
      53             : #define __forceinline __attribute__((always_inline)) static
      54             : 
      55             : #ifdef DOUBLE_PRECISION_REAL
      56             : #define offset 8
      57             : #define __AVX512_DATATYPE __m512d
      58             : #define _AVX512_LOAD _mm512_load_pd
      59             : #define _AVX512_STORE _mm512_store_pd
      60             : #define _AVX512_SET1 _mm512_set1_pd
      61             : #define _AVX512_MUL _mm512_mul_pd
      62             : #define _AVX512_ADD _mm512_add_pd
      63             : #define _AVX512_SUB _mm512_sub_pd
      64             : 
      65             : #ifdef HAVE_AVX512
      66             : 
      67             : #define __ELPA_USE_FMA__
      68             : #define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
      69             : #define _mm512_NFMA_pd(a,b,c) _mm512_fnmadd_pd(a,b,c)
      70             : #define _mm512_FMSUB_pd(a,b,c) _mm512_fmsub_pd(a,b,c)
      71             : 
      72             : #endif
      73             : 
      74             : #define _AVX512_FMA _mm512_FMA_pd
      75             : #define _AVX512_NFMA _mm512_NFMA_pd
      76             : #define _AVX512_FMSUB _mm512_FMSUB_pd
      77             : #endif /* DOUBLE_PRECISION_REAL */
      78             : 
      79             : #ifdef SINGLE_PRECISION_REAL
      80             : #define offset 16
      81             : #define __AVX512_DATATYPE __m512
      82             : #define _AVX512_LOAD _mm512_load_ps
      83             : #define _AVX512_STORE _mm512_store_ps
      84             : #define _AVX512_SET1 _mm512_set1_ps
      85             : #define _AVX512_MUL _mm512_mul_ps
      86             : #define _AVX512_ADD _mm512_add_ps
      87             : #define _AVX512_SUB _mm512_sub_ps
      88             : 
      89             : #ifdef HAVE_AVX512
      90             : 
      91             : #define __ELPA_USE_FMA__
      92             : #define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
      93             : #define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)
      94             : #define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c)
      95             : #endif
      96             : 
      97             : #define _AVX512_FMA _mm512_FMA_ps
      98             : #define _AVX512_NFMA _mm512_NFMA_ps
      99             : #define _AVX512_FMSUB _mm512_FMSUB_ps
     100             : #endif /* SINGLE_PRECISION_REAL */
     101             : 
     102             : 
     103             : 
     104             : //Forward declaration
     105             : #ifdef DOUBLE_PRECISION_REAL
     106             : static void hh_trafo_kernel_8_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
     107             : static void hh_trafo_kernel_16_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
     108             : static void hh_trafo_kernel_24_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
     109             : static void hh_trafo_kernel_32_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
     110             : 
     111             : void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
     112             : #endif
     113             : 
     114             : #ifdef SINGLE_PRECISION_REAL
     115             : static void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
     116             : static void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
     117             : static void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
     118             : static void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
     119             : 
     120             : void hexa_hh_trafo_real_avx512_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
     121             : 
     122             : #endif
     123             : 
     124             : /*
     125             : !f>#if defined(HAVE_AVX512)
     126             : !f> interface
     127             : !f>   subroutine hexa_hh_trafo_real_avx512_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
     128             : !f>                             bind(C, name="hexa_hh_trafo_real_avx512_6hv_double")
     129             : !f>     use, intrinsic :: iso_c_binding
     130             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     131             : !f>     type(c_ptr), value      :: q
     132             : !f>     real(kind=c_double)     :: hh(pnb,6)
     133             : !f>   end subroutine
     134             : !f> end interface
     135             : !f>#endif
     136             : */
     137             : 
     138             : 
     139             : /*
     140             : !f>#if defined(HAVE_AVX512)
     141             : !f> interface
     142             : !f>   subroutine hexa_hh_trafo_real_avx512_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
     143             : !f>                             bind(C, name="hexa_hh_trafo_real_avx512_6hv_single")
     144             : !f>     use, intrinsic :: iso_c_binding
     145             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     146             : !f>     type(c_ptr), value      :: q
     147             : !f>     real(kind=c_float)      :: hh(pnb,6)
     148             : !f>   end subroutine
     149             : !f> end interface
     150             : !f>#endif
     151             : */
     152             : 
     153             : 
     154             : #ifdef DOUBLE_PRECISION_REAL
     155       81920 : void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     156             : #endif
     157             : #ifdef SINGLE_PRECISION_REAL
     158       15360 : void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     159             : #endif
     160             : 
     161             : 
     162             : {
     163             :         int i;
     164       97280 :         int nb = *pnb;
     165       97280 :         int nq = *pldq;
     166       97280 :         int ldq = *pldq;
     167       97280 :         int ldh = *pldh;
     168             :         int worked_on;
     169             : 
     170       97280 :         worked_on = 0;
     171             : 
     172             :         // calculating scalar products to compute
     173             :         // 6 householder vectors simultaneously
     174             : #ifdef DOUBLE_PRECISION_REAL
     175             :         double scalarprods[15];
     176             : #endif
     177             : #ifdef SINGLE_PRECISION_REAL
     178             :         float scalarprods[15];
     179             : #endif
     180             : 
     181       97280 :         scalarprods[0] = hh[(ldh+1)];
     182       97280 :         scalarprods[1] = hh[(ldh*2)+2];
     183       97280 :         scalarprods[2] = hh[(ldh*2)+1];
     184       97280 :         scalarprods[3] = hh[(ldh*3)+3];
     185       97280 :         scalarprods[4] = hh[(ldh*3)+2];
     186       97280 :         scalarprods[5] = hh[(ldh*3)+1];
     187       97280 :         scalarprods[6] = hh[(ldh*4)+4];
     188       97280 :         scalarprods[7] = hh[(ldh*4)+3];
     189       97280 :         scalarprods[8] = hh[(ldh*4)+2];
     190       97280 :         scalarprods[9] = hh[(ldh*4)+1];
     191       97280 :         scalarprods[10] = hh[(ldh*5)+5];
     192       97280 :         scalarprods[11] = hh[(ldh*5)+4];
     193       97280 :         scalarprods[12] = hh[(ldh*5)+3];
     194       97280 :         scalarprods[13] = hh[(ldh*5)+2];
     195       97280 :         scalarprods[14] = hh[(ldh*5)+1];
     196             : 
     197             :         // accumulate the pairwise scalar products of the householder vectors (first iterations unrolled)
     198             :         // loop counter = 2
     199       97280 :         scalarprods[0] += hh[1] * hh[(2+ldh)];
     200       97280 :         scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
     201       97280 :         scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
     202       97280 :         scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
     203       97280 :         scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];
     204             : 
     205             :         // loop counter = 3
     206       97280 :         scalarprods[0] += hh[2] * hh[(3+ldh)];
     207       97280 :         scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
     208       97280 :         scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
     209       97280 :         scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
     210       97280 :         scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
     211             : 
     212       97280 :         scalarprods[1] += hh[1] * hh[3+(ldh*2)];
     213       97280 :         scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
     214       97280 :         scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
     215       97280 :         scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];
     216             : 
     217             :         // loop counter = 4
     218       97280 :         scalarprods[0] += hh[3] * hh[(4+ldh)];
     219       97280 :         scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
     220       97280 :         scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
     221       97280 :         scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
     222       97280 :         scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];
     223             : 
     224       97280 :         scalarprods[1] += hh[2] * hh[4+(ldh*2)];
     225       97280 :         scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
     226       97280 :         scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
     227       97280 :         scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];
     228             : 
     229       97280 :         scalarprods[3] += hh[1] * hh[4+(ldh*3)];
     230       97280 :         scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
     231       97280 :         scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];
     232             : 
     233             :         // loop counter = 5
     234       97280 :         scalarprods[0] += hh[4] * hh[(5+ldh)];
     235       97280 :         scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
     236       97280 :         scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
     237       97280 :         scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
     238       97280 :         scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];
     239             : 
     240       97280 :         scalarprods[1] += hh[3] * hh[5+(ldh*2)];
     241       97280 :         scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
     242       97280 :         scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
     243       97280 :         scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];
     244             : 
     245       97280 :         scalarprods[3] += hh[2] * hh[5+(ldh*3)];
     246       97280 :         scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
     247       97280 :         scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];
     248             : 
     249       97280 :         scalarprods[6] += hh[1] * hh[5+(ldh*4)];
     250       97280 :         scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];
     251             : 
     252             :         #pragma ivdep
     253     5739520 :         for (i = 6; i < nb; i++)
     254             :         {
     255     5642240 :                 scalarprods[0] += hh[i-1] * hh[(i+ldh)];
     256     5642240 :                 scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
     257     5642240 :                 scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
     258     5642240 :                 scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
     259     5642240 :                 scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];
     260             : 
     261     5642240 :                 scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
     262     5642240 :                 scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
     263     5642240 :                 scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
     264     5642240 :                 scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];
     265             : 
     266     5642240 :                 scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
     267     5642240 :                 scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
     268     5642240 :                 scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];
     269             : 
     270     5642240 :                 scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
     271     5642240 :                 scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];
     272             : 
     273     5642240 :                 scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
     274             :         }
     275             : 
     276             : 
     277             :         // Production level kernel calls with padding
     278             : #ifdef DOUBLE_PRECISION_REAL
     279      163840 :         for (i = 0; i < nq-24; i+=32)
     280             :         {
     281       81920 :                 hh_trafo_kernel_32_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
     282       81920 :                 worked_on += 32;
     283             :         }
     284             : #endif
     285             : #ifdef SINGLE_PRECISION_REAL
     286       30720 :         for (i = 0; i < nq-48; i+=64)
     287             :         {
     288       15360 :                 hh_trafo_kernel_64_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
     289       15360 :                 worked_on += 64;
     290             :         }
     291             : #endif
     292       97280 :         if (nq == i)
     293             :         {
     294           0 :                 return;
     295             :         }
     296             : #ifdef DOUBLE_PRECISION_REAL
     297       81920 :         if (nq-i == 24)
     298             :         {
     299           0 :                 hh_trafo_kernel_24_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
     300           0 :                 worked_on += 24;
     301             :         }
     302             : #endif
     303             : 
     304             : #ifdef SINGLE_PRECISION_REAL
     305       15360 :         if (nq-i ==48)
     306             :         {
     307           0 :                 hh_trafo_kernel_48_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
     308           0 :                 worked_on += 48;
     309             :         }
     310             : #endif
     311             : 
     312             : #ifdef DOUBLE_PRECISION_REAL
     313       81920 :         if (nq-i == 16)
     314             :         {
     315           0 :                 hh_trafo_kernel_16_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
     316           0 :                 worked_on += 16;
     317             :         }
     318             : #endif
     319             : 
     320             : #ifdef SINGLE_PRECISION_REAL
     321       15360 :         if (nq-i ==32)
     322             :         {
     323           0 :                 hh_trafo_kernel_32_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
     324           0 :                 worked_on += 32;
     325             :         }
     326             : #endif
     327             : 
     328             : #ifdef DOUBLE_PRECISION_REAL
     329       81920 :         if (nq-i == 8)
     330             :         {
     331       81920 :                 hh_trafo_kernel_8_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
     332       81920 :                 worked_on += 8;
     333             :         }
     334             : #endif
     335             : 
     336             : #ifdef SINGLE_PRECISION_REAL
     337       15360 :         if (nq-i == 16)
     338             :         {
     339       15360 :                 hh_trafo_kernel_16_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
     340       15360 :                 worked_on += 16;
     341             :         }
     342             : #endif
     343             : 
     344             : #ifdef WITH_DEBUG
     345             :         if (worked_on != nq)
     346             :         {
     347             :           printf("ERROR in avx512 kernel\n");
     348             :           abort();
     349             :         }
     350             : #endif
     351             : }
     352             : 
     353             : /**
     354             :  * Unrolled kernel that computes
     355             : #ifdef DOUBLE_PRECISION_REAL
     356             :  * 8 rows of Q simultaneously, a
     357             : #endif
     358             : #ifdef SINGLE_PRECISION_REAL
     359             :  * 16 rows of Q simultaneously, a
     360             : #endif
     361             :  * matrix-vector product with six householder
     362             :  * vectors + a rank 1 update is performed
     363             :  */
     364             : #ifdef DOUBLE_PRECISION_REAL
     365             : __forceinline void hh_trafo_kernel_8_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
     366             : #endif
     367             : #ifdef SINGLE_PRECISION_REAL
     368             : __forceinline void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
     369             : #endif
     370             : 
     371             : {
     372             :         /////////////////////////////////////////////////////
     373             :         // Matrix Vector Multiplication, Q [8 x nb+5] * hh
     374             :         // hh contains six householder vectors
     375             :         /////////////////////////////////////////////////////
     376             :         int i;
     377             : 
     378      194560 :         __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
     379      194560 :         __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
     380      194560 :         __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
     381      194560 :         __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
     382      194560 :         __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
     383       97280 :         __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);
     384             : 
     385      194560 :         __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
     386      194560 :         __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
     387      194560 :         __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
     388      194560 :         __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
     389      194560 :         __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);
     390             : 
     391             : //        register __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
     392       97280 :         __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
     393             : 
     394       97280 :         t1 = _AVX512_FMA(a4_1, h_6_4, t1);
     395       97280 :         t1 = _AVX512_FMA(a3_1, h_6_3, t1);
     396       97280 :         t1 = _AVX512_FMA(a2_1, h_6_2, t1);
     397       97280 :         t1 = _AVX512_FMA(a1_1, h_6_1, t1);
     398             : 
     399      194560 :         __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
     400      194560 :         __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
     401      194560 :         __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
     402      194560 :         __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
     403             : 
     404             : //        register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
     405       97280 :         __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
     406             : 
     407       97280 :         v1 = _AVX512_FMA(a3_1, h_5_3, v1);
     408       97280 :         v1 = _AVX512_FMA(a2_1, h_5_2, v1);
     409       97280 :         v1 = _AVX512_FMA(a1_1, h_5_1, v1);
     410             : 
     411      194560 :         __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
     412      194560 :         __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
     413      194560 :         __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
     414             : 
     415             : //        register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
     416       97280 :         __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
     417             : 
     418       97280 :         w1 = _AVX512_FMA(a2_1, h_4_2, w1);
     419       97280 :         w1 = _AVX512_FMA(a1_1, h_4_1, w1);
     420             : 
     421      194560 :         __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
     422      194560 :         __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
     423      194560 :         __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
     424             : 
     425             : //        register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
     426       97280 :         __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
     427             : 
     428       97280 :         z1 = _AVX512_FMA(a1_1, h_3_1, z1);
     429             : //        register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
     430       97280 :          __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
     431             : 
     432             : //        register __AVX512_DATATYPE x1 = a1_1;
     433       97280 :         __AVX512_DATATYPE x1 = a1_1;
     434             : 
     435             :         __AVX512_DATATYPE q1;
     436             : 
     437             :         __AVX512_DATATYPE h1;
     438             :         __AVX512_DATATYPE h2;
     439             :         __AVX512_DATATYPE h3;
     440             :         __AVX512_DATATYPE h4;
     441             :         __AVX512_DATATYPE h5;
     442             :         __AVX512_DATATYPE h6;
     443             : 
     444     5739520 :         for(i = 6; i < nb; i++)
     445             :         {
     446    11284480 :                 h1 = _AVX512_SET1(hh[i-5]);
     447    11284480 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     448             : 
     449     5642240 :                 x1 = _AVX512_FMA(q1, h1, x1);
     450             : 
     451    11284480 :                 h2 = _AVX512_SET1(hh[ldh+i-4]);
     452             : 
     453     5642240 :                 y1 = _AVX512_FMA(q1, h2, y1);
     454    11284480 :                 h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
     455             : 
     456     5642240 :                 z1 = _AVX512_FMA(q1, h3, z1);
     457             : 
     458    11284480 :                 h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
     459             : 
     460     5642240 :                 w1 = _AVX512_FMA(q1, h4, w1);
     461             : 
     462    11284480 :                 h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
     463             : 
     464     5642240 :                 v1 = _AVX512_FMA(q1, h5, v1);
     465             : 
     466    11284480 :                 h6 = _AVX512_SET1(hh[(ldh*5)+i]);
     467             : 
     468     5642240 :                 t1 = _AVX512_FMA(q1, h6, t1);
     469             :         }
     470             : 
     471      194560 :         h1 = _AVX512_SET1(hh[nb-5]);
     472      194560 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     473             : 
     474       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
     475             : 
     476      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-4]);
     477             : 
     478       97280 :         y1 = _AVX512_FMA(q1, h2, y1);
     479             : 
     480      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
     481             : 
     482       97280 :         z1 = _AVX512_FMA(q1, h3, z1);
     483             : 
     484      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
     485             : 
     486       97280 :         w1 = _AVX512_FMA(q1, h4, w1);
     487             : 
     488      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
     489             : 
     490       97280 :         v1 = _AVX512_FMA(q1, h5, v1);
     491             : 
     492      194560 :         h1 = _AVX512_SET1(hh[nb-4]);
     493             : 
     494      194560 :         q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
     495             : 
     496       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
     497             : 
     498      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-3]);
     499             : 
     500       97280 :         y1 = _AVX512_FMA(q1, h2, y1);
     501             : 
     502      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
     503             : 
     504       97280 :         z1 = _AVX512_FMA(q1, h3, z1);
     505             : 
     506      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
     507             : 
     508       97280 :         w1 = _AVX512_FMA(q1, h4, w1);
     509             : 
     510      194560 :         h1 = _AVX512_SET1(hh[nb-3]);
     511      194560 :         q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
     512             : 
     513       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
     514             : 
     515      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-2]);
     516             : 
     517       97280 :         y1 = _AVX512_FMA(q1, h2, y1);
     518             : 
     519      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
     520             : 
     521       97280 :         z1 = _AVX512_FMA(q1, h3, z1);
     522             : 
     523      194560 :         h1 = _AVX512_SET1(hh[nb-2]);
     524      194560 :         q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
     525             : 
     526       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
     527             : 
     528      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-1]);
     529             : 
     530       97280 :         y1 = _AVX512_FMA(q1, h2, y1);
     531             : 
     532      194560 :         h1 = _AVX512_SET1(hh[nb-1]);
     533      194560 :         q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
     534             : 
     535       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
     536             : 
     537             :         /////////////////////////////////////////////////////
     538             :         // Apply tau, correct wrong calculation using pre-calculated scalar products
     539             :         /////////////////////////////////////////////////////
     540             : 
     541      194560 :         __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
     542       97280 :         x1 = _AVX512_MUL(x1, tau1);
     543             : 
     544      194560 :         __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
     545      194560 :         __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
     546       97280 :         h2 = _AVX512_MUL(tau2, vs_1_2);
     547             : 
     548      194560 :         y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
     549             : 
     550      194560 :         __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
     551      194560 :         __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
     552      194560 :         __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
     553             : 
     554       97280 :         h2 = _AVX512_MUL(tau3, vs_1_3);
     555       97280 :         h3 = _AVX512_MUL(tau3, vs_2_3);
     556             : 
     557      291840 :         z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
     558             : 
     559      194560 :         __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
     560      194560 :         __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
     561      194560 :         __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
     562             : 
     563       97280 :         h2 = _AVX512_MUL(tau4, vs_1_4);
     564       97280 :         h3 = _AVX512_MUL(tau4, vs_2_4);
     565             : 
     566      194560 :         __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
     567       97280 :         h4 = _AVX512_MUL(tau4, vs_3_4);
     568             : 
     569      389120 :         w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
     570             : 
     571      194560 :         __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
     572      194560 :         __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
     573      194560 :         __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
     574             : 
     575       97280 :         h2 = _AVX512_MUL(tau5, vs_1_5);
     576       97280 :         h3 = _AVX512_MUL(tau5, vs_2_5);
     577             : 
     578      194560 :         __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
     579      194560 :         __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
     580             : 
     581       97280 :         h4 = _AVX512_MUL(tau5, vs_3_5);
     582       97280 :         h5 = _AVX512_MUL(tau5, vs_4_5);
     583             : 
     584      583680 :         v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
     585             : 
     586      194560 :         __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
     587      194560 :         __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
     588      194560 :         __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
     589       97280 :         h2 = _AVX512_MUL(tau6, vs_1_6);
     590       97280 :         h3 = _AVX512_MUL(tau6, vs_2_6);
     591             : 
     592      194560 :         __AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
     593      194560 :         __AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
     594      194560 :         __AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);
     595             : 
     596       97280 :         h4 = _AVX512_MUL(tau6, vs_3_6);
     597       97280 :         h5 = _AVX512_MUL(tau6, vs_4_6);
     598       97280 :         h6 = _AVX512_MUL(tau6, vs_5_6);
     599             : 
     600      680960 :         t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));
     601             : 
     602             :         /////////////////////////////////////////////////////
     603             :         // Rank-1 update of Q [8 x nb+3]
     604             :         /////////////////////////////////////////////////////
     605             : 
     606       97280 :         q1 = _AVX512_LOAD(&q[0]);
     607       97280 :         q1 = _AVX512_SUB(q1, t1);
     608             :         _AVX512_STORE(&q[0],q1);
     609             : 
     610      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+1]);
     611      194560 :         q1 = _AVX512_LOAD(&q[ldq]);
     612       97280 :         q1 = _AVX512_SUB(q1, v1);
     613             : 
     614       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
     615             : 
     616       97280 :         _AVX512_STORE(&q[ldq],q1);
     617             : 
     618      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+1]);
     619      194560 :         q1 = _AVX512_LOAD(&q[ldq*2]);
     620       97280 :         q1 = _AVX512_SUB(q1, w1);
     621             : 
     622       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
     623             : 
     624      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+2]);
     625             : 
     626       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
     627             : 
     628       97280 :         _AVX512_STORE(&q[ldq*2],q1);
     629             : 
     630      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+1]);
     631      194560 :         q1 = _AVX512_LOAD(&q[ldq*3]);
     632       97280 :         q1 = _AVX512_SUB(q1, z1);
     633             : 
     634       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
     635             : 
     636      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+2]);
     637             : 
     638       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
     639             : 
     640      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+3]);
     641             : 
     642       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
     643             : 
     644       97280 :         _AVX512_STORE(&q[ldq*3],q1);
     645             : 
     646      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+1]);
     647      194560 :         q1 = _AVX512_LOAD(&q[ldq*4]);
     648       97280 :         q1 = _AVX512_SUB(q1, y1);
     649             : 
     650       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
     651             : 
     652      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+2]);
     653             : 
     654       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
     655             : 
     656      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+3]);
     657             : 
     658       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
     659             : 
     660      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+4]);
     661             : 
     662       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
     663             : 
     664       97280 :         _AVX512_STORE(&q[ldq*4],q1);
     665             : 
     666      194560 :         h2 = _AVX512_SET1(hh[(ldh)+1]);
     667      194560 :         q1 = _AVX512_LOAD(&q[ldq*5]);
     668       97280 :         q1 = _AVX512_SUB(q1, x1);
     669             : 
     670       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
     671             : 
     672      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+2]);
     673             : 
     674       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
     675             : 
     676      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+3]);
     677             : 
     678       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
     679             : 
     680      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+4]);
     681             : 
     682       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
     683             : 
     684      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+5]);
     685             : 
     686       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
     687             : 
     688       97280 :         _AVX512_STORE(&q[ldq*5],q1);
     689             : 
     690     5739520 :         for (i = 6; i < nb; i++)
     691             :         {
     692    11284480 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     693    11284480 :                 h1 = _AVX512_SET1(hh[i-5]);
     694             : 
     695     5642240 :                 q1 = _AVX512_NFMA(x1, h1, q1);
     696             : 
     697    11284480 :                 h2 = _AVX512_SET1(hh[ldh+i-4]);
     698             : 
     699     5642240 :                 q1 = _AVX512_NFMA(y1, h2, q1);
     700             : 
     701    11284480 :                 h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
     702             : 
     703     5642240 :                 q1 = _AVX512_NFMA(z1, h3, q1);
     704             : 
     705    11284480 :                 h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
     706             : 
     707     5642240 :                 q1 = _AVX512_NFMA(w1, h4, q1);
     708             : 
     709    11284480 :                 h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
     710             : 
     711     5642240 :                 q1 = _AVX512_NFMA(v1, h5, q1);
     712             : 
     713    11284480 :                 h6 = _AVX512_SET1(hh[(ldh*5)+i]);
     714             : 
     715     5642240 :                 q1 = _AVX512_NFMA(t1, h6, q1);
     716             : 
     717     5642240 :                 _AVX512_STORE(&q[i*ldq],q1);
     718             :         }
     719             : 
     720      194560 :         h1 = _AVX512_SET1(hh[nb-5]);
     721      194560 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     722             : 
     723       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
     724             : 
     725      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-4]);
     726             : 
     727       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
     728             : 
     729      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
     730             : 
     731       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
     732             : 
     733      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
     734             : 
     735       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
     736             : 
     737      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
     738             : 
     739       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
     740             : 
     741       97280 :         _AVX512_STORE(&q[nb*ldq],q1);
     742             : 
     743      194560 :         h1 = _AVX512_SET1(hh[nb-4]);
     744      194560 :         q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
     745             : 
     746       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
     747             : 
     748      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-3]);
     749             : 
     750       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
     751             : 
     752      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
     753             : 
     754       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
     755             : 
     756      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
     757             : 
     758       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
     759             : 
     760       97280 :         _AVX512_STORE(&q[(nb+1)*ldq],q1);
     761             : 
     762      194560 :         h1 = _AVX512_SET1(hh[nb-3]);
     763      194560 :         q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
     764             : 
     765       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
     766             : 
     767      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-2]);
     768             : 
     769       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
     770             : 
     771      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
     772             : 
     773       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
     774             : 
     775       97280 :         _AVX512_STORE(&q[(nb+2)*ldq],q1);
     776             : 
     777      194560 :         h1 = _AVX512_SET1(hh[nb-2]);
     778      194560 :         q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
     779             : 
     780       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
     781             : 
     782      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-1]);
     783             : 
     784       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
     785             : 
     786       97280 :         _AVX512_STORE(&q[(nb+3)*ldq],q1);
     787             : 
     788      194560 :         h1 = _AVX512_SET1(hh[nb-1]);
     789      194560 :         q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
     790             : 
     791       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
     792             : 
     793       97280 :         _AVX512_STORE(&q[(nb+4)*ldq],q1);
     794             : }
     795             : 
     796             : /**
     797             :  * Unrolled kernel that computes
     798             : #ifdef DOUBLE_PRECISION_REAL
     799             :  * 16 rows of Q simultaneously, a
     800             : #endif
     801             : #ifdef SINGLE_PRECISION_REAL
     802             :  * 32 rows of Q simultaneously, a
     803             : #endif
     804             :  * matrix-vector product with six Householder
     805             :  * vectors + a rank-1 update is performed
     806             :  */
     807             : #ifdef DOUBLE_PRECISION_REAL
     808             : __forceinline void hh_trafo_kernel_16_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
     809             : #endif
     810             : #ifdef SINGLE_PRECISION_REAL
     811             : __forceinline void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
     812             : #endif
     813             : {
     814             :         /////////////////////////////////////////////////////
     815             :         // Matrix Vector Multiplication, Q [16 (double) or 32 (single) x nb+5] * hh
     816             :         // hh contains six householder vectors
     817             :         /////////////////////////////////////////////////////
     818             :         int i;
     819             : 
     820           0 :         __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
     821           0 :         __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
     822           0 :         __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
     823           0 :         __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
     824           0 :         __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
     825           0 :         __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);
     826             : 
     827           0 :         __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
     828           0 :         __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
     829           0 :         __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
     830           0 :         __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
     831           0 :         __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);
     832             : 
     833             : //        register__AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
     834           0 :         __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
     835             : 
     836           0 :         t1 = _AVX512_FMA(a4_1, h_6_4, t1);
     837           0 :         t1 = _AVX512_FMA(a3_1, h_6_3, t1);
     838           0 :         t1 = _AVX512_FMA(a2_1, h_6_2, t1);
     839           0 :         t1 = _AVX512_FMA(a1_1, h_6_1, t1);
     840             : 
     841           0 :         __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
     842           0 :         __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
     843           0 :         __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
     844           0 :         __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
     845             : 
     846             : //        register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
     847           0 :         __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
     848             : 
     849           0 :         v1 = _AVX512_FMA(a3_1, h_5_3, v1);
     850           0 :         v1 = _AVX512_FMA(a2_1, h_5_2, v1);
     851           0 :         v1 = _AVX512_FMA(a1_1, h_5_1, v1);
     852             : 
     853           0 :         __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
     854           0 :         __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
     855           0 :         __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
     856             : 
     857             : //        register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
     858           0 :         __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
     859             : 
     860           0 :         w1 = _AVX512_FMA(a2_1, h_4_2, w1);
     861           0 :         w1 = _AVX512_FMA(a1_1, h_4_1, w1);
     862             : 
     863           0 :         __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
     864           0 :         __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
     865           0 :         __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
     866             : 
     867             : //        register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
     868           0 :         __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
     869             : 
     870           0 :         z1 = _AVX512_FMA(a1_1, h_3_1, z1);
     871             : //        register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
     872           0 :         __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
     873             : 
     874             : 
     875             : //        register __AVX512_DATATYPE x1 = a1_1;
     876           0 :         __AVX512_DATATYPE x1 = a1_1;
     877             : 
     878             : 
     879           0 :         __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
     880           0 :         __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
     881           0 :         __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
     882           0 :         __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
     883           0 :         __AVX512_DATATYPE a5_2 = _AVX512_LOAD(&q[(ldq)+offset]);
     884           0 :         __AVX512_DATATYPE a6_2 = _AVX512_LOAD(&q[0+offset]);
     885             : 
     886             : //        register __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
     887           0 :         __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
     888             : 
     889           0 :         t2 = _AVX512_FMA(a4_2, h_6_4, t2);
     890           0 :         t2 = _AVX512_FMA(a3_2, h_6_3, t2);
     891           0 :         t2 = _AVX512_FMA(a2_2, h_6_2, t2);
     892           0 :         t2 = _AVX512_FMA(a1_2, h_6_1, t2);
     893             : 
     894             : //        register __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
     895           0 :         __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
     896             : 
     897           0 :         v2 = _AVX512_FMA(a3_2, h_5_3, v2);
     898           0 :         v2 = _AVX512_FMA(a2_2, h_5_2, v2);
     899           0 :         v2 = _AVX512_FMA(a1_2, h_5_1, v2);
     900             : 
     901             : //        register __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
     902           0 :         __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
     903             : 
     904           0 :         w2 = _AVX512_FMA(a2_2, h_4_2, w2);
     905           0 :         w2 = _AVX512_FMA(a1_2, h_4_1, w2);
     906             : 
     907             : //        register __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
     908           0 :         __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
     909             : 
     910           0 :         z2 = _AVX512_FMA(a1_2, h_3_1, z2);
     911             : //        register __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
     912           0 :         __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
     913             : 
     914             : 
     915             : //        register __AVX512_DATATYPE x2 = a1_2;
     916           0 :         __AVX512_DATATYPE x2 = a1_2;
     917             : 
     918             :         __AVX512_DATATYPE q1;
     919             :         __AVX512_DATATYPE q2;
     920             : 
     921             :         __AVX512_DATATYPE h1;
     922             :         __AVX512_DATATYPE h2;
     923             :         __AVX512_DATATYPE h3;
     924             :         __AVX512_DATATYPE h4;
     925             :         __AVX512_DATATYPE h5;
     926             :         __AVX512_DATATYPE h6;
     927             : 
     928           0 :         for(i = 6; i < nb; i++)
     929             :         {
     930           0 :                 h1 = _AVX512_SET1(hh[i-5]);
     931           0 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     932           0 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
     933             : 
     934           0 :                 x1 = _AVX512_FMA(q1, h1, x1);
     935           0 :                 x2 = _AVX512_FMA(q2, h1, x2);
     936             : 
     937           0 :                 h2 = _AVX512_SET1(hh[ldh+i-4]);
     938             : 
     939           0 :                 y1 = _AVX512_FMA(q1, h2, y1);
     940           0 :                 y2 = _AVX512_FMA(q2, h2, y2);
     941             : 
     942           0 :                 h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
     943             : 
     944           0 :                 z1 = _AVX512_FMA(q1, h3, z1);
     945           0 :                 z2 = _AVX512_FMA(q2, h3, z2);
     946             : 
     947           0 :                 h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
     948             : 
     949           0 :                 w1 = _AVX512_FMA(q1, h4, w1);
     950           0 :                 w2 = _AVX512_FMA(q2, h4, w2);
     951             : 
     952           0 :                 h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
     953             : 
     954           0 :                 v1 = _AVX512_FMA(q1, h5, v1);
     955           0 :                 v2 = _AVX512_FMA(q2, h5, v2);
     956             : 
     957           0 :                 h6 = _AVX512_SET1(hh[(ldh*5)+i]);
     958             : 
     959           0 :                 t1 = _AVX512_FMA(q1, h6, t1);
     960           0 :                 t2 = _AVX512_FMA(q2, h6, t2);
     961             :         }
     962             : 
     963           0 :         h1 = _AVX512_SET1(hh[nb-5]);
     964           0 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     965           0 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
     966             : 
     967           0 :         x1 = _AVX512_FMA(q1, h1, x1);
     968           0 :         x2 = _AVX512_FMA(q2, h1, x2);
     969             : 
     970           0 :         h2 = _AVX512_SET1(hh[ldh+nb-4]);
     971             : 
     972           0 :         y1 = _AVX512_FMA(q1, h2, y1);
     973           0 :         y2 = _AVX512_FMA(q2, h2, y2);
     974             : 
     975           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
     976             : 
     977           0 :         z1 = _AVX512_FMA(q1, h3, z1);
     978           0 :         z2 = _AVX512_FMA(q2, h3, z2);
     979             : 
     980           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
     981             : 
     982           0 :         w1 = _AVX512_FMA(q1, h4, w1);
     983           0 :         w2 = _AVX512_FMA(q2, h4, w2);
     984             : 
     985           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
     986             : 
     987           0 :         v1 = _AVX512_FMA(q1, h5, v1);
     988           0 :         v2 = _AVX512_FMA(q2, h5, v2);
     989             : 
     990           0 :         h1 = _AVX512_SET1(hh[nb-4]);
     991             : 
     992           0 :         q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
     993           0 :         q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
     994             : 
     995           0 :         x1 = _AVX512_FMA(q1, h1, x1);
     996           0 :         x2 = _AVX512_FMA(q2, h1, x2);
     997             : 
     998           0 :         h2 = _AVX512_SET1(hh[ldh+nb-3]);
     999             : 
    1000           0 :         y1 = _AVX512_FMA(q1, h2, y1);
    1001           0 :         y2 = _AVX512_FMA(q2, h2, y2);
    1002             : 
    1003           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
    1004             : 
    1005           0 :         z1 = _AVX512_FMA(q1, h3, z1);
    1006           0 :         z2 = _AVX512_FMA(q2, h3, z2);
    1007             : 
    1008           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
    1009             : 
    1010           0 :         w1 = _AVX512_FMA(q1, h4, w1);
    1011           0 :         w2 = _AVX512_FMA(q2, h4, w2);
    1012             : 
    1013           0 :         h1 = _AVX512_SET1(hh[nb-3]);
    1014           0 :         q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
    1015           0 :         q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
    1016             : 
    1017           0 :         x1 = _AVX512_FMA(q1, h1, x1);
    1018           0 :         x2 = _AVX512_FMA(q2, h1, x2);
    1019             : 
    1020           0 :         h2 = _AVX512_SET1(hh[ldh+nb-2]);
    1021             : 
    1022           0 :         y1 = _AVX512_FMA(q1, h2, y1);
    1023           0 :         y2 = _AVX512_FMA(q2, h2, y2);
    1024             : 
    1025           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
    1026             : 
    1027           0 :         z1 = _AVX512_FMA(q1, h3, z1);
    1028           0 :         z2 = _AVX512_FMA(q2, h3, z2);
    1029             : 
    1030           0 :         h1 = _AVX512_SET1(hh[nb-2]);
    1031           0 :         q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
    1032           0 :         q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
    1033             : 
    1034           0 :         x1 = _AVX512_FMA(q1, h1, x1);
    1035           0 :         x2 = _AVX512_FMA(q2, h1, x2);
    1036             : 
    1037           0 :         h2 = _AVX512_SET1(hh[ldh+nb-1]);
    1038             : 
    1039           0 :         y1 = _AVX512_FMA(q1, h2, y1);
    1040           0 :         y2 = _AVX512_FMA(q2, h2, y2);
    1041             : 
    1042           0 :         h1 = _AVX512_SET1(hh[nb-1]);
    1043           0 :         q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
    1044           0 :         q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
    1045             : 
    1046           0 :         x1 = _AVX512_FMA(q1, h1, x1);
    1047           0 :         x2 = _AVX512_FMA(q2, h1, x2);
    1048             : 
    1049             :         /////////////////////////////////////////////////////
    1050             :         // Apply tau, correct wrong calculation using pre-calculated scalar products
    1051             :         /////////////////////////////////////////////////////
    1052             : 
    1053           0 :         __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
    1054           0 :         x1 = _AVX512_MUL(x1, tau1);
    1055           0 :         x2 = _AVX512_MUL(x2, tau1);
    1056             : 
    1057           0 :         __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
    1058           0 :         __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
    1059           0 :         h2 = _AVX512_MUL(tau2, vs_1_2);
    1060             : 
    1061           0 :         y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
    1062           0 :         y2 = _AVX512_FMSUB(y2, tau2, _AVX512_MUL(x2,h2));
    1063             : 
    1064           0 :         __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
    1065           0 :         __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
    1066           0 :         __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
    1067             : 
    1068           0 :         h2 = _AVX512_MUL(tau3, vs_1_3);
    1069           0 :         h3 = _AVX512_MUL(tau3, vs_2_3);
    1070             : 
    1071           0 :         z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
    1072           0 :         z2 = _AVX512_FMSUB(z2, tau3, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
    1073             : 
    1074           0 :         __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
    1075           0 :         __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
    1076           0 :         __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
    1077             : 
    1078           0 :         h2 = _AVX512_MUL(tau4, vs_1_4);
    1079           0 :         h3 = _AVX512_MUL(tau4, vs_2_4);
    1080             : 
    1081           0 :         __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
    1082           0 :         h4 = _AVX512_MUL(tau4, vs_3_4);
    1083             : 
    1084           0 :         w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
    1085           0 :         w2 = _AVX512_FMSUB(w2, tau4, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
    1086             : 
    1087           0 :         __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
    1088           0 :         __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
    1089           0 :         __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
    1090             : 
    1091           0 :         h2 = _AVX512_MUL(tau5, vs_1_5);
    1092           0 :         h3 = _AVX512_MUL(tau5, vs_2_5);
    1093             : 
    1094           0 :         __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
    1095           0 :         __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
    1096             : 
    1097           0 :         h4 = _AVX512_MUL(tau5, vs_3_5);
    1098           0 :         h5 = _AVX512_MUL(tau5, vs_4_5);
    1099             : 
    1100           0 :         v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
    1101           0 :         v2 = _AVX512_FMSUB(v2, tau5, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
    1102             : 
    1103           0 :         __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
    1104           0 :         __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
    1105           0 :         __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
    1106           0 :         h2 = _AVX512_MUL(tau6, vs_1_6);
    1107           0 :         h3 = _AVX512_MUL(tau6, vs_2_6);
    1108             : 
    1109           0 :         __AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
    1110           0 :         __AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
    1111           0 :         __AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);
    1112             : 
    1113           0 :         h4 = _AVX512_MUL(tau6, vs_3_6);
    1114           0 :         h5 = _AVX512_MUL(tau6, vs_4_6);
    1115           0 :         h6 = _AVX512_MUL(tau6, vs_5_6);
    1116             : 
    1117           0 :         t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));
    1118           0 :         t2 = _AVX512_FMSUB(t2, tau6, _AVX512_FMA(v2, h6, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)))));
    1119             : 
    1120             :         /////////////////////////////////////////////////////
    1121             :         // Rank-1 update of Q [8 x nb+3]
    1122             :         /////////////////////////////////////////////////////
    1123             : 
    1124           0 :         q1 = _AVX512_LOAD(&q[0]);
    1125           0 :         q2 = _AVX512_LOAD(&q[0+offset]);
    1126             : 
    1127           0 :         q1 = _AVX512_SUB(q1, t1);
    1128           0 :         q2 = _AVX512_SUB(q2, t2);
    1129             : 
    1130             :         _AVX512_STORE(&q[0],q1);
    1131           0 :         _AVX512_STORE(&q[0+offset],q2);
    1132             : 
    1133           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+1]);
    1134           0 :         q1 = _AVX512_LOAD(&q[ldq]);
    1135           0 :         q2 = _AVX512_LOAD(&q[ldq+offset]);
    1136             : 
    1137           0 :         q1 = _AVX512_SUB(q1, v1);
    1138           0 :         q2 = _AVX512_SUB(q2, v2);
    1139             : 
    1140           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1141           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1142             : 
    1143           0 :         _AVX512_STORE(&q[ldq],q1);
    1144           0 :         _AVX512_STORE(&q[ldq+offset],q2);
    1145             : 
    1146           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+1]);
    1147           0 :         q1 = _AVX512_LOAD(&q[ldq*2]);
    1148           0 :         q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
    1149             : 
    1150           0 :         q1 = _AVX512_SUB(q1, w1);
    1151           0 :         q2 = _AVX512_SUB(q2, w2);
    1152             : 
    1153           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1154           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1155             : 
    1156           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+2]);
    1157             : 
    1158           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1159           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1160             : 
    1161           0 :         _AVX512_STORE(&q[ldq*2],q1);
    1162           0 :         _AVX512_STORE(&q[(ldq*2)+offset],q2);
    1163             : 
    1164           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+1]);
    1165           0 :         q1 = _AVX512_LOAD(&q[ldq*3]);
    1166           0 :         q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
    1167             : 
    1168           0 :         q1 = _AVX512_SUB(q1, z1);
    1169           0 :         q2 = _AVX512_SUB(q2, z2);
    1170             : 
    1171           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    1172           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    1173             : 
    1174           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+2]);
    1175             : 
    1176           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1177           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1178             : 
    1179           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+3]);
    1180             : 
    1181           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1182           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1183             : 
    1184           0 :         _AVX512_STORE(&q[ldq*3],q1);
    1185           0 :         _AVX512_STORE(&q[(ldq*3)+offset],q2);
    1186             : 
    1187           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+1]);
    1188           0 :         q1 = _AVX512_LOAD(&q[ldq*4]);
    1189           0 :         q2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
    1190             : 
    1191           0 :         q1 = _AVX512_SUB(q1, y1);
    1192           0 :         q2 = _AVX512_SUB(q2, y2);
    1193             : 
    1194           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    1195           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    1196             : 
    1197           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+2]);
    1198             : 
    1199           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    1200           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    1201             : 
    1202           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+3]);
    1203             : 
    1204           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1205           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1206             : 
    1207           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+4]);
    1208             : 
    1209           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1210           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1211             : 
    1212           0 :         _AVX512_STORE(&q[ldq*4],q1);
    1213           0 :         _AVX512_STORE(&q[(ldq*4)+offset],q2);
    1214             : 
    1215           0 :         h2 = _AVX512_SET1(hh[(ldh)+1]);
    1216           0 :         q1 = _AVX512_LOAD(&q[ldq*5]);
    1217           0 :         q2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
    1218             : 
    1219           0 :         q1 = _AVX512_SUB(q1, x1);
    1220           0 :         q2 = _AVX512_SUB(q2, x2);
    1221             : 
    1222           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    1223           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    1224             : 
    1225           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+2]);
    1226             : 
    1227           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    1228           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    1229             : 
    1230           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+3]);
    1231             : 
    1232           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    1233           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    1234             : 
    1235           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+4]);
    1236             : 
    1237           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1238           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1239             : 
    1240           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+5]);
    1241             : 
    1242           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1243           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1244             : 
    1245           0 :         _AVX512_STORE(&q[ldq*5],q1);
    1246           0 :         _AVX512_STORE(&q[(ldq*5)+offset],q2);
    1247             : 
    1248           0 :         for (i = 6; i < nb; i++)
    1249             :         {
    1250           0 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
    1251           0 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    1252             : 
    1253           0 :                 h1 = _AVX512_SET1(hh[i-5]);
    1254             : 
    1255           0 :                 q1 = _AVX512_NFMA(x1, h1, q1);
    1256           0 :                 q2 = _AVX512_NFMA(x2, h1, q2);
    1257             : 
    1258           0 :                 h2 = _AVX512_SET1(hh[ldh+i-4]);
    1259             : 
    1260           0 :                 q1 = _AVX512_NFMA(y1, h2, q1);
    1261           0 :                 q2 = _AVX512_NFMA(y2, h2, q2);
    1262             : 
    1263           0 :                 h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
    1264             : 
    1265           0 :                 q1 = _AVX512_NFMA(z1, h3, q1);
    1266           0 :                 q2 = _AVX512_NFMA(z2, h3, q2);
    1267             : 
    1268           0 :                 h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
    1269             : 
    1270           0 :                 q1 = _AVX512_NFMA(w1, h4, q1);
    1271           0 :                 q2 = _AVX512_NFMA(w2, h4, q2);
    1272             : 
    1273           0 :                 h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
    1274             : 
    1275           0 :                 q1 = _AVX512_NFMA(v1, h5, q1);
    1276           0 :                 q2 = _AVX512_NFMA(v2, h5, q2);
    1277             : 
    1278           0 :                 h6 = _AVX512_SET1(hh[(ldh*5)+i]);
    1279             : 
    1280           0 :                 q1 = _AVX512_NFMA(t1, h6, q1);
    1281           0 :                 q2 = _AVX512_NFMA(t2, h6, q2);
    1282             : 
    1283           0 :                 _AVX512_STORE(&q[i*ldq],q1);
    1284           0 :                 _AVX512_STORE(&q[(i*ldq)+offset],q2);
    1285             : 
    1286             :         }
    1287             : 
    1288           0 :         h1 = _AVX512_SET1(hh[nb-5]);
    1289           0 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
    1290           0 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
    1291             : 
    1292           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    1293           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    1294             : 
    1295           0 :         h2 = _AVX512_SET1(hh[ldh+nb-4]);
    1296             : 
    1297           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    1298           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    1299             : 
    1300           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
    1301             : 
    1302           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    1303           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    1304             : 
    1305           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
    1306             : 
    1307           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    1308           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    1309             : 
    1310           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
    1311             : 
    1312           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1313           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1314             : 
    1315           0 :         _AVX512_STORE(&q[nb*ldq],q1);
    1316           0 :         _AVX512_STORE(&q[(nb*ldq)+offset],q2);
    1317             : 
    1318           0 :         h1 = _AVX512_SET1(hh[nb-4]);
    1319           0 :         q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
    1320           0 :         q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
    1321             : 
    1322           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    1323           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    1324             : 
    1325           0 :         h2 = _AVX512_SET1(hh[ldh+nb-3]);
    1326             : 
    1327           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    1328           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    1329             : 
    1330           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
    1331             : 
    1332           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    1333           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    1334             : 
    1335           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
    1336             : 
    1337           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    1338           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    1339             : 
    1340           0 :         _AVX512_STORE(&q[(nb+1)*ldq],q1);
    1341           0 :         _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
    1342             : 
    1343           0 :         h1 = _AVX512_SET1(hh[nb-3]);
    1344           0 :         q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
    1345           0 :         q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
    1346             : 
    1347           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    1348           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    1349             : 
    1350           0 :         h2 = _AVX512_SET1(hh[ldh+nb-2]);
    1351             : 
    1352           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    1353           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    1354             : 
    1355           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
    1356             : 
    1357           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    1358           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    1359             : 
    1360           0 :         _AVX512_STORE(&q[(nb+2)*ldq],q1);
    1361           0 :         _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
    1362             : 
    1363           0 :         h1 = _AVX512_SET1(hh[nb-2]);
    1364           0 :         q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
    1365           0 :         q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
    1366             : 
    1367           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    1368           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    1369             : 
    1370           0 :         h2 = _AVX512_SET1(hh[ldh+nb-1]);
    1371             : 
    1372           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    1373           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    1374             : 
    1375           0 :         _AVX512_STORE(&q[(nb+3)*ldq],q1);
    1376           0 :         _AVX512_STORE(&q[((nb+3)*ldq)+offset],q2);
    1377             : 
    1378           0 :         h1 = _AVX512_SET1(hh[nb-1]);
    1379           0 :         q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
    1380           0 :         q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
    1381             : 
    1382           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    1383           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    1384             : 
    1385           0 :         _AVX512_STORE(&q[(nb+4)*ldq],q1);
    1386           0 :         _AVX512_STORE(&q[((nb+4)*ldq)+offset],q2);
    1387             : 
    1388             : }
    1389             : 
    1390             : /**
    1391             :  * Unrolled kernel that computes
    1392             : #ifdef DOUBLE_PRECISION_REAL
    1393             :  * 24 rows of Q simultaneously, a
    1394             : #endif
    1395             : #ifdef SINGLE_PRECISION_REAL
    1396             :  * 48 rows of Q simultaneously, a
    1397             : #endif
    1398             : 
    1399             :  * matrix Vector product with six householder
    1400             :  * vectors + a rank 1 update is performed
    1401             :  */
    1402             : #ifdef DOUBLE_PRECISION_REAL
    1403             : __forceinline void hh_trafo_kernel_24_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
    1404             : #endif
    1405             : #ifdef SINGLE_PRECISION_REAL
    1406             : __forceinline void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
    1407             : #endif
    1408             : {
    1409             :         /////////////////////////////////////////////////////
    1410             :         // Matrix Vector Multiplication, Q [8 x nb+3] * hh
    1411             :         // hh contains six householder vectors
    1412             :         /////////////////////////////////////////////////////
    1413             :         int i;
    1414             : 
    1415           0 :         __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
    1416           0 :         __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
    1417           0 :         __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
    1418           0 :         __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
    1419           0 :         __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
    1420           0 :         __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);
    1421             : 
    1422           0 :         __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
    1423           0 :         __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
    1424           0 :         __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
    1425           0 :         __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
    1426           0 :         __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);
    1427             : 
    1428             : //        register __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
    1429           0 :          __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
    1430             : 
    1431           0 :         t1 = _AVX512_FMA(a4_1, h_6_4, t1);
    1432           0 :         t1 = _AVX512_FMA(a3_1, h_6_3, t1);
    1433           0 :         t1 = _AVX512_FMA(a2_1, h_6_2, t1);
    1434           0 :         t1 = _AVX512_FMA(a1_1, h_6_1, t1);
    1435             : 
    1436           0 :         __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
    1437           0 :         __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
    1438           0 :         __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
    1439           0 :         __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
    1440             : 
    1441             : //        register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
    1442           0 :          __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
    1443             : 
    1444           0 :         v1 = _AVX512_FMA(a3_1, h_5_3, v1);
    1445           0 :         v1 = _AVX512_FMA(a2_1, h_5_2, v1);
    1446           0 :         v1 = _AVX512_FMA(a1_1, h_5_1, v1);
    1447             : 
    1448           0 :         __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
    1449           0 :         __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
    1450           0 :         __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
    1451             : 
    1452             : //        register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
    1453           0 :         __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
    1454             : 
    1455           0 :         w1 = _AVX512_FMA(a2_1, h_4_2, w1);
    1456           0 :         w1 = _AVX512_FMA(a1_1, h_4_1, w1);
    1457             : 
    1458           0 :         __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
    1459           0 :         __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
    1460           0 :         __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
    1461             : 
    1462             : //        register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
    1463           0 :         __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
    1464             : 
    1465           0 :         z1 = _AVX512_FMA(a1_1, h_3_1, z1);
    1466             : //        register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
    1467           0 :         __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
    1468             : 
    1469             : 
    1470             : //        register __AVX512_DATATYPE x1 = a1_1;
    1471           0 :         __AVX512_DATATYPE x1 = a1_1;
    1472             : 
    1473             : 
    1474           0 :         __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
    1475           0 :         __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
    1476           0 :         __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
    1477           0 :         __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
    1478           0 :         __AVX512_DATATYPE a5_2 = _AVX512_LOAD(&q[(ldq)+offset]);
    1479           0 :         __AVX512_DATATYPE a6_2 = _AVX512_LOAD(&q[0+offset]);
    1480             : 
    1481             : //        register __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
    1482           0 :         __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
    1483             : 
    1484           0 :         t2 = _AVX512_FMA(a4_2, h_6_4, t2);
    1485           0 :         t2 = _AVX512_FMA(a3_2, h_6_3, t2);
    1486           0 :         t2 = _AVX512_FMA(a2_2, h_6_2, t2);
    1487           0 :         t2 = _AVX512_FMA(a1_2, h_6_1, t2);
    1488             : 
    1489             : //        register __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
    1490           0 :         __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
    1491             : 
    1492           0 :         v2 = _AVX512_FMA(a3_2, h_5_3, v2);
    1493           0 :         v2 = _AVX512_FMA(a2_2, h_5_2, v2);
    1494           0 :         v2 = _AVX512_FMA(a1_2, h_5_1, v2);
    1495             : 
    1496             : //        register __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
    1497           0 :         __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
    1498             : 
    1499           0 :         w2 = _AVX512_FMA(a2_2, h_4_2, w2);
    1500           0 :         w2 = _AVX512_FMA(a1_2, h_4_1, w2);
    1501             : 
    1502             : //        register __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
    1503           0 :          __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
    1504             : 
    1505           0 :         z2 = _AVX512_FMA(a1_2, h_3_1, z2);
    1506             : //        register __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
    1507           0 :         __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
    1508             : 
    1509             : 
    1510             : //        register __AVX512_DATATYPE x2 = a1_2;
    1511           0 :         __AVX512_DATATYPE x2 = a1_2;
    1512             : 
    1513             : 
    1514           0 :         __AVX512_DATATYPE a1_3 = _AVX512_LOAD(&q[(ldq*5)+2*offset]);
    1515           0 :         __AVX512_DATATYPE a2_3 = _AVX512_LOAD(&q[(ldq*4)+2*offset]);
    1516           0 :         __AVX512_DATATYPE a3_3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
    1517           0 :         __AVX512_DATATYPE a4_3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
    1518           0 :         __AVX512_DATATYPE a5_3 = _AVX512_LOAD(&q[(ldq)+2*offset]);
    1519           0 :         __AVX512_DATATYPE a6_3 = _AVX512_LOAD(&q[0+2*offset]);
    1520             : 
    1521             : //        register __AVX512_DATATYPE t3 = _AVX512_FMA(a5_3, h_6_5, a6_3);
    1522           0 :         __AVX512_DATATYPE t3 = _AVX512_FMA(a5_3, h_6_5, a6_3);
    1523             : 
    1524           0 :         t3 = _AVX512_FMA(a4_3, h_6_4, t3);
    1525           0 :         t3 = _AVX512_FMA(a3_3, h_6_3, t3);
    1526           0 :         t3 = _AVX512_FMA(a2_3, h_6_2, t3);
    1527           0 :         t3 = _AVX512_FMA(a1_3, h_6_1, t3);
    1528             : 
    1529             : //        register __AVX512_DATATYPE v3 = _AVX512_FMA(a4_3, h_5_4, a5_3);
    1530           0 :         __AVX512_DATATYPE v3 = _AVX512_FMA(a4_3, h_5_4, a5_3);
    1531             : 
    1532           0 :         v3 = _AVX512_FMA(a3_3, h_5_3, v3);
    1533           0 :         v3 = _AVX512_FMA(a2_3, h_5_2, v3);
    1534           0 :         v3 = _AVX512_FMA(a1_3, h_5_1, v3);
    1535             : 
    1536             : //        register __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
    1537           0 :         __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
    1538             : 
    1539           0 :         w3 = _AVX512_FMA(a2_3, h_4_2, w3);
    1540           0 :         w3 = _AVX512_FMA(a1_3, h_4_1, w3);
    1541             : 
    1542             : //        register __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
    1543           0 :         __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
    1544             : 
    1545           0 :         z3 = _AVX512_FMA(a1_3, h_3_1, z3);
    1546             : //        register __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
    1547           0 :         __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
    1548             : 
    1549             : 
    1550             : //        register __AVX512_DATATYPE x3 = a1_3;
    1551           0 :         __AVX512_DATATYPE x3 = a1_3;
    1552             : 
    1553             : 
    1554             :         __AVX512_DATATYPE q1;
    1555             :         __AVX512_DATATYPE q2;
    1556             :         __AVX512_DATATYPE q3;
    1557             : 
    1558             :         __AVX512_DATATYPE h1;
    1559             :         __AVX512_DATATYPE h2;
    1560             :         __AVX512_DATATYPE h3;
    1561             :         __AVX512_DATATYPE h4;
    1562             :         __AVX512_DATATYPE h5;
    1563             :         __AVX512_DATATYPE h6;
    1564             : 
    1565           0 :         for(i = 6; i < nb; i++)
    1566             :         {
    1567           0 :                 h1 = _AVX512_SET1(hh[i-5]);
    1568           0 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
    1569           0 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    1570           0 :                 q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
    1571             : 
    1572           0 :                 x1 = _AVX512_FMA(q1, h1, x1);
    1573           0 :                 x2 = _AVX512_FMA(q2, h1, x2);
    1574           0 :                 x3 = _AVX512_FMA(q3, h1, x3);
    1575             : 
    1576           0 :                 h2 = _AVX512_SET1(hh[ldh+i-4]);
    1577             : 
    1578           0 :                 y1 = _AVX512_FMA(q1, h2, y1);
    1579           0 :                 y2 = _AVX512_FMA(q2, h2, y2);
    1580           0 :                 y3 = _AVX512_FMA(q3, h2, y3);
    1581             : 
    1582           0 :                 h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
    1583             : 
    1584           0 :                 z1 = _AVX512_FMA(q1, h3, z1);
    1585           0 :                 z2 = _AVX512_FMA(q2, h3, z2);
    1586           0 :                 z3 = _AVX512_FMA(q3, h3, z3);
    1587             : 
    1588           0 :                 h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
    1589             : 
    1590           0 :                 w1 = _AVX512_FMA(q1, h4, w1);
    1591           0 :                 w2 = _AVX512_FMA(q2, h4, w2);
    1592           0 :                 w3 = _AVX512_FMA(q3, h4, w3);
    1593             : 
    1594           0 :                 h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
    1595             : 
    1596           0 :                 v1 = _AVX512_FMA(q1, h5, v1);
    1597           0 :                 v2 = _AVX512_FMA(q2, h5, v2);
    1598           0 :                 v3 = _AVX512_FMA(q3, h5, v3);
    1599             : 
    1600           0 :                 h6 = _AVX512_SET1(hh[(ldh*5)+i]);
    1601             : 
    1602           0 :                 t1 = _AVX512_FMA(q1, h6, t1);
    1603           0 :                 t2 = _AVX512_FMA(q2, h6, t2);
    1604           0 :                 t3 = _AVX512_FMA(q3, h6, t3);
    1605             :         }
    1606             : 
    1607           0 :         h1 = _AVX512_SET1(hh[nb-5]);
    1608           0 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
    1609           0 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
    1610           0 :         q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
    1611             : 
    1612           0 :         x1 = _AVX512_FMA(q1, h1, x1);
    1613           0 :         x2 = _AVX512_FMA(q2, h1, x2);
    1614           0 :         x3 = _AVX512_FMA(q3, h1, x3);
    1615             : 
    1616           0 :         h2 = _AVX512_SET1(hh[ldh+nb-4]);
    1617             : 
    1618           0 :         y1 = _AVX512_FMA(q1, h2, y1);
    1619           0 :         y2 = _AVX512_FMA(q2, h2, y2);
    1620           0 :         y3 = _AVX512_FMA(q3, h2, y3);
    1621             : 
    1622           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
    1623             : 
    1624           0 :         z1 = _AVX512_FMA(q1, h3, z1);
    1625           0 :         z2 = _AVX512_FMA(q2, h3, z2);
    1626           0 :         z3 = _AVX512_FMA(q3, h3, z3);
    1627             : 
    1628           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
    1629             : 
    1630           0 :         w1 = _AVX512_FMA(q1, h4, w1);
    1631           0 :         w2 = _AVX512_FMA(q2, h4, w2);
    1632           0 :         w3 = _AVX512_FMA(q3, h4, w3);
    1633             : 
    1634           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
    1635             : 
    1636           0 :         v1 = _AVX512_FMA(q1, h5, v1);
    1637           0 :         v2 = _AVX512_FMA(q2, h5, v2);
    1638           0 :         v3 = _AVX512_FMA(q3, h5, v3);
    1639             : 
    1640           0 :         h1 = _AVX512_SET1(hh[nb-4]);
    1641             : 
    1642           0 :         q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
    1643           0 :         q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
    1644           0 :         q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
    1645             : 
    1646           0 :         x1 = _AVX512_FMA(q1, h1, x1);
    1647           0 :         x2 = _AVX512_FMA(q2, h1, x2);
    1648           0 :         x3 = _AVX512_FMA(q3, h1, x3);
    1649             : 
    1650           0 :         h2 = _AVX512_SET1(hh[ldh+nb-3]);
    1651             : 
    1652           0 :         y1 = _AVX512_FMA(q1, h2, y1);
    1653           0 :         y2 = _AVX512_FMA(q2, h2, y2);
    1654           0 :         y3 = _AVX512_FMA(q3, h2, y3);
    1655             : 
    1656           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
    1657             : 
    1658           0 :         z1 = _AVX512_FMA(q1, h3, z1);
    1659           0 :         z2 = _AVX512_FMA(q2, h3, z2);
    1660           0 :         z3 = _AVX512_FMA(q3, h3, z3);
    1661             : 
    1662           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
    1663             : 
    1664           0 :         w1 = _AVX512_FMA(q1, h4, w1);
    1665           0 :         w2 = _AVX512_FMA(q2, h4, w2);
    1666           0 :         w3 = _AVX512_FMA(q3, h4, w3);
    1667             : 
    1668           0 :         h1 = _AVX512_SET1(hh[nb-3]);
    1669           0 :         q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
    1670           0 :         q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
    1671           0 :         q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
    1672             : 
    1673           0 :         x1 = _AVX512_FMA(q1, h1, x1);
    1674           0 :         x2 = _AVX512_FMA(q2, h1, x2);
    1675           0 :         x3 = _AVX512_FMA(q3, h1, x3);
    1676             : 
    1677           0 :         h2 = _AVX512_SET1(hh[ldh+nb-2]);
    1678             : 
    1679           0 :         y1 = _AVX512_FMA(q1, h2, y1);
    1680           0 :         y2 = _AVX512_FMA(q2, h2, y2);
    1681           0 :         y3 = _AVX512_FMA(q3, h2, y3);
    1682             : 
    1683           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
    1684             : 
    1685           0 :         z1 = _AVX512_FMA(q1, h3, z1);
    1686           0 :         z2 = _AVX512_FMA(q2, h3, z2);
    1687           0 :         z3 = _AVX512_FMA(q3, h3, z3);
    1688             : 
    1689           0 :         h1 = _AVX512_SET1(hh[nb-2]);
    1690           0 :         q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
    1691           0 :         q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
    1692           0 :         q3 = _AVX512_LOAD(&q[((nb+3)*ldq)+2*offset]);
    1693             : 
    1694           0 :         x1 = _AVX512_FMA(q1, h1, x1);
    1695           0 :         x2 = _AVX512_FMA(q2, h1, x2);
    1696           0 :         x3 = _AVX512_FMA(q3, h1, x3);
    1697             : 
    1698           0 :         h2 = _AVX512_SET1(hh[ldh+nb-1]);
    1699             : 
    1700           0 :         y1 = _AVX512_FMA(q1, h2, y1);
    1701           0 :         y2 = _AVX512_FMA(q2, h2, y2);
    1702           0 :         y3 = _AVX512_FMA(q3, h2, y3);
    1703             : 
    1704           0 :         h1 = _AVX512_SET1(hh[nb-1]);
    1705           0 :         q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
    1706           0 :         q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
    1707           0 :         q3 = _AVX512_LOAD(&q[((nb+4)*ldq)+2*offset]);
    1708             : 
    1709           0 :         x1 = _AVX512_FMA(q1, h1, x1);
    1710           0 :         x2 = _AVX512_FMA(q2, h1, x2);
    1711           0 :         x3 = _AVX512_FMA(q3, h1, x3);
    1712             : 
    1713             :         /////////////////////////////////////////////////////
    1714             :         // Apply tau, correct wrong calculation using pre-calculated scalar products
    1715             :         /////////////////////////////////////////////////////
    1716             : 
    1717           0 :         __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
    1718           0 :         x1 = _AVX512_MUL(x1, tau1);
    1719           0 :         x2 = _AVX512_MUL(x2, tau1);
    1720           0 :         x3 = _AVX512_MUL(x3, tau1);
    1721             : 
    1722           0 :         __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
    1723           0 :         __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
    1724           0 :         h2 = _AVX512_MUL(tau2, vs_1_2);
    1725             : 
    1726           0 :         y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
    1727           0 :         y2 = _AVX512_FMSUB(y2, tau2, _AVX512_MUL(x2,h2));
    1728           0 :         y3 = _AVX512_FMSUB(y3, tau2, _AVX512_MUL(x3,h2));
    1729             : 
    1730           0 :         __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
    1731           0 :         __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
    1732           0 :         __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
    1733             : 
    1734           0 :         h2 = _AVX512_MUL(tau3, vs_1_3);
    1735           0 :         h3 = _AVX512_MUL(tau3, vs_2_3);
    1736             : 
    1737           0 :         z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
    1738           0 :         z2 = _AVX512_FMSUB(z2, tau3, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
    1739           0 :         z3 = _AVX512_FMSUB(z3, tau3, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)));
    1740             : 
    1741           0 :         __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
    1742           0 :         __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
    1743           0 :         __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
    1744             : 
    1745           0 :         h2 = _AVX512_MUL(tau4, vs_1_4);
    1746           0 :         h3 = _AVX512_MUL(tau4, vs_2_4);
    1747             : 
    1748           0 :         __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
    1749           0 :         h4 = _AVX512_MUL(tau4, vs_3_4);
    1750             : 
    1751           0 :         w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
    1752           0 :         w2 = _AVX512_FMSUB(w2, tau4, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
    1753           0 :         w3 = _AVX512_FMSUB(w3, tau4, _AVX512_FMA(z3, h4, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
    1754             : 
    1755           0 :         __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
    1756           0 :         __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
    1757           0 :         __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
    1758             : 
    1759           0 :         h2 = _AVX512_MUL(tau5, vs_1_5);
    1760           0 :         h3 = _AVX512_MUL(tau5, vs_2_5);
    1761             : 
    1762           0 :         __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
    1763           0 :         __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
    1764             : 
    1765           0 :         h4 = _AVX512_MUL(tau5, vs_3_5);
    1766           0 :         h5 = _AVX512_MUL(tau5, vs_4_5);
    1767             : 
    1768           0 :         v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
    1769           0 :         v2 = _AVX512_FMSUB(v2, tau5, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
    1770           0 :         v3 = _AVX512_FMSUB(v3, tau5, _AVX512_ADD(_AVX512_FMA(w3, h5, _AVX512_MUL(z3,h4)), _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
    1771             : 
    1772           0 :         __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
    1773           0 :         __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
    1774           0 :         __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
    1775           0 :         h2 = _AVX512_MUL(tau6, vs_1_6);
    1776           0 :         h3 = _AVX512_MUL(tau6, vs_2_6);
    1777             : 
    1778           0 :         __AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
    1779           0 :         __AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
    1780           0 :         __AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);
    1781             : 
    1782           0 :         h4 = _AVX512_MUL(tau6, vs_3_6);
    1783           0 :         h5 = _AVX512_MUL(tau6, vs_4_6);
    1784           0 :         h6 = _AVX512_MUL(tau6, vs_5_6);
    1785             : 
    1786           0 :         t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));
    1787           0 :         t2 = _AVX512_FMSUB(t2, tau6, _AVX512_FMA(v2, h6, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)))));
    1788           0 :         t3 = _AVX512_FMSUB(t3, tau6, _AVX512_FMA(v3, h6, _AVX512_ADD(_AVX512_FMA(w3, h5, _AVX512_MUL(z3,h4)), _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)))));
    1789             : 
    1790             :         /////////////////////////////////////////////////////
    1791             :         // Rank-1 update of Q [8 x nb+3]
    1792             :         /////////////////////////////////////////////////////
    1793             : 
    1794           0 :         q1 = _AVX512_LOAD(&q[0]);
    1795           0 :         q2 = _AVX512_LOAD(&q[0+offset]);
    1796           0 :         q3 = _AVX512_LOAD(&q[0+2*offset]);
    1797             : 
    1798           0 :         q1 = _AVX512_SUB(q1, t1);
    1799           0 :         q2 = _AVX512_SUB(q2, t2);
    1800           0 :         q3 = _AVX512_SUB(q3, t3);
    1801             : 
    1802             :         _AVX512_STORE(&q[0],q1);
    1803           0 :         _AVX512_STORE(&q[0+offset],q2);
    1804           0 :         _AVX512_STORE(&q[0+2*offset],q3);
    1805             : 
    1806           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+1]);
    1807           0 :         q1 = _AVX512_LOAD(&q[ldq]);
    1808           0 :         q2 = _AVX512_LOAD(&q[ldq+offset]);
    1809           0 :         q3 = _AVX512_LOAD(&q[ldq+2*offset]);
    1810             : 
    1811           0 :         q1 = _AVX512_SUB(q1, v1);
    1812           0 :         q2 = _AVX512_SUB(q2, v2);
    1813           0 :         q3 = _AVX512_SUB(q3, v3);
    1814             : 
    1815           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1816           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1817           0 :         q3 = _AVX512_NFMA(t3, h6, q3);
    1818             : 
    1819           0 :         _AVX512_STORE(&q[ldq],q1);
    1820           0 :         _AVX512_STORE(&q[ldq+offset],q2);
    1821           0 :         _AVX512_STORE(&q[ldq+2*offset],q3);
    1822             : 
    1823           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+1]);
    1824           0 :         q1 = _AVX512_LOAD(&q[ldq*2]);
    1825           0 :         q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
    1826           0 :         q3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
    1827             : 
    1828           0 :         q1 = _AVX512_SUB(q1, w1);
    1829           0 :         q2 = _AVX512_SUB(q2, w2);
    1830           0 :         q3 = _AVX512_SUB(q3, w3);
    1831             : 
    1832           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1833           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1834           0 :         q3 = _AVX512_NFMA(v3, h5, q3);
    1835             : 
    1836           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+2]);
    1837             : 
    1838           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1839           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1840           0 :         q3 = _AVX512_NFMA(t3, h6, q3);
    1841             : 
    1842           0 :         _AVX512_STORE(&q[ldq*2],q1);
    1843           0 :         _AVX512_STORE(&q[(ldq*2)+offset],q2);
    1844           0 :         _AVX512_STORE(&q[(ldq*2)+2*offset],q3);
    1845             : 
    1846           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+1]);
    1847           0 :         q1 = _AVX512_LOAD(&q[ldq*3]);
    1848           0 :         q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
    1849           0 :         q3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
    1850             : 
    1851           0 :         q1 = _AVX512_SUB(q1, z1);
    1852           0 :         q2 = _AVX512_SUB(q2, z2);
    1853           0 :         q3 = _AVX512_SUB(q3, z3);
    1854             : 
    1855           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    1856           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    1857           0 :         q3 = _AVX512_NFMA(w3, h4, q3);
    1858             : 
    1859           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+2]);
    1860             : 
    1861           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1862           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1863           0 :         q3 = _AVX512_NFMA(v3, h5, q3);
    1864             : 
    1865           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+3]);
    1866             : 
    1867           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1868           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1869           0 :         q3 = _AVX512_NFMA(t3, h6, q3);
    1870             : 
    1871           0 :         _AVX512_STORE(&q[ldq*3],q1);
    1872           0 :         _AVX512_STORE(&q[(ldq*3)+offset],q2);
    1873           0 :         _AVX512_STORE(&q[(ldq*3)+2*offset],q3);
    1874             : 
    1875           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+1]);
    1876           0 :         q1 = _AVX512_LOAD(&q[ldq*4]);
    1877           0 :         q2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
    1878           0 :         q3 = _AVX512_LOAD(&q[(ldq*4)+2*offset]);
    1879             : 
    1880           0 :         q1 = _AVX512_SUB(q1, y1);
    1881           0 :         q2 = _AVX512_SUB(q2, y2);
    1882           0 :         q3 = _AVX512_SUB(q3, y3);
    1883             : 
    1884           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    1885           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    1886           0 :         q3 = _AVX512_NFMA(z3, h3, q3);
    1887             : 
    1888           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+2]);
    1889             : 
    1890           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    1891           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    1892           0 :         q3 = _AVX512_NFMA(w3, h4, q3);
    1893             : 
    1894           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+3]);
    1895             : 
    1896           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1897           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1898           0 :         q3 = _AVX512_NFMA(v3, h5, q3);
    1899             : 
    1900           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+4]);
    1901             : 
    1902           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1903           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1904           0 :         q3 = _AVX512_NFMA(t3, h6, q3);
    1905             : 
    1906           0 :         _AVX512_STORE(&q[ldq*4],q1);
    1907           0 :         _AVX512_STORE(&q[(ldq*4)+offset],q2);
    1908           0 :         _AVX512_STORE(&q[(ldq*4)+2*offset],q3);
    1909             : 
    1910           0 :         h2 = _AVX512_SET1(hh[(ldh)+1]);
    1911           0 :         q1 = _AVX512_LOAD(&q[ldq*5]);
    1912           0 :         q2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
    1913           0 :         q3 = _AVX512_LOAD(&q[(ldq*5)+2*offset]);
    1914             : 
    1915           0 :         q1 = _AVX512_SUB(q1, x1);
    1916           0 :         q2 = _AVX512_SUB(q2, x2);
    1917           0 :         q3 = _AVX512_SUB(q3, x3);
    1918             : 
    1919           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    1920           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    1921           0 :         q3 = _AVX512_NFMA(y3, h2, q3);
    1922             : 
    1923           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+2]);
    1924             : 
    1925           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    1926           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    1927           0 :         q3 = _AVX512_NFMA(z3, h3, q3);
    1928             : 
    1929           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+3]);
    1930             : 
    1931           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    1932           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    1933           0 :         q3 = _AVX512_NFMA(w3, h4, q3);
    1934             : 
    1935           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+4]);
    1936             : 
    1937           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    1938           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    1939           0 :         q3 = _AVX512_NFMA(v3, h5, q3);
    1940             : 
    1941           0 :         h6 = _AVX512_SET1(hh[(ldh*5)+5]);
    1942             : 
    1943           0 :         q1 = _AVX512_NFMA(t1, h6, q1);
    1944           0 :         q2 = _AVX512_NFMA(t2, h6, q2);
    1945           0 :         q3 = _AVX512_NFMA(t3, h6, q3);
    1946             : 
    1947           0 :         _AVX512_STORE(&q[ldq*5],q1);
    1948           0 :         _AVX512_STORE(&q[(ldq*5)+offset],q2);
    1949           0 :         _AVX512_STORE(&q[(ldq*5)+2*offset],q3);
    1950             : 
    1951           0 :         for (i = 6; i < nb; i++)
    1952             :         {
    1953           0 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
    1954           0 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    1955           0 :                 q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
    1956             : 
    1957           0 :                 h1 = _AVX512_SET1(hh[i-5]);
    1958             : 
    1959           0 :                 q1 = _AVX512_NFMA(x1, h1, q1);
    1960           0 :                 q2 = _AVX512_NFMA(x2, h1, q2);
    1961           0 :                 q3 = _AVX512_NFMA(x3, h1, q3);
    1962             : 
    1963           0 :                 h2 = _AVX512_SET1(hh[ldh+i-4]);
    1964             : 
    1965           0 :                 q1 = _AVX512_NFMA(y1, h2, q1);
    1966           0 :                 q2 = _AVX512_NFMA(y2, h2, q2);
    1967           0 :                 q3 = _AVX512_NFMA(y3, h2, q3);
    1968             : 
    1969           0 :                 h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
    1970             : 
    1971           0 :                 q1 = _AVX512_NFMA(z1, h3, q1);
    1972           0 :                 q2 = _AVX512_NFMA(z2, h3, q2);
    1973           0 :                 q3 = _AVX512_NFMA(z3, h3, q3);
    1974             : 
    1975           0 :                 h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
    1976             : 
    1977           0 :                 q1 = _AVX512_NFMA(w1, h4, q1);
    1978           0 :                 q2 = _AVX512_NFMA(w2, h4, q2);
    1979           0 :                 q3 = _AVX512_NFMA(w3, h4, q3);
    1980             : 
    1981           0 :                 h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
    1982             : 
    1983           0 :                 q1 = _AVX512_NFMA(v1, h5, q1);
    1984           0 :                 q2 = _AVX512_NFMA(v2, h5, q2);
    1985           0 :                 q3 = _AVX512_NFMA(v3, h5, q3);
    1986             : 
    1987           0 :                 h6 = _AVX512_SET1(hh[(ldh*5)+i]);
    1988             : 
    1989           0 :                 q1 = _AVX512_NFMA(t1, h6, q1);
    1990           0 :                 q2 = _AVX512_NFMA(t2, h6, q2);
    1991           0 :                 q3 = _AVX512_NFMA(t3, h6, q3);
    1992             : 
    1993           0 :                 _AVX512_STORE(&q[i*ldq],q1);
    1994           0 :                 _AVX512_STORE(&q[(i*ldq)+offset],q2);
    1995           0 :                 _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
    1996             : 
    1997             :         }
    1998             : 
    1999           0 :         h1 = _AVX512_SET1(hh[nb-5]);
    2000           0 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
    2001           0 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
    2002           0 :         q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
    2003             : 
    2004           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2005           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2006           0 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2007             : 
    2008           0 :         h2 = _AVX512_SET1(hh[ldh+nb-4]);
    2009             : 
    2010           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2011           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2012           0 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2013             : 
    2014           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
    2015             : 
    2016           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    2017           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    2018           0 :         q3 = _AVX512_NFMA(z3, h3, q3);
    2019             : 
    2020           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
    2021             : 
    2022           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    2023           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    2024           0 :         q3 = _AVX512_NFMA(w3, h4, q3);
    2025             : 
    2026           0 :         h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
    2027             : 
    2028           0 :         q1 = _AVX512_NFMA(v1, h5, q1);
    2029           0 :         q2 = _AVX512_NFMA(v2, h5, q2);
    2030           0 :         q3 = _AVX512_NFMA(v3, h5, q3);
    2031             : 
    2032           0 :         _AVX512_STORE(&q[nb*ldq],q1);
    2033           0 :         _AVX512_STORE(&q[(nb*ldq)+offset],q2);
    2034           0 :         _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
    2035             : 
    2036           0 :         h1 = _AVX512_SET1(hh[nb-4]);
    2037           0 :         q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
    2038           0 :         q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
    2039           0 :         q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
    2040             : 
    2041           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2042           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2043           0 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2044             : 
    2045           0 :         h2 = _AVX512_SET1(hh[ldh+nb-3]);
    2046             : 
    2047           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2048           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2049           0 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2050             : 
    2051           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
    2052             : 
    2053           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    2054           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    2055           0 :         q3 = _AVX512_NFMA(z3, h3, q3);
    2056             : 
    2057           0 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
    2058             : 
    2059           0 :         q1 = _AVX512_NFMA(w1, h4, q1);
    2060           0 :         q2 = _AVX512_NFMA(w2, h4, q2);
    2061           0 :         q3 = _AVX512_NFMA(w3, h4, q3);
    2062             : 
    2063           0 :         _AVX512_STORE(&q[(nb+1)*ldq],q1);
    2064           0 :         _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
    2065           0 :         _AVX512_STORE(&q[((nb+1)*ldq)+2*offset],q3);
    2066             : 
    2067           0 :         h1 = _AVX512_SET1(hh[nb-3]);
    2068           0 :         q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
    2069           0 :         q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
    2070           0 :         q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
    2071             : 
    2072           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2073           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2074           0 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2075             : 
    2076           0 :         h2 = _AVX512_SET1(hh[ldh+nb-2]);
    2077             : 
    2078           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2079           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2080           0 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2081             : 
    2082           0 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
    2083             : 
    2084           0 :         q1 = _AVX512_NFMA(z1, h3, q1);
    2085           0 :         q2 = _AVX512_NFMA(z2, h3, q2);
    2086           0 :         q3 = _AVX512_NFMA(z3, h3, q3);
    2087             : 
    2088           0 :         _AVX512_STORE(&q[(nb+2)*ldq],q1);
    2089           0 :         _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
    2090           0 :         _AVX512_STORE(&q[((nb+2)*ldq)+2*offset],q3);
    2091             : 
    2092           0 :         h1 = _AVX512_SET1(hh[nb-2]);
    2093           0 :         q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
    2094           0 :         q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
    2095           0 :         q3 = _AVX512_LOAD(&q[((nb+3)*ldq)+2*offset]);
    2096             : 
    2097           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2098           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2099           0 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2100             : 
    2101           0 :         h2 = _AVX512_SET1(hh[ldh+nb-1]);
    2102             : 
    2103           0 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2104           0 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2105           0 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2106             : 
    2107           0 :         _AVX512_STORE(&q[(nb+3)*ldq],q1);
    2108           0 :         _AVX512_STORE(&q[((nb+3)*ldq)+offset],q2);
    2109           0 :         _AVX512_STORE(&q[((nb+3)*ldq)+2*offset],q3);
    2110             : 
    2111           0 :         h1 = _AVX512_SET1(hh[nb-1]);
    2112           0 :         q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
    2113           0 :         q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
    2114           0 :         q3 = _AVX512_LOAD(&q[((nb+4)*ldq)+2*offset]);
    2115             : 
    2116           0 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2117           0 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2118           0 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2119             : 
    2120           0 :         _AVX512_STORE(&q[(nb+4)*ldq],q1);
    2121           0 :         _AVX512_STORE(&q[((nb+4)*ldq)+offset],q2);
    2122           0 :         _AVX512_STORE(&q[((nb+4)*ldq)+2*offset],q3);
    2123             : 
    2124             : }
    2125             : 
    2126             : /**
    2127             :  * Unrolled kernel that computes
    2128             : #ifdef DOUBLE_PRECISION_REAL
    2129             :  * 32 rows of Q simultaneously, a
    2130             : #endif
    2131             : #ifdef SINGLE_PRECISION_REAL
    2132             :  * 64 rows of Q simultaneously, a
    2133             : #endif
    2134             :  * matrix-vector product with six Householder
    2135             :  * vectors + a rank 1 update is performed
    2136             :  */
    2137             : #ifdef DOUBLE_PRECISION_REAL
    2138             : __forceinline void hh_trafo_kernel_32_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
    2139             : #endif
    2140             : #ifdef SINGLE_PRECISION_REAL
    2141             : __forceinline void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
    2142             : #endif
    2143             : {
    2144             :         /////////////////////////////////////////////////////
    2145             :         // Matrix Vector Multiplication, Q [8 x nb+3] * hh
    2146             :         // hh contains four householder vectors
    2147             :         /////////////////////////////////////////////////////
    2148             :         int i;
    2149             : 
    2150      194560 :         __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
    2151      194560 :         __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
    2152      194560 :         __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
    2153      194560 :         __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
    2154      194560 :         __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
    2155       97280 :         __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);
    2156             : 
    2157      194560 :         __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
    2158      194560 :         __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
    2159      194560 :         __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
    2160      194560 :         __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
    2161      194560 :         __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);
    2162             : 
    2163             : //        register __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
    2164       97280 :         __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
    2165             : 
    2166       97280 :         t1 = _AVX512_FMA(a4_1, h_6_4, t1);
    2167       97280 :         t1 = _AVX512_FMA(a3_1, h_6_3, t1);
    2168       97280 :         t1 = _AVX512_FMA(a2_1, h_6_2, t1);
    2169       97280 :         t1 = _AVX512_FMA(a1_1, h_6_1, t1);
    2170             : 
    2171      194560 :         __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
    2172      194560 :         __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
    2173      194560 :         __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
    2174      194560 :         __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
    2175             : 
    2176             : //        register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
    2177       97280 :         __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
    2178             : 
    2179       97280 :         v1 = _AVX512_FMA(a3_1, h_5_3, v1);
    2180       97280 :         v1 = _AVX512_FMA(a2_1, h_5_2, v1);
    2181       97280 :         v1 = _AVX512_FMA(a1_1, h_5_1, v1);
    2182             : 
    2183      194560 :         __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
    2184      194560 :         __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
    2185      194560 :         __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
    2186             : 
    2187             : //        register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
    2188       97280 :         __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
    2189             : 
    2190       97280 :         w1 = _AVX512_FMA(a2_1, h_4_2, w1);
    2191       97280 :         w1 = _AVX512_FMA(a1_1, h_4_1, w1);
    2192             : 
    2193      194560 :         __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
    2194      194560 :         __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
    2195      194560 :         __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
    2196             : 
    2197             : //        register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
    2198       97280 :         __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
    2199             : 
    2200       97280 :         z1 = _AVX512_FMA(a1_1, h_3_1, z1);
    2201             : //        register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
    2202       97280 :         __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
    2203             : 
    2204             : 
    2205             : //        register __AVX512_DATATYPE x1 = a1_1;
    2206       97280 :         __AVX512_DATATYPE x1 = a1_1;
    2207             : 
    2208             : 
    2209             : 
    2210      194560 :         __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
    2211      194560 :         __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
    2212      194560 :         __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
    2213      194560 :         __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
    2214      194560 :         __AVX512_DATATYPE a5_2 = _AVX512_LOAD(&q[(ldq)+offset]);
    2215      194560 :         __AVX512_DATATYPE a6_2 = _AVX512_LOAD(&q[0+offset]);
    2216             : 
    2217             : //        register __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
    2218       97280 :          __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
    2219             : 
    2220       97280 :         t2 = _AVX512_FMA(a4_2, h_6_4, t2);
    2221       97280 :         t2 = _AVX512_FMA(a3_2, h_6_3, t2);
    2222       97280 :         t2 = _AVX512_FMA(a2_2, h_6_2, t2);
    2223       97280 :         t2 = _AVX512_FMA(a1_2, h_6_1, t2);
    2224             : 
    2225             : //        register __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
    2226       97280 :         __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
    2227             : 
    2228       97280 :         v2 = _AVX512_FMA(a3_2, h_5_3, v2);
    2229       97280 :         v2 = _AVX512_FMA(a2_2, h_5_2, v2);
    2230       97280 :         v2 = _AVX512_FMA(a1_2, h_5_1, v2);
    2231             : 
    2232             : //        register __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
    2233       97280 :         __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
    2234             : 
    2235       97280 :         w2 = _AVX512_FMA(a2_2, h_4_2, w2);
    2236       97280 :         w2 = _AVX512_FMA(a1_2, h_4_1, w2);
    2237             : 
    2238             : //        register __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
    2239       97280 :          __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
    2240             : 
    2241       97280 :         z2 = _AVX512_FMA(a1_2, h_3_1, z2);
    2242             : //        register __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
    2243       97280 :         __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
    2244             : 
    2245             : 
    2246             : //        register __AVX512_DATATYPE x2 = a1_2;
    2247       97280 :         __AVX512_DATATYPE x2 = a1_2;
    2248             : 
    2249             : 
    2250      194560 :         __AVX512_DATATYPE a1_3 = _AVX512_LOAD(&q[(ldq*5)+2*offset]);
    2251      194560 :         __AVX512_DATATYPE a2_3 = _AVX512_LOAD(&q[(ldq*4)+2*offset]);
    2252      194560 :         __AVX512_DATATYPE a3_3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
    2253      194560 :         __AVX512_DATATYPE a4_3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
    2254      194560 :         __AVX512_DATATYPE a5_3 = _AVX512_LOAD(&q[(ldq)+2*offset]);
    2255      194560 :         __AVX512_DATATYPE a6_3 = _AVX512_LOAD(&q[0+2*offset]);
    2256             : 
    2257             : //        register __AVX512_DATATYPE t3 = _AVX512_FMA(a5_3, h_6_5, a6_3);
    2258       97280 :         __AVX512_DATATYPE t3 = _AVX512_FMA(a5_3, h_6_5, a6_3);
    2259             : 
    2260       97280 :         t3 = _AVX512_FMA(a4_3, h_6_4, t3);
    2261       97280 :         t3 = _AVX512_FMA(a3_3, h_6_3, t3);
    2262       97280 :         t3 = _AVX512_FMA(a2_3, h_6_2, t3);
    2263       97280 :         t3 = _AVX512_FMA(a1_3, h_6_1, t3);
    2264             : 
    2265             : //        register __AVX512_DATATYPE v3 = _AVX512_FMA(a4_3, h_5_4, a5_3);
    2266       97280 :         __AVX512_DATATYPE v3 = _AVX512_FMA(a4_3, h_5_4, a5_3);
    2267             : 
    2268       97280 :         v3 = _AVX512_FMA(a3_3, h_5_3, v3);
    2269       97280 :         v3 = _AVX512_FMA(a2_3, h_5_2, v3);
    2270       97280 :         v3 = _AVX512_FMA(a1_3, h_5_1, v3);
    2271             : 
    2272             : //        register __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
    2273       97280 :         __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
    2274             : 
    2275       97280 :         w3 = _AVX512_FMA(a2_3, h_4_2, w3);
    2276       97280 :         w3 = _AVX512_FMA(a1_3, h_4_1, w3);
    2277             : 
    2278             : //        register __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
    2279       97280 :         __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
    2280             : 
    2281       97280 :         z3 = _AVX512_FMA(a1_3, h_3_1, z3);
    2282             : //        register __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
    2283       97280 :         __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
    2284             : 
    2285             : 
    2286             : //        register __AVX512_DATATYPE x3 = a1_3;
    2287       97280 :         __AVX512_DATATYPE x3 = a1_3;
    2288             : 
    2289             : 
    2290      194560 :         __AVX512_DATATYPE a1_4 = _AVX512_LOAD(&q[(ldq*5)+3*offset]);
    2291      194560 :         __AVX512_DATATYPE a2_4 = _AVX512_LOAD(&q[(ldq*4)+3*offset]);
    2292      194560 :         __AVX512_DATATYPE a3_4 = _AVX512_LOAD(&q[(ldq*3)+3*offset]);
    2293      194560 :         __AVX512_DATATYPE a4_4 = _AVX512_LOAD(&q[(ldq*2)+3*offset]);
    2294      194560 :         __AVX512_DATATYPE a5_4 = _AVX512_LOAD(&q[(ldq)+3*offset]);
    2295      194560 :         __AVX512_DATATYPE a6_4 = _AVX512_LOAD(&q[0+3*offset]);
    2296             : 
    2297             : //        register __AVX512_DATATYPE t4 = _AVX512_FMA(a5_4, h_6_5, a6_4);
    2298       97280 :         __AVX512_DATATYPE t4 = _AVX512_FMA(a5_4, h_6_5, a6_4);
    2299             : 
    2300       97280 :         t4 = _AVX512_FMA(a4_4, h_6_4, t4);
    2301       97280 :         t4 = _AVX512_FMA(a3_4, h_6_3, t4);
    2302       97280 :         t4 = _AVX512_FMA(a2_4, h_6_2, t4);
    2303       97280 :         t4 = _AVX512_FMA(a1_4, h_6_1, t4);
    2304             : 
    2305             : //        register __AVX512_DATATYPE v4 = _AVX512_FMA(a4_4, h_5_4, a5_4);
    2306       97280 :         __AVX512_DATATYPE v4 = _AVX512_FMA(a4_4, h_5_4, a5_4);
    2307             : 
    2308       97280 :         v4 = _AVX512_FMA(a3_4, h_5_3, v4);
    2309       97280 :         v4 = _AVX512_FMA(a2_4, h_5_2, v4);
    2310       97280 :         v4 = _AVX512_FMA(a1_4, h_5_1, v4);
    2311             : 
    2312             : //        register __AVX512_DATATYPE w4 = _AVX512_FMA(a3_4, h_4_3, a4_4);
    2313       97280 :         __AVX512_DATATYPE w4 = _AVX512_FMA(a3_4, h_4_3, a4_4);
    2314             : 
    2315       97280 :         w4 = _AVX512_FMA(a2_4, h_4_2, w4);
    2316       97280 :         w4 = _AVX512_FMA(a1_4, h_4_1, w4);
    2317             : 
    2318             : //        register __AVX512_DATATYPE z4 = _AVX512_FMA(a2_4, h_3_2, a3_4);
    2319       97280 :         __AVX512_DATATYPE z4 = _AVX512_FMA(a2_4, h_3_2, a3_4);
    2320             : 
    2321       97280 :         z4 = _AVX512_FMA(a1_4, h_3_1, z4);
    2322             : //        register __AVX512_DATATYPE y4 = _AVX512_FMA(a1_4, h_2_1, a2_4);
    2323       97280 :          __AVX512_DATATYPE y4 = _AVX512_FMA(a1_4, h_2_1, a2_4);
    2324             : 
    2325             : 
    2326             : //        register __AVX512_DATATYPE x4 = a1_4;
    2327       97280 :         __AVX512_DATATYPE x4 = a1_4;
    2328             : 
    2329             : 
    2330             :         __AVX512_DATATYPE q1;
    2331             :         __AVX512_DATATYPE q2;
    2332             :         __AVX512_DATATYPE q3;
    2333             :         __AVX512_DATATYPE q4;
    2334             : 
    2335             :         __AVX512_DATATYPE h1;
    2336             :         __AVX512_DATATYPE h2;
    2337             :         __AVX512_DATATYPE h3;
    2338             :         __AVX512_DATATYPE h4;
    2339             :         __AVX512_DATATYPE h5;
    2340             :         __AVX512_DATATYPE h6;
    2341             : 
    2342     5739520 :         for(i = 6; i < nb; i++)
    2343             :         {
    2344    11284480 :                 h1 = _AVX512_SET1(hh[i-5]);
    2345    11284480 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
    2346    11284480 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    2347    11284480 :                 q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
    2348    11284480 :                 q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
    2349             : 
    2350     5642240 :                 x1 = _AVX512_FMA(q1, h1, x1);
    2351     5642240 :                 x2 = _AVX512_FMA(q2, h1, x2);
    2352     5642240 :                 x3 = _AVX512_FMA(q3, h1, x3);
    2353     5642240 :                 x4 = _AVX512_FMA(q4, h1, x4);
    2354             : 
    2355    11284480 :                 h2 = _AVX512_SET1(hh[ldh+i-4]);
    2356             : 
    2357     5642240 :                 y1 = _AVX512_FMA(q1, h2, y1);
    2358     5642240 :                 y2 = _AVX512_FMA(q2, h2, y2);
    2359     5642240 :                 y3 = _AVX512_FMA(q3, h2, y3);
    2360     5642240 :                 y4 = _AVX512_FMA(q4, h2, y4);
    2361             : 
    2362    11284480 :                 h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
    2363             : 
    2364     5642240 :                 z1 = _AVX512_FMA(q1, h3, z1);
    2365     5642240 :                 z2 = _AVX512_FMA(q2, h3, z2);
    2366     5642240 :                 z3 = _AVX512_FMA(q3, h3, z3);
    2367     5642240 :                 z4 = _AVX512_FMA(q4, h3, z4);
    2368             : 
    2369    11284480 :                 h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
    2370             : 
    2371     5642240 :                 w1 = _AVX512_FMA(q1, h4, w1);
    2372     5642240 :                 w2 = _AVX512_FMA(q2, h4, w2);
    2373     5642240 :                 w3 = _AVX512_FMA(q3, h4, w3);
    2374     5642240 :                 w4 = _AVX512_FMA(q4, h4, w4);
    2375             : 
    2376    11284480 :                 h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
    2377             : 
    2378     5642240 :                 v1 = _AVX512_FMA(q1, h5, v1);
    2379     5642240 :                 v2 = _AVX512_FMA(q2, h5, v2);
    2380     5642240 :                 v3 = _AVX512_FMA(q3, h5, v3);
    2381     5642240 :                 v4 = _AVX512_FMA(q4, h5, v4);
    2382             : 
    2383    11284480 :                 h6 = _AVX512_SET1(hh[(ldh*5)+i]);
    2384             : 
    2385     5642240 :                 t1 = _AVX512_FMA(q1, h6, t1);
    2386     5642240 :                 t2 = _AVX512_FMA(q2, h6, t2);
    2387     5642240 :                 t3 = _AVX512_FMA(q3, h6, t3);
    2388     5642240 :                 t4 = _AVX512_FMA(q4, h6, t4);
    2389             :         }
    2390             : 
    2391      194560 :         h1 = _AVX512_SET1(hh[nb-5]);
    2392      194560 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
    2393      194560 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
    2394      194560 :         q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
    2395      194560 :         q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
    2396             : 
    2397       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
    2398       97280 :         x2 = _AVX512_FMA(q2, h1, x2);
    2399       97280 :         x3 = _AVX512_FMA(q3, h1, x3);
    2400       97280 :         x4 = _AVX512_FMA(q4, h1, x4);
    2401             : 
    2402      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-4]);
    2403             : 
    2404       97280 :         y1 = _AVX512_FMA(q1, h2, y1);
    2405       97280 :         y2 = _AVX512_FMA(q2, h2, y2);
    2406       97280 :         y3 = _AVX512_FMA(q3, h2, y3);
    2407       97280 :         y4 = _AVX512_FMA(q4, h2, y4);
    2408             : 
    2409      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
    2410             : 
    2411       97280 :         z1 = _AVX512_FMA(q1, h3, z1);
    2412       97280 :         z2 = _AVX512_FMA(q2, h3, z2);
    2413       97280 :         z3 = _AVX512_FMA(q3, h3, z3);
    2414       97280 :         z4 = _AVX512_FMA(q4, h3, z4);
    2415             : 
    2416      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
    2417             : 
    2418       97280 :         w1 = _AVX512_FMA(q1, h4, w1);
    2419       97280 :         w2 = _AVX512_FMA(q2, h4, w2);
    2420       97280 :         w3 = _AVX512_FMA(q3, h4, w3);
    2421       97280 :         w4 = _AVX512_FMA(q4, h4, w4);
    2422             : 
    2423      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
    2424             : 
    2425       97280 :         v1 = _AVX512_FMA(q1, h5, v1);
    2426       97280 :         v2 = _AVX512_FMA(q2, h5, v2);
    2427       97280 :         v3 = _AVX512_FMA(q3, h5, v3);
    2428       97280 :         v4 = _AVX512_FMA(q4, h5, v4);
    2429             : 
    2430      194560 :         h1 = _AVX512_SET1(hh[nb-4]);
    2431             : 
    2432      194560 :         q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
    2433      194560 :         q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
    2434      194560 :         q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
    2435      194560 :         q4 = _AVX512_LOAD(&q[((nb+1)*ldq)+3*offset]);
    2436             : 
    2437       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
    2438       97280 :         x2 = _AVX512_FMA(q2, h1, x2);
    2439       97280 :         x3 = _AVX512_FMA(q3, h1, x3);
    2440       97280 :         x4 = _AVX512_FMA(q4, h1, x4);
    2441             : 
    2442      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-3]);
    2443             : 
    2444       97280 :         y1 = _AVX512_FMA(q1, h2, y1);
    2445       97280 :         y2 = _AVX512_FMA(q2, h2, y2);
    2446       97280 :         y3 = _AVX512_FMA(q3, h2, y3);
    2447       97280 :         y4 = _AVX512_FMA(q4, h2, y4);
    2448             : 
    2449      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
    2450             : 
    2451       97280 :         z1 = _AVX512_FMA(q1, h3, z1);
    2452       97280 :         z2 = _AVX512_FMA(q2, h3, z2);
    2453       97280 :         z3 = _AVX512_FMA(q3, h3, z3);
    2454       97280 :         z4 = _AVX512_FMA(q4, h3, z4);
    2455             : 
    2456      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
    2457             : 
    2458       97280 :         w1 = _AVX512_FMA(q1, h4, w1);
    2459       97280 :         w2 = _AVX512_FMA(q2, h4, w2);
    2460       97280 :         w3 = _AVX512_FMA(q3, h4, w3);
    2461       97280 :         w4 = _AVX512_FMA(q4, h4, w4);
    2462             : 
    2463      194560 :         h1 = _AVX512_SET1(hh[nb-3]);
    2464      194560 :         q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
    2465      194560 :         q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
    2466      194560 :         q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
    2467      194560 :         q4 = _AVX512_LOAD(&q[((nb+2)*ldq)+3*offset]);
    2468             : 
    2469       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
    2470       97280 :         x2 = _AVX512_FMA(q2, h1, x2);
    2471       97280 :         x3 = _AVX512_FMA(q3, h1, x3);
    2472       97280 :         x4 = _AVX512_FMA(q4, h1, x4);
    2473             : 
    2474      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-2]);
    2475             : 
    2476       97280 :         y1 = _AVX512_FMA(q1, h2, y1);
    2477       97280 :         y2 = _AVX512_FMA(q2, h2, y2);
    2478       97280 :         y3 = _AVX512_FMA(q3, h2, y3);
    2479       97280 :         y4 = _AVX512_FMA(q4, h2, y4);
    2480             : 
    2481      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
    2482             : 
    2483       97280 :         z1 = _AVX512_FMA(q1, h3, z1);
    2484       97280 :         z2 = _AVX512_FMA(q2, h3, z2);
    2485       97280 :         z3 = _AVX512_FMA(q3, h3, z3);
    2486       97280 :         z4 = _AVX512_FMA(q4, h3, z4);
    2487             : 
    2488      194560 :         h1 = _AVX512_SET1(hh[nb-2]);
    2489      194560 :         q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
    2490      194560 :         q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
    2491      194560 :         q3 = _AVX512_LOAD(&q[((nb+3)*ldq)+2*offset]);
    2492      194560 :         q4 = _AVX512_LOAD(&q[((nb+3)*ldq)+3*offset]);
    2493             : 
    2494       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
    2495       97280 :         x2 = _AVX512_FMA(q2, h1, x2);
    2496       97280 :         x3 = _AVX512_FMA(q3, h1, x3);
    2497       97280 :         x4 = _AVX512_FMA(q4, h1, x4);
    2498             : 
    2499      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-1]);
    2500             : 
    2501       97280 :         y1 = _AVX512_FMA(q1, h2, y1);
    2502       97280 :         y2 = _AVX512_FMA(q2, h2, y2);
    2503       97280 :         y3 = _AVX512_FMA(q3, h2, y3);
    2504       97280 :         y4 = _AVX512_FMA(q4, h2, y4);
    2505             : 
    2506      194560 :         h1 = _AVX512_SET1(hh[nb-1]);
    2507      194560 :         q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
    2508      194560 :         q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
    2509      194560 :         q3 = _AVX512_LOAD(&q[((nb+4)*ldq)+2*offset]);
    2510      194560 :         q4 = _AVX512_LOAD(&q[((nb+4)*ldq)+3*offset]);
    2511             : 
    2512       97280 :         x1 = _AVX512_FMA(q1, h1, x1);
    2513       97280 :         x2 = _AVX512_FMA(q2, h1, x2);
    2514       97280 :         x3 = _AVX512_FMA(q3, h1, x3);
    2515       97280 :         x4 = _AVX512_FMA(q4, h1, x4);
    2516             : 
    2517             :         /////////////////////////////////////////////////////
    2518             :         // Apply tau, correct wrong calculation using pre-calculated scalar products
    2519             :         /////////////////////////////////////////////////////
    2520             : 
    2521      194560 :         __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
    2522       97280 :         x1 = _AVX512_MUL(x1, tau1);
    2523       97280 :         x2 = _AVX512_MUL(x2, tau1);
    2524       97280 :         x3 = _AVX512_MUL(x3, tau1);
    2525       97280 :         x4 = _AVX512_MUL(x4, tau1);
    2526             : 
    2527      194560 :         __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
    2528      194560 :         __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
    2529       97280 :         h2 = _AVX512_MUL(tau2, vs_1_2);
    2530             : 
    2531      194560 :         y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
    2532      194560 :         y2 = _AVX512_FMSUB(y2, tau2, _AVX512_MUL(x2,h2));
    2533      194560 :         y3 = _AVX512_FMSUB(y3, tau2, _AVX512_MUL(x3,h2));
    2534      194560 :         y4 = _AVX512_FMSUB(y4, tau2, _AVX512_MUL(x4,h2));
    2535             : 
    2536      194560 :         __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
    2537      194560 :         __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
    2538      194560 :         __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
    2539             : 
    2540       97280 :         h2 = _AVX512_MUL(tau3, vs_1_3);
    2541       97280 :         h3 = _AVX512_MUL(tau3, vs_2_3);
    2542             : 
    2543      291840 :         z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
    2544      291840 :         z2 = _AVX512_FMSUB(z2, tau3, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
    2545      291840 :         z3 = _AVX512_FMSUB(z3, tau3, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)));
    2546      291840 :         z4 = _AVX512_FMSUB(z4, tau3, _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2)));
    2547             : 
    2548      194560 :         __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
    2549      194560 :         __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
    2550      194560 :         __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
    2551             : 
    2552       97280 :         h2 = _AVX512_MUL(tau4, vs_1_4);
    2553       97280 :         h3 = _AVX512_MUL(tau4, vs_2_4);
    2554             : 
    2555      194560 :         __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
    2556       97280 :         h4 = _AVX512_MUL(tau4, vs_3_4);
    2557             : 
    2558      389120 :         w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
    2559      389120 :         w2 = _AVX512_FMSUB(w2, tau4, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
    2560      389120 :         w3 = _AVX512_FMSUB(w3, tau4, _AVX512_FMA(z3, h4, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
    2561      389120 :         w4 = _AVX512_FMSUB(w4, tau4, _AVX512_FMA(z4, h4, _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2))));
    2562             : 
    2563      194560 :         __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
    2564      194560 :         __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
    2565      194560 :         __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
    2566             : 
    2567       97280 :         h2 = _AVX512_MUL(tau5, vs_1_5);
    2568       97280 :         h3 = _AVX512_MUL(tau5, vs_2_5);
    2569             : 
    2570      194560 :         __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
    2571      194560 :         __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
    2572             : 
    2573       97280 :         h4 = _AVX512_MUL(tau5, vs_3_5);
    2574       97280 :         h5 = _AVX512_MUL(tau5, vs_4_5);
    2575             : 
    2576      583680 :         v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
    2577      583680 :         v2 = _AVX512_FMSUB(v2, tau5, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
    2578      583680 :         v3 = _AVX512_FMSUB(v3, tau5, _AVX512_ADD(_AVX512_FMA(w3, h5, _AVX512_MUL(z3,h4)), _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
    2579      583680 :         v4 = _AVX512_FMSUB(v4, tau5, _AVX512_ADD(_AVX512_FMA(w4, h5, _AVX512_MUL(z4,h4)), _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2))));
    2580             : 
    2581      194560 :         __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
    2582      194560 :         __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
    2583      194560 :         __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
    2584       97280 :         h2 = _AVX512_MUL(tau6, vs_1_6);
    2585       97280 :         h3 = _AVX512_MUL(tau6, vs_2_6);
    2586             : 
    2587      194560 :         __AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
    2588      194560 :         __AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
    2589      194560 :         __AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);
    2590             : 
    2591       97280 :         h4 = _AVX512_MUL(tau6, vs_3_6);
    2592       97280 :         h5 = _AVX512_MUL(tau6, vs_4_6);
    2593       97280 :         h6 = _AVX512_MUL(tau6, vs_5_6);
    2594             : 
    2595      680960 :         t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));
    2596      680960 :         t2 = _AVX512_FMSUB(t2, tau6, _AVX512_FMA(v2, h6, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)))));
    2597      680960 :         t3 = _AVX512_FMSUB(t3, tau6, _AVX512_FMA(v3, h6, _AVX512_ADD(_AVX512_FMA(w3, h5, _AVX512_MUL(z3,h4)), _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)))));
    2598      680960 :         t4 = _AVX512_FMSUB(t4, tau6, _AVX512_FMA(v4, h6, _AVX512_ADD(_AVX512_FMA(w4, h5, _AVX512_MUL(z4,h4)), _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2)))));
    2599             : 
    2600             : 
    2601             :         /////////////////////////////////////////////////////
    2602             :         // Rank-6 update of Q [4 vector registers x nb+5 rows]
    2603             :         /////////////////////////////////////////////////////
    2604             : 
    2605       97280 :         q1 = _AVX512_LOAD(&q[0]);
    2606      194560 :         q2 = _AVX512_LOAD(&q[0+offset]);
    2607      194560 :         q3 = _AVX512_LOAD(&q[0+2*offset]);
    2608      194560 :         q4 = _AVX512_LOAD(&q[0+3*offset]);
    2609             : 
    2610       97280 :         q1 = _AVX512_SUB(q1, t1);
    2611       97280 :         q2 = _AVX512_SUB(q2, t2);
    2612       97280 :         q3 = _AVX512_SUB(q3, t3);
    2613       97280 :         q4 = _AVX512_SUB(q4, t4);
    2614             : 
    2615             :         _AVX512_STORE(&q[0],q1);
    2616       97280 :         _AVX512_STORE(&q[0+offset],q2);
    2617       97280 :         _AVX512_STORE(&q[0+2*offset],q3);
    2618       97280 :         _AVX512_STORE(&q[0+3*offset],q4);
    2619             : 
    2620      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+1]);
    2621      194560 :         q1 = _AVX512_LOAD(&q[ldq]);
    2622      194560 :         q2 = _AVX512_LOAD(&q[ldq+offset]);
    2623      194560 :         q3 = _AVX512_LOAD(&q[ldq+2*offset]);
    2624      194560 :         q4 = _AVX512_LOAD(&q[ldq+3*offset]);
    2625             : 
    2626       97280 :         q1 = _AVX512_SUB(q1, v1);
    2627       97280 :         q2 = _AVX512_SUB(q2, v2);
    2628       97280 :         q3 = _AVX512_SUB(q3, v3);
    2629       97280 :         q4 = _AVX512_SUB(q4, v4);
    2630             : 
    2631       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
    2632       97280 :         q2 = _AVX512_NFMA(t2, h6, q2);
    2633       97280 :         q3 = _AVX512_NFMA(t3, h6, q3);
    2634       97280 :         q4 = _AVX512_NFMA(t4, h6, q4);
    2635             : 
    2636       97280 :         _AVX512_STORE(&q[ldq],q1);
    2637       97280 :         _AVX512_STORE(&q[ldq+offset],q2);
    2638       97280 :         _AVX512_STORE(&q[ldq+2*offset],q3);
    2639       97280 :         _AVX512_STORE(&q[ldq+3*offset],q4);
    2640             : 
    2641      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+1]);
    2642      194560 :         q1 = _AVX512_LOAD(&q[ldq*2]);
    2643      194560 :         q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
    2644      194560 :         q3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
    2645      194560 :         q4 = _AVX512_LOAD(&q[(ldq*2)+3*offset]);
    2646             : 
    2647       97280 :         q1 = _AVX512_SUB(q1, w1);
    2648       97280 :         q2 = _AVX512_SUB(q2, w2);
    2649       97280 :         q3 = _AVX512_SUB(q3, w3);
    2650       97280 :         q4 = _AVX512_SUB(q4, w4);
    2651             : 
    2652       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
    2653       97280 :         q2 = _AVX512_NFMA(v2, h5, q2);
    2654       97280 :         q3 = _AVX512_NFMA(v3, h5, q3);
    2655       97280 :         q4 = _AVX512_NFMA(v4, h5, q4);
    2656             : 
    2657      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+2]);
    2658             : 
    2659       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
    2660       97280 :         q2 = _AVX512_NFMA(t2, h6, q2);
    2661       97280 :         q3 = _AVX512_NFMA(t3, h6, q3);
    2662       97280 :         q4 = _AVX512_NFMA(t4, h6, q4);
    2663             : 
    2664       97280 :         _AVX512_STORE(&q[ldq*2],q1);
    2665       97280 :         _AVX512_STORE(&q[(ldq*2)+offset],q2);
    2666       97280 :         _AVX512_STORE(&q[(ldq*2)+2*offset],q3);
    2667       97280 :         _AVX512_STORE(&q[(ldq*2)+3*offset],q4);
    2668             : 
    2669      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+1]);
    2670      194560 :         q1 = _AVX512_LOAD(&q[ldq*3]);
    2671      194560 :         q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
    2672      194560 :         q3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
    2673      194560 :         q4 = _AVX512_LOAD(&q[(ldq*3)+3*offset]);
    2674             : 
    2675       97280 :         q1 = _AVX512_SUB(q1, z1);
    2676       97280 :         q2 = _AVX512_SUB(q2, z2);
    2677       97280 :         q3 = _AVX512_SUB(q3, z3);
    2678       97280 :         q4 = _AVX512_SUB(q4, z4);
    2679             : 
    2680       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
    2681       97280 :         q2 = _AVX512_NFMA(w2, h4, q2);
    2682       97280 :         q3 = _AVX512_NFMA(w3, h4, q3);
    2683       97280 :         q4 = _AVX512_NFMA(w4, h4, q4);
    2684             : 
    2685      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+2]);
    2686             : 
    2687       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
    2688       97280 :         q2 = _AVX512_NFMA(v2, h5, q2);
    2689       97280 :         q3 = _AVX512_NFMA(v3, h5, q3);
    2690       97280 :         q4 = _AVX512_NFMA(v4, h5, q4);
    2691             : 
    2692      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+3]);
    2693             : 
    2694       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
    2695       97280 :         q2 = _AVX512_NFMA(t2, h6, q2);
    2696       97280 :         q3 = _AVX512_NFMA(t3, h6, q3);
    2697       97280 :         q4 = _AVX512_NFMA(t4, h6, q4);
    2698             : 
    2699       97280 :         _AVX512_STORE(&q[ldq*3],q1);
    2700       97280 :         _AVX512_STORE(&q[(ldq*3)+offset],q2);
    2701       97280 :         _AVX512_STORE(&q[(ldq*3)+2*offset],q3);
    2702       97280 :         _AVX512_STORE(&q[(ldq*3)+3*offset],q4);
    2703             : 
    2704      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+1]);
    2705      194560 :         q1 = _AVX512_LOAD(&q[ldq*4]);
    2706      194560 :         q2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
    2707      194560 :         q3 = _AVX512_LOAD(&q[(ldq*4)+2*offset]);
    2708      194560 :         q4 = _AVX512_LOAD(&q[(ldq*4)+3*offset]);
    2709             : 
    2710       97280 :         q1 = _AVX512_SUB(q1, y1);
    2711       97280 :         q2 = _AVX512_SUB(q2, y2);
    2712       97280 :         q3 = _AVX512_SUB(q3, y3);
    2713       97280 :         q4 = _AVX512_SUB(q4, y4);
    2714             : 
    2715       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
    2716       97280 :         q2 = _AVX512_NFMA(z2, h3, q2);
    2717       97280 :         q3 = _AVX512_NFMA(z3, h3, q3);
    2718       97280 :         q4 = _AVX512_NFMA(z4, h3, q4);
    2719             : 
    2720      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+2]);
    2721             : 
    2722       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
    2723       97280 :         q2 = _AVX512_NFMA(w2, h4, q2);
    2724       97280 :         q3 = _AVX512_NFMA(w3, h4, q3);
    2725       97280 :         q4 = _AVX512_NFMA(w4, h4, q4);
    2726             : 
    2727      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+3]);
    2728             : 
    2729       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
    2730       97280 :         q2 = _AVX512_NFMA(v2, h5, q2);
    2731       97280 :         q3 = _AVX512_NFMA(v3, h5, q3);
    2732       97280 :         q4 = _AVX512_NFMA(v4, h5, q4);
    2733             : 
    2734      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+4]);
    2735             : 
    2736       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
    2737       97280 :         q2 = _AVX512_NFMA(t2, h6, q2);
    2738       97280 :         q3 = _AVX512_NFMA(t3, h6, q3);
    2739       97280 :         q4 = _AVX512_NFMA(t4, h6, q4);
    2740             : 
    2741       97280 :         _AVX512_STORE(&q[ldq*4],q1);
    2742       97280 :         _AVX512_STORE(&q[(ldq*4)+offset],q2);
    2743       97280 :         _AVX512_STORE(&q[(ldq*4)+2*offset],q3);
    2744       97280 :         _AVX512_STORE(&q[(ldq*4)+3*offset],q4);
    2745             : 
    2746      194560 :         h2 = _AVX512_SET1(hh[(ldh)+1]);
    2747      194560 :         q1 = _AVX512_LOAD(&q[ldq*5]);
    2748      194560 :         q2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
    2749      194560 :         q3 = _AVX512_LOAD(&q[(ldq*5)+2*offset]);
    2750      194560 :         q4 = _AVX512_LOAD(&q[(ldq*5)+3*offset]);
    2751             : 
    2752       97280 :         q1 = _AVX512_SUB(q1, x1);
    2753       97280 :         q2 = _AVX512_SUB(q2, x2);
    2754       97280 :         q3 = _AVX512_SUB(q3, x3);
    2755       97280 :         q4 = _AVX512_SUB(q4, x4);
    2756             : 
    2757       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2758       97280 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2759       97280 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2760       97280 :         q4 = _AVX512_NFMA(y4, h2, q4);
    2761             : 
    2762      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+2]);
    2763             : 
    2764       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
    2765       97280 :         q2 = _AVX512_NFMA(z2, h3, q2);
    2766       97280 :         q3 = _AVX512_NFMA(z3, h3, q3);
    2767       97280 :         q4 = _AVX512_NFMA(z4, h3, q4);
    2768             : 
    2769      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+3]);
    2770             : 
    2771       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
    2772       97280 :         q2 = _AVX512_NFMA(w2, h4, q2);
    2773       97280 :         q3 = _AVX512_NFMA(w3, h4, q3);
    2774       97280 :         q4 = _AVX512_NFMA(w4, h4, q4);
    2775             : 
    2776      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+4]);
    2777             : 
    2778       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
    2779       97280 :         q2 = _AVX512_NFMA(v2, h5, q2);
    2780       97280 :         q3 = _AVX512_NFMA(v3, h5, q3);
    2781       97280 :         q4 = _AVX512_NFMA(v4, h5, q4);
    2782             : 
    2783      194560 :         h6 = _AVX512_SET1(hh[(ldh*5)+5]);
    2784             : 
    2785       97280 :         q1 = _AVX512_NFMA(t1, h6, q1);
    2786       97280 :         q2 = _AVX512_NFMA(t2, h6, q2);
    2787       97280 :         q3 = _AVX512_NFMA(t3, h6, q3);
    2788       97280 :         q4 = _AVX512_NFMA(t4, h6, q4);
    2789             : 
    2790       97280 :         _AVX512_STORE(&q[ldq*5],q1);
    2791       97280 :         _AVX512_STORE(&q[(ldq*5)+offset],q2);
    2792       97280 :         _AVX512_STORE(&q[(ldq*5)+2*offset],q3);
    2793       97280 :         _AVX512_STORE(&q[(ldq*5)+3*offset],q4);
    2794             : 
    2795     5739520 :         for (i = 6; i < nb; i++)
    2796             :         {
    2797    11284480 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
    2798    11284480 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    2799    11284480 :                 q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
    2800    11284480 :                 q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
    2801             : 
    2802    11284480 :                 h1 = _AVX512_SET1(hh[i-5]);
    2803             : 
    2804     5642240 :                 q1 = _AVX512_NFMA(x1, h1, q1);
    2805     5642240 :                 q2 = _AVX512_NFMA(x2, h1, q2);
    2806     5642240 :                 q3 = _AVX512_NFMA(x3, h1, q3);
    2807     5642240 :                 q4 = _AVX512_NFMA(x4, h1, q4);
    2808             : 
    2809    11284480 :                 h2 = _AVX512_SET1(hh[ldh+i-4]);
    2810             : 
    2811     5642240 :                 q1 = _AVX512_NFMA(y1, h2, q1);
    2812     5642240 :                 q2 = _AVX512_NFMA(y2, h2, q2);
    2813     5642240 :                 q3 = _AVX512_NFMA(y3, h2, q3);
    2814     5642240 :                 q4 = _AVX512_NFMA(y4, h2, q4);
    2815             : 
    2816    11284480 :                 h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
    2817             : 
    2818     5642240 :                 q1 = _AVX512_NFMA(z1, h3, q1);
    2819     5642240 :                 q2 = _AVX512_NFMA(z2, h3, q2);
    2820     5642240 :                 q3 = _AVX512_NFMA(z3, h3, q3);
    2821     5642240 :                 q4 = _AVX512_NFMA(z4, h3, q4);
    2822             : 
    2823    11284480 :                 h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
    2824             : 
    2825     5642240 :                 q1 = _AVX512_NFMA(w1, h4, q1);
    2826     5642240 :                 q2 = _AVX512_NFMA(w2, h4, q2);
    2827     5642240 :                 q3 = _AVX512_NFMA(w3, h4, q3);
    2828     5642240 :                 q4 = _AVX512_NFMA(w4, h4, q4);
    2829             : 
    2830    11284480 :                 h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
    2831             : 
    2832     5642240 :                 q1 = _AVX512_NFMA(v1, h5, q1);
    2833     5642240 :                 q2 = _AVX512_NFMA(v2, h5, q2);
    2834     5642240 :                 q3 = _AVX512_NFMA(v3, h5, q3);
    2835     5642240 :                 q4 = _AVX512_NFMA(v4, h5, q4);
    2836             : 
    2837    11284480 :                 h6 = _AVX512_SET1(hh[(ldh*5)+i]);
    2838             : 
    2839     5642240 :                 q1 = _AVX512_NFMA(t1, h6, q1);
    2840     5642240 :                 q2 = _AVX512_NFMA(t2, h6, q2);
    2841     5642240 :                 q3 = _AVX512_NFMA(t3, h6, q3);
    2842     5642240 :                 q4 = _AVX512_NFMA(t4, h6, q4);
    2843             : 
    2844     5642240 :                 _AVX512_STORE(&q[i*ldq],q1);
    2845     5642240 :                 _AVX512_STORE(&q[(i*ldq)+offset],q2);
    2846     5642240 :                 _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
    2847     5642240 :                 _AVX512_STORE(&q[(i*ldq)+3*offset],q4);
    2848             : 
    2849             :         }
    2850             : 
    2851      194560 :         h1 = _AVX512_SET1(hh[nb-5]);
    2852      194560 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
    2853      194560 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
    2854      194560 :         q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
    2855      194560 :         q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
    2856             : 
    2857       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2858       97280 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2859       97280 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2860       97280 :         q4 = _AVX512_NFMA(x4, h1, q4);
    2861             : 
    2862      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-4]);
    2863             : 
    2864       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2865       97280 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2866       97280 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2867       97280 :         q4 = _AVX512_NFMA(y4, h2, q4);
    2868             : 
    2869      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
    2870             : 
    2871       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
    2872       97280 :         q2 = _AVX512_NFMA(z2, h3, q2);
    2873       97280 :         q3 = _AVX512_NFMA(z3, h3, q3);
    2874       97280 :         q4 = _AVX512_NFMA(z4, h3, q4);
    2875             : 
    2876      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
    2877             : 
    2878       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
    2879       97280 :         q2 = _AVX512_NFMA(w2, h4, q2);
    2880       97280 :         q3 = _AVX512_NFMA(w3, h4, q3);
    2881       97280 :         q4 = _AVX512_NFMA(w4, h4, q4);
    2882             : 
    2883      194560 :         h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
    2884             : 
    2885       97280 :         q1 = _AVX512_NFMA(v1, h5, q1);
    2886       97280 :         q2 = _AVX512_NFMA(v2, h5, q2);
    2887       97280 :         q3 = _AVX512_NFMA(v3, h5, q3);
    2888       97280 :         q4 = _AVX512_NFMA(v4, h5, q4);
    2889             : 
    2890       97280 :         _AVX512_STORE(&q[nb*ldq],q1);
    2891       97280 :         _AVX512_STORE(&q[(nb*ldq)+offset],q2);
    2892       97280 :         _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
    2893       97280 :         _AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
    2894             : 
    2895      194560 :         h1 = _AVX512_SET1(hh[nb-4]);
    2896      194560 :         q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
    2897      194560 :         q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
    2898      194560 :         q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
    2899      194560 :         q4 = _AVX512_LOAD(&q[((nb+1)*ldq)+3*offset]);
    2900             : 
    2901       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2902       97280 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2903       97280 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2904       97280 :         q4 = _AVX512_NFMA(x4, h1, q4);
    2905             : 
    2906      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-3]);
    2907             : 
    2908       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2909       97280 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2910       97280 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2911       97280 :         q4 = _AVX512_NFMA(y4, h2, q4);
    2912             : 
    2913      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
    2914             : 
    2915       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
    2916       97280 :         q2 = _AVX512_NFMA(z2, h3, q2);
    2917       97280 :         q3 = _AVX512_NFMA(z3, h3, q3);
    2918       97280 :         q4 = _AVX512_NFMA(z4, h3, q4);
    2919             : 
    2920      194560 :         h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
    2921             : 
    2922       97280 :         q1 = _AVX512_NFMA(w1, h4, q1);
    2923       97280 :         q2 = _AVX512_NFMA(w2, h4, q2);
    2924       97280 :         q3 = _AVX512_NFMA(w3, h4, q3);
    2925       97280 :         q4 = _AVX512_NFMA(w4, h4, q4);
    2926             : 
    2927       97280 :         _AVX512_STORE(&q[(nb+1)*ldq],q1);
    2928       97280 :         _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
    2929       97280 :         _AVX512_STORE(&q[((nb+1)*ldq)+2*offset],q3);
    2930       97280 :         _AVX512_STORE(&q[((nb+1)*ldq)+3*offset],q4);
    2931             : 
    2932      194560 :         h1 = _AVX512_SET1(hh[nb-3]);
    2933      194560 :         q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
    2934      194560 :         q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
    2935      194560 :         q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
    2936      194560 :         q4 = _AVX512_LOAD(&q[((nb+2)*ldq)+3*offset]);
    2937             : 
    2938       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2939       97280 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2940       97280 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2941       97280 :         q4 = _AVX512_NFMA(x4, h1, q4);
    2942             : 
    2943      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-2]);
    2944             : 
    2945       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2946       97280 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2947       97280 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2948       97280 :         q4 = _AVX512_NFMA(y4, h2, q4);
    2949             : 
    2950      194560 :         h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
    2951             : 
    2952       97280 :         q1 = _AVX512_NFMA(z1, h3, q1);
    2953       97280 :         q2 = _AVX512_NFMA(z2, h3, q2);
    2954       97280 :         q3 = _AVX512_NFMA(z3, h3, q3);
    2955       97280 :         q4 = _AVX512_NFMA(z4, h3, q4);
    2956             : 
    2957       97280 :         _AVX512_STORE(&q[(nb+2)*ldq],q1);
    2958       97280 :         _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
    2959       97280 :         _AVX512_STORE(&q[((nb+2)*ldq)+2*offset],q3);
    2960       97280 :         _AVX512_STORE(&q[((nb+2)*ldq)+3*offset],q4);
    2961             : 
    2962      194560 :         h1 = _AVX512_SET1(hh[nb-2]);
    2963      194560 :         q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
    2964      194560 :         q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
    2965      194560 :         q3 = _AVX512_LOAD(&q[((nb+3)*ldq)+2*offset]);
    2966      194560 :         q4 = _AVX512_LOAD(&q[((nb+3)*ldq)+3*offset]);
    2967             : 
    2968       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2969       97280 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2970       97280 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2971       97280 :         q4 = _AVX512_NFMA(x4, h1, q4);
    2972             : 
    2973      194560 :         h2 = _AVX512_SET1(hh[ldh+nb-1]);
    2974             : 
    2975       97280 :         q1 = _AVX512_NFMA(y1, h2, q1);
    2976       97280 :         q2 = _AVX512_NFMA(y2, h2, q2);
    2977       97280 :         q3 = _AVX512_NFMA(y3, h2, q3);
    2978       97280 :         q4 = _AVX512_NFMA(y4, h2, q4);
    2979             : 
    2980       97280 :         _AVX512_STORE(&q[(nb+3)*ldq],q1);
    2981       97280 :         _AVX512_STORE(&q[((nb+3)*ldq)+offset],q2);
    2982       97280 :         _AVX512_STORE(&q[((nb+3)*ldq)+2*offset],q3);
    2983       97280 :         _AVX512_STORE(&q[((nb+3)*ldq)+3*offset],q4);
    2984             : 
    2985      194560 :         h1 = _AVX512_SET1(hh[nb-1]);
    2986      194560 :         q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
    2987      194560 :         q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
    2988      194560 :         q3 = _AVX512_LOAD(&q[((nb+4)*ldq)+2*offset]);
    2989      194560 :         q4 = _AVX512_LOAD(&q[((nb+4)*ldq)+3*offset]);
    2990             : 
    2991       97280 :         q1 = _AVX512_NFMA(x1, h1, q1);
    2992       97280 :         q2 = _AVX512_NFMA(x2, h1, q2);
    2993       97280 :         q3 = _AVX512_NFMA(x3, h1, q3);
    2994       97280 :         q4 = _AVX512_NFMA(x4, h1, q4);
    2995             : 
    2996       97280 :         _AVX512_STORE(&q[(nb+4)*ldq],q1);
    2997       97280 :         _AVX512_STORE(&q[((nb+4)*ldq)+offset],q2);
    2998       97280 :         _AVX512_STORE(&q[((nb+4)*ldq)+2*offset],q3);
    2999       97280 :         _AVX512_STORE(&q[((nb+4)*ldq)+3*offset],q4);
    3000             : 
    3001             : }
    3002             : 

Generated by: LCOV version 1.12