LCOV - code coverage report
Current view: top level - src/elpa2/kernels - real_avx512_2hv_template.c (source / functions) Hit Total Coverage
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info Lines: 250 343 72.9 %
Date: 2018-01-10 09:29:53 Functions: 2 2 100.0 %

          Line data    Source code
       1             : //    This file is part of ELPA.
       2             : //
       3             : //    The ELPA library was originally created by the ELPA consortium,
       4             : //    consisting of the following organizations:
       5             : //
       6             : //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : //      Informatik,
      10             : //    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : //      Schwerpunkt Wissenschaftliches Rechnen ,
      12             : //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : //    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
      14             : //      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
      15             : //      and
      16             : //    - IBM Deutschland GmbH
      17             : //
      18             : //    This particular source code file contains additions, changes and
      19             : //    enhancements authored by Intel Corporation which is not part of
      20             : //    the ELPA consortium.
      21             : //
      22             : //    More information can be found here:
      23             : //    http://elpa.mpcdf.mpg.de/
      24             : //
      25             : //    ELPA is free software: you can redistribute it and/or modify
      26             : //    it under the terms of the version 3 of the license of the
      27             : //    GNU Lesser General Public License as published by the Free
      28             : //    Software Foundation.
      29             : //
      30             : //    ELPA is distributed in the hope that it will be useful,
      31             : //    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : //    GNU Lesser General Public License for more details.
      34             : //
      35             : //    You should have received a copy of the GNU Lesser General Public License
      36             : //    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : //
      38             : //    ELPA reflects a substantial effort on the part of the original
      39             : //    ELPA consortium, and we ask you to respect the spirit of the
      40             : //    license that we chose: i.e., please contribute any changes you
      41             : //    may have back to the original ELPA library distribution, and keep
      42             : //    any derivatives of ELPA under the same license that we chose for
      43             : //    the original distribution, the GNU Lesser General Public License.
      44             : //
      45             : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
      46             : // --------------------------------------------------------------------------------------------------
      47             : 
      48             : #include "config-f90.h"
      49             : 
      50             : #include <x86intrin.h>
      51             : #include <stdio.h>
      52             : #include <stdlib.h>
      53             : 
      54             : #define __forceinline __attribute__((always_inline)) static
      55             : 
      56             : #ifdef DOUBLE_PRECISION_REAL
      57             : #define offset 8
      58             : 
      59             : #define __AVX512_DATATYPE __m512d
      60             : #define __AVX512i __m512i
      61             : #define _AVX512_LOAD  _mm512_load_pd
      62             : #define _AVX512_STORE  _mm512_store_pd
      63             : #define _AVX512_SET1 _mm512_set1_pd
      64             : #define _AVX512_ADD _mm512_add_pd
      65             : #define _AVX512_MUL _mm512_mul_pd
      66             : 
      67             : #ifdef HAVE_AVX512
      68             : #define __ELPA_USE_FMA__
      69             : #define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
      70             : #endif
      71             : 
      72             : #define _AVX512_FMA _mm512_FMA_pd
      73             : #endif /* DOUBLE_PRECISION_REAL */
      74             : 
      75             : #ifdef SINGLE_PRECISION_REAL
      76             : #define offset 16
      77             : 
      78             : #define __AVX512_DATATYPE __m512
      79             : #define __AVX512i __m512i
      80             : #define _AVX512_LOAD  _mm512_load_ps
      81             : #define _AVX512_STORE  _mm512_store_ps
      82             : #define _AVX512_SET1 _mm512_set1_ps
      83             : #define _AVX512_ADD _mm512_add_ps
      84             : #define _AVX512_MUL _mm512_mul_ps
      85             : 
      86             : #ifdef HAVE_AVX512
      87             : #define __ELPA_USE_FMA__
      88             : #define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
      89             : #endif
      90             : 
      91             : #define _AVX512_FMA _mm512_FMA_ps
      92             : #endif /* SINGLE_PRECISION_REAL */
      93             : 
      94             : #ifdef DOUBLE_PRECISION_REAL
      95             : //Forward declaration
      96             : __forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
      97             : __forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
      98             : __forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
      99             : __forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
     100             : 
     101             : void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
     102             : #endif
     103             : #ifdef SINGLE_PRECISION_REAL
     104             : //Forward declaration
     105             : __forceinline void hh_trafo_kernel_16_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     106             : __forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     107             : __forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     108             : __forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
     109             : 
     110             : void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
     111             : #endif
     112             : 
     113             : #ifdef DOUBLE_PRECISION_REAL
     114             : /*
     115             : !f>#if defined(HAVE_AVX512)
     116             : !f> interface
     117             : !f>   subroutine double_hh_trafo_real_avx512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
     118             : !f>                             bind(C, name="double_hh_trafo_real_avx512_2hv_double")
     119             : !f>     use, intrinsic :: iso_c_binding
     120             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     121             : !f>     type(c_ptr), value      :: q
     122             : !f>     real(kind=c_double)     :: hh(pnb,6)
     123             : !f>   end subroutine
     124             : !f> end interface
     125             : !f>#endif
     126             : */
     127             : #endif
     128             : #ifdef SINGLE_PRECISION_REAL
     129             : /*
     130             : !f>#if defined(HAVE_AVX512)
     131             : !f> interface
     132             : !f>   subroutine double_hh_trafo_real_avx512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
     133             : !f>                             bind(C, name="double_hh_trafo_real_avx512_2hv_single")
     134             : !f>     use, intrinsic :: iso_c_binding
     135             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     136             : !f>     type(c_ptr), value      :: q
     137             : !f>     real(kind=c_float)      :: hh(pnb,6)
     138             : !f>   end subroutine
     139             : !f> end interface
     140             : !f>#endif
     141             : */
     142             : #endif
     143             : 
     144             : #ifdef DOUBLE_PRECISION_REAL
     145    18762496 : void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     146             : #endif
     147             : #ifdef SINGLE_PRECISION_REAL
     148     3442752 : void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     149             : #endif
     150             : {
     151             :         int i;
     152    22205248 :         int nb = *pnb;
     153    22205248 :         int nq = *pldq;
     154    22205248 :         int ldq = *pldq;
     155    22205248 :         int ldh = *pldh;
     156             :         int worked_on;
     157             : 
     158    22205248 :         worked_on = 0;
     159             : 
     160             :         // calculating scalar product to compute
     161             :         // 2 householder vectors simultaneously
     162             : #ifdef DOUBLE_PRECISION_REAL
     163    18762496 :         double s = hh[(ldh)+1]*1.0;
     164             : #endif
     165             : #ifdef SINGLE_PRECISION_REAL
     166     3442752 :         float s = hh[(ldh)+1]*1.0f;
     167             : #endif
     168             :         #pragma ivdep
     169  1387088064 :         for (i = 2; i < nb; i++)
     170             :         {
     171  1364882816 :                 s += hh[i-1] * hh[(i+ldh)];
     172             :         }
     173             : 
     174             :         // Production level kernel calls with padding
     175             : #ifdef DOUBLE_PRECISION_REAL
     176    37524992 :         for (i = 0; i < nq-24; i+=32)
     177             :         {
     178    18762496 :                 hh_trafo_kernel_32_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     179    18762496 :                 worked_on += 32;
     180             :         }
     181             : #endif
     182             : #ifdef SINGLE_PRECISION_REAL
     183     6885504 :         for (i = 0; i < nq-48; i+=64)
     184             :         {
     185     3442752 :                 hh_trafo_kernel_64_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     186     3442752 :                 worked_on += 64;
     187             :         }
     188             : #endif
     189    22205248 :         if (nq == i)
     190             :         {
     191           0 :                 return;
     192             :         }
     193             : #ifdef DOUBLE_PRECISION_REAL
     194    18762496 :         if (nq-i == 24)
     195             :         {
     196           0 :                 hh_trafo_kernel_24_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     197           0 :                 worked_on += 24;
     198             :         }
     199             : #endif
     200             : #ifdef SINGLE_PRECISION_REAL
     201     3442752 :         if (nq-i == 48)
     202             :         {
     203           0 :                 hh_trafo_kernel_48_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     204           0 :                 worked_on += 48;
     205             :         }
     206             : #endif
     207             : #ifdef DOUBLE_PRECISION_REAL
     208    18762496 :         if (nq-i == 16)
     209             :         {
     210    17571840 :                 hh_trafo_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     211    17571840 :                 worked_on += 16;
     212             :         }
     213             : #endif
     214             : #ifdef SINGLE_PRECISION_REAL
     215     3442752 :         if (nq-i == 32)
     216             :         {
     217     3194880 :                 hh_trafo_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     218     3194880 :                 worked_on += 32;
     219             :         }
     220             : #endif
     221             : #ifdef DOUBLE_PRECISION_REAL
     222    18762496 :         if (nq-i == 8)
     223             :         {
     224     1190656 :                 hh_trafo_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     225     1190656 :                 worked_on += 8;
     226             :         }
     227             : #endif
     228             : #ifdef SINGLE_PRECISION_REAL
     229     3442752 :         if (nq-i == 16)
     230             :         {
     231      247872 :                 hh_trafo_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     232      247872 :                 worked_on += 16;
     233             :         }
     234             : #endif
     235             : 
     236             : #ifdef WITH_DEBUG
     237             :         if (worked_on != nq)
     238             :         {
     239             :                  printf("Error in AVX512 real BLOCK 2 kernel \n");
     240             :                  abort();
     241             :         }
     242             : #endif
     243             : }
     244             : 
     245             : /**
     246             :  * Unrolled kernel that computes
     247             : #ifdef DOUBLE_PRECISION_REAL
     248             :  * 32 rows of Q simultaneously, a
     249             : #endif
     250             : #ifdef SINGLE_PRECISION_REAL
     251             :  * 64 rows of Q simultaneously, a
     252             : #endif
     253             : 
     254             :  * matrix Vector product with two householder
     255             :  * vectors + a rank 2 update is performed
     256             :  */
     257             : #ifdef DOUBLE_PRECISION_REAL
     258             :  __forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
     259             : #endif
     260             : #ifdef SINGLE_PRECISION_REAL
     261             :  __forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
     262             : #endif
     263             : {
     264             :         /////////////////////////////////////////////////////
     265             :         // Matrix Vector Multiplication, Q [24 x nb+1] * hh
     266             :         // hh contains two householder vectors, with offset 1
     267             :         /////////////////////////////////////////////////////
     268             :         int i;
     269             :         // Needed bit mask for floating point sign flip
     270             : #ifdef DOUBLE_PRECISION_REAL
     271    18762496 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
     272             : #endif
     273             : #ifdef SINGLE_PRECISION_REAL
     274     3442752 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     275             : #endif
     276             : 
     277    44410496 :         __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
     278    44410496 :         __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
     279    44410496 :         __AVX512_DATATYPE x3 = _AVX512_LOAD(&q[ldq+2*offset]);
     280    44410496 :         __AVX512_DATATYPE x4 = _AVX512_LOAD(&q[ldq+3*offset]);
     281             : 
     282             : 
     283    44410496 :         __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
     284             :         __AVX512_DATATYPE h2;
     285             : 
     286    22205248 :         __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
     287    22205248 :         __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
     288    44410496 :         __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
     289    22205248 :         __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
     290    44410496 :         __AVX512_DATATYPE q3 = _AVX512_LOAD(&q[2*offset]);
     291    22205248 :         __AVX512_DATATYPE y3 = _AVX512_FMA(x3, h1, q3);
     292    44410496 :         __AVX512_DATATYPE q4 = _AVX512_LOAD(&q[3*offset]);
     293    22205248 :         __AVX512_DATATYPE y4 = _AVX512_FMA(x4, h1, q4);
     294             : 
     295  1387088064 :         for(i = 2; i < nb; i++)
     296             :         {
     297  2729765632 :                 h1 = _AVX512_SET1(hh[i-1]);
     298  2729765632 :                 h2 = _AVX512_SET1(hh[ldh+i]);
     299             : 
     300  2729765632 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     301  1364882816 :                 x1 = _AVX512_FMA(q1, h1, x1);
     302  1364882816 :                 y1 = _AVX512_FMA(q1, h2, y1);
     303  2729765632 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
     304  1364882816 :                 x2 = _AVX512_FMA(q2, h1, x2);
     305  1364882816 :                 y2 = _AVX512_FMA(q2, h2, y2);
     306  2729765632 :                 q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
     307  1364882816 :                 x3 = _AVX512_FMA(q3, h1, x3);
     308  1364882816 :                 y3 = _AVX512_FMA(q3, h2, y3);
     309  2729765632 :                 q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
     310  1364882816 :                 x4 = _AVX512_FMA(q4, h1, x4);
     311  1364882816 :                 y4 = _AVX512_FMA(q4, h2, y4);
     312             : 
     313             :         }
     314             : 
     315    44410496 :         h1 = _AVX512_SET1(hh[nb-1]);
     316             : 
     317    44410496 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     318    22205248 :         x1 = _AVX512_FMA(q1, h1, x1);
     319    44410496 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
     320    22205248 :         x2 = _AVX512_FMA(q2, h1, x2);
     321    44410496 :         q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
     322    22205248 :         x3 = _AVX512_FMA(q3, h1, x3);
     323    44410496 :         q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
     324    22205248 :         x4 = _AVX512_FMA(q4, h1, x4);
     325             : 
     326             : 
     327             :         /////////////////////////////////////////////////////
     328             :         // Rank-2 update of Q [24 x nb+1]
     329             :         /////////////////////////////////////////////////////
     330             : 
     331    44410496 :         __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
     332    44410496 :         __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
     333    22205248 :         __AVX512_DATATYPE vs = _AVX512_SET1(s);
     334             : 
     335             : #ifdef DOUBLE_PRECISION_REAL
     336    37524992 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
     337             : #endif
     338             : #ifdef SINGLE_PRECISION_REAL
     339     6885504 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
     340             : #endif
     341    22205248 :         x1 = _AVX512_MUL(x1, h1);
     342    22205248 :         x2 = _AVX512_MUL(x2, h1);
     343    22205248 :         x3 = _AVX512_MUL(x3, h1);
     344    22205248 :         x4 = _AVX512_MUL(x4, h1);
     345             : 
     346             :         // check ofr xor_pd on skylake
     347             : #ifdef DOUBLE_PRECISION_REAL
     348    37524992 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
     349             : #endif
     350             : #ifdef SINGLE_PRECISION_REAL
     351     6885504 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
     352             : #endif
     353    22205248 :         h2 = _AVX512_MUL(h1, vs);
     354    44410496 :         y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
     355    44410496 :         y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
     356    44410496 :         y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
     357    44410496 :         y4 = _AVX512_FMA(y4, h1, _AVX512_MUL(x4,h2));
     358             : 
     359    22205248 :         q1 = _AVX512_LOAD(q);
     360    22205248 :         q1 = _AVX512_ADD(q1, y1);
     361             :         _AVX512_STORE(q,q1);
     362    44410496 :         q2 = _AVX512_LOAD(&q[offset]);
     363    22205248 :         q2 = _AVX512_ADD(q2, y2);
     364    22205248 :         _AVX512_STORE(&q[offset],q2);
     365    44410496 :         q3 = _AVX512_LOAD(&q[2*offset]);
     366    22205248 :         q3 = _AVX512_ADD(q3, y3);
     367    22205248 :         _AVX512_STORE(&q[2*offset],q3);
     368    44410496 :         q4 = _AVX512_LOAD(&q[3*offset]);
     369    22205248 :         q4 = _AVX512_ADD(q4, y4);
     370    22205248 :         _AVX512_STORE(&q[3*offset],q4);
     371             : 
     372    44410496 :         h2 = _AVX512_SET1(hh[ldh+1]);
     373             : 
     374    44410496 :         q1 = _AVX512_LOAD(&q[ldq]);
     375    44410496 :         q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
     376    22205248 :         _AVX512_STORE(&q[ldq],q1);
     377    44410496 :         q2 = _AVX512_LOAD(&q[ldq+offset]);
     378    44410496 :         q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
     379    22205248 :         _AVX512_STORE(&q[ldq+offset],q2);
     380    44410496 :         q3 = _AVX512_LOAD(&q[ldq+2*offset]);
     381    44410496 :         q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
     382    22205248 :         _AVX512_STORE(&q[ldq+2*offset],q3);
     383    44410496 :         q4 = _AVX512_LOAD(&q[ldq+3*offset]);
     384    44410496 :         q4 = _AVX512_ADD(q4, _AVX512_FMA(y4, h2, x4));
     385    22205248 :         _AVX512_STORE(&q[ldq+3*offset],q4);
     386             : 
     387  1387088064 :         for (i = 2; i < nb; i++)
     388             :         {
     389  2729765632 :                 h1 = _AVX512_SET1(hh[i-1]);
     390  2729765632 :                 h2 = _AVX512_SET1(hh[ldh+i]);
     391             : 
     392  2729765632 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     393  1364882816 :                 q1 = _AVX512_FMA(x1, h1, q1);
     394  1364882816 :                 q1 = _AVX512_FMA(y1, h2, q1);
     395  1364882816 :                 _AVX512_STORE(&q[i*ldq],q1);
     396  2729765632 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
     397  1364882816 :                 q2 = _AVX512_FMA(x2, h1, q2);
     398  1364882816 :                 q2 = _AVX512_FMA(y2, h2, q2);
     399  1364882816 :                 _AVX512_STORE(&q[(i*ldq)+offset],q2);
     400  2729765632 :                 q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
     401  1364882816 :                 q3 = _AVX512_FMA(x3, h1, q3);
     402  1364882816 :                 q3 = _AVX512_FMA(y3, h2, q3);
     403  1364882816 :                 _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
     404  2729765632 :                 q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
     405  1364882816 :                 q4 = _AVX512_FMA(x4, h1, q4);
     406  1364882816 :                 q4 = _AVX512_FMA(y4, h2, q4);
     407  1364882816 :                 _AVX512_STORE(&q[(i*ldq)+3*offset],q4);
     408             : 
     409             :         }
     410             : 
     411    44410496 :         h1 = _AVX512_SET1(hh[nb-1]);
     412             : 
     413    44410496 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     414    22205248 :         q1 = _AVX512_FMA(x1, h1, q1);
     415    22205248 :         _AVX512_STORE(&q[nb*ldq],q1);
     416    44410496 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
     417    22205248 :         q2 = _AVX512_FMA(x2, h1, q2);
     418    22205248 :         _AVX512_STORE(&q[(nb*ldq)+offset],q2);
     419    44410496 :         q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
     420    22205248 :         q3 = _AVX512_FMA(x3, h1, q3);
     421    22205248 :         _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
     422    44410496 :         q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
     423    22205248 :         q4 = _AVX512_FMA(x4, h1, q4);
     424    22205248 :         _AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
     425             : 
     426             : }
     427             : 
     428             : /**
     429             :  * Unrolled kernel that computes
     430             : #ifdef DOUBLE_PRECISION_REAL
     431             :  * 24 rows of Q simultaneously, a
     432             : #endif
     433             : #ifdef SINGLE_PRECISION_REAL
     434             :  * 48 rows of Q simultaneously, a
     435             : #endif
     436             : 
     437             :  * matrix Vector product with two householder
     438             :  * vectors + a rank 2 update is performed
     439             :  */
     440             : #ifdef DOUBLE_PRECISION_REAL
     441             :  __forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
     442             : #endif
     443             : #ifdef SINGLE_PRECISION_REAL
     444             :  __forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
     445             : #endif
     446             : {
     447             :         /////////////////////////////////////////////////////
     448             :         // Matrix Vector Multiplication, Q [24 x nb+1] * hh
     449             :         // hh contains two householder vectors, with offset 1
     450             :         /////////////////////////////////////////////////////
     451             :         int i;
     452             :         // Needed bit mask for floating point sign flip
     453             : #ifdef DOUBLE_PRECISION_REAL
     454           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
     455             : #endif
     456             : #ifdef SINGLE_PRECISION_REAL
     457           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     458             : #endif
     459           0 :         __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
     460           0 :         __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
     461           0 :         __AVX512_DATATYPE x3 = _AVX512_LOAD(&q[ldq+2*offset]);
     462             : 
     463           0 :         __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
     464             :         __AVX512_DATATYPE h2;
     465             : 
     466           0 :         __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
     467           0 :         __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
     468           0 :         __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
     469           0 :         __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
     470           0 :         __AVX512_DATATYPE q3 = _AVX512_LOAD(&q[2*offset]);
     471           0 :         __AVX512_DATATYPE y3 = _AVX512_FMA(x3, h1, q3);
     472             : 
     473           0 :         for(i = 2; i < nb; i++)
     474             :         {
     475           0 :                 h1 = _AVX512_SET1(hh[i-1]);
     476           0 :                 h2 = _AVX512_SET1(hh[ldh+i]);
     477             : 
     478           0 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     479           0 :                 x1 = _AVX512_FMA(q1, h1, x1);
     480           0 :                 y1 = _AVX512_FMA(q1, h2, y1);
     481           0 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
     482           0 :                 x2 = _AVX512_FMA(q2, h1, x2);
     483           0 :                 y2 = _AVX512_FMA(q2, h2, y2);
     484           0 :                 q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
     485           0 :                 x3 = _AVX512_FMA(q3, h1, x3);
     486           0 :                 y3 = _AVX512_FMA(q3, h2, y3);
     487             :         }
     488             : 
     489           0 :         h1 = _AVX512_SET1(hh[nb-1]);
     490             : 
     491           0 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     492           0 :         x1 = _AVX512_FMA(q1, h1, x1);
     493           0 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
     494           0 :         x2 = _AVX512_FMA(q2, h1, x2);
     495           0 :         q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
     496           0 :         x3 = _AVX512_FMA(q3, h1, x3);
     497             : 
     498             :         /////////////////////////////////////////////////////
     499             :         // Rank-2 update of Q [24 x nb+1]
     500             :         /////////////////////////////////////////////////////
     501             : 
     502           0 :         __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
     503           0 :         __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
     504           0 :         __AVX512_DATATYPE vs = _AVX512_SET1(s);
     505             : 
     506             :         // check for xor_pd on skylake
     507             : #ifdef DOUBLE_PRECISION_REAL
     508           0 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
     509             : #endif
     510             : #ifdef SINGLE_PRECISION_REAL
     511           0 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
     512             : #endif
     513           0 :         x1 = _AVX512_MUL(x1, h1);
     514           0 :         x2 = _AVX512_MUL(x2, h1);
     515           0 :         x3 = _AVX512_MUL(x3, h1);
     516             : 
     517             : #ifdef DOUBLE_PRECISION_REAL
     518           0 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
     519             : #endif
     520             : #ifdef SINGLE_PRECISION_REAL
     521           0 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
     522             : #endif
     523           0 :         h2 = _AVX512_MUL(h1, vs);
     524           0 :         y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
     525           0 :         y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
     526           0 :         y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
     527             : 
     528           0 :         q1 = _AVX512_LOAD(q);
     529           0 :         q1 = _AVX512_ADD(q1, y1);
     530             :         _AVX512_STORE(q,q1);
     531           0 :         q2 = _AVX512_LOAD(&q[offset]);
     532           0 :         q2 = _AVX512_ADD(q2, y2);
     533           0 :         _AVX512_STORE(&q[offset],q2);
     534           0 :         q3 = _AVX512_LOAD(&q[2*offset]);
     535           0 :         q3 = _AVX512_ADD(q3, y3);
     536           0 :         _AVX512_STORE(&q[2*offset],q3);
     537             : 
     538           0 :         h2 = _AVX512_SET1(hh[ldh+1]);
     539             : 
     540           0 :         q1 = _AVX512_LOAD(&q[ldq]);
     541           0 :         q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
     542           0 :         _AVX512_STORE(&q[ldq],q1);
     543           0 :         q2 = _AVX512_LOAD(&q[ldq+offset]);
     544           0 :         q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
     545           0 :         _AVX512_STORE(&q[ldq+offset],q2);
     546           0 :         q3 = _AVX512_LOAD(&q[ldq+2*offset]);
     547           0 :         q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
     548           0 :         _AVX512_STORE(&q[ldq+2*offset],q3);
     549             : 
     550           0 :         for (i = 2; i < nb; i++)
     551             :         {
     552           0 :                 h1 = _AVX512_SET1(hh[i-1]);
     553           0 :                 h2 = _AVX512_SET1(hh[ldh+i]);
     554             : 
     555           0 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     556           0 :                 q1 = _AVX512_FMA(x1, h1, q1);
     557           0 :                 q1 = _AVX512_FMA(y1, h2, q1);
     558           0 :                 _AVX512_STORE(&q[i*ldq],q1);
     559           0 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
     560           0 :                 q2 = _AVX512_FMA(x2, h1, q2);
     561           0 :                 q2 = _AVX512_FMA(y2, h2, q2);
     562           0 :                 _AVX512_STORE(&q[(i*ldq)+offset],q2);
     563           0 :                 q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
     564           0 :                 q3 = _AVX512_FMA(x3, h1, q3);
     565           0 :                 q3 = _AVX512_FMA(y3, h2, q3);
     566           0 :                 _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
     567             : 
     568             :         }
     569             : 
     570           0 :         h1 = _AVX512_SET1(hh[nb-1]);
     571             : 
     572           0 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     573           0 :         q1 = _AVX512_FMA(x1, h1, q1);
     574           0 :         _AVX512_STORE(&q[nb*ldq],q1);
     575           0 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
     576           0 :         q2 = _AVX512_FMA(x2, h1, q2);
     577           0 :         _AVX512_STORE(&q[(nb*ldq)+offset],q2);
     578           0 :         q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
     579           0 :         q3 = _AVX512_FMA(x3, h1, q3);
     580           0 :         _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
     581             : 
     582             : }
     583             : 
     584             : /**
     585             :  * Unrolled kernel that computes
     586             : #ifdef DOUBLE_PRECISION_REAL
     587             :  * 16 rows of Q simultaneously, a
     588             : #endif
     589             : #ifdef SINGLE_PRECISION_REAL
     590             :  * 32 rows of Q simultaneously, a
     591             : #endif
     592             :  * matrix Vector product with two householder
     593             :  * vectors + a rank 2 update is performed
     594             :  */
     595             : #ifdef DOUBLE_PRECISION_REAL
     596             :  __forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
     597             : #endif
     598             : #ifdef SINGLE_PRECISION_REAL
     599             :  __forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
     600             : #endif
     601             : {
     602             :         /////////////////////////////////////////////////////
     603             :         // Matrix Vector Multiplication, Q [16 x nb+1] * hh
     604             :         // hh contains two householder vectors, with offset 1
     605             :         /////////////////////////////////////////////////////
     606             :         int i;
     607             :         // Needed bit mask for floating point sign flip
     608             : #ifdef DOUBLE_PRECISION_REAL
     609    17571840 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
     610             : #endif
     611             : #ifdef SINGLE_PRECISION_REAL
     612     3194880 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     613             : #endif
     614    41533440 :         __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
     615    41533440 :         __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
     616             : 
     617    41533440 :         __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
     618             :         __AVX512_DATATYPE h2;
     619             : 
     620    20766720 :         __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
     621    20766720 :         __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
     622    41533440 :         __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
     623    20766720 :         __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
     624             : 
     625  1308303360 :         for(i = 2; i < nb; i++)
     626             :         {
     627  2575073280 :                 h1 = _AVX512_SET1(hh[i-1]);
     628  2575073280 :                 h2 = _AVX512_SET1(hh[ldh+i]);
     629             : 
     630  2575073280 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     631  1287536640 :                 x1 = _AVX512_FMA(q1, h1, x1);
     632  1287536640 :                 y1 = _AVX512_FMA(q1, h2, y1);
     633  2575073280 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
     634  1287536640 :                 x2 = _AVX512_FMA(q2, h1, x2);
     635  1287536640 :                 y2 = _AVX512_FMA(q2, h2, y2);
     636             :         }
     637             : 
     638    41533440 :         h1 = _AVX512_SET1(hh[nb-1]);
     639             : 
     640    41533440 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     641    20766720 :         x1 = _AVX512_FMA(q1, h1, x1);
     642    41533440 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
     643    20766720 :         x2 = _AVX512_FMA(q2, h1, x2);
     644             : 
     645             :         /////////////////////////////////////////////////////
     646             :         // Rank-2 update of Q [16 x nb+1]
     647             :         /////////////////////////////////////////////////////
     648             : 
     649    41533440 :         __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
     650    41533440 :         __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
     651    20766720 :         __AVX512_DATATYPE vs = _AVX512_SET1(s);
     652             :         // check for xor_pd on skylake
     653             : #ifdef DOUBLE_PRECISION_REAL
     654    35143680 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
     655             : #endif
     656             : #ifdef SINGLE_PRECISION_REAL
     657     6389760 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
     658             : #endif
     659    20766720 :         x1 = _AVX512_MUL(x1, h1);
     660    20766720 :         x2 = _AVX512_MUL(x2, h1);
     661             : #ifdef DOUBLE_PRECISION_REAL
     662    35143680 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
     663             : #endif
     664             : #ifdef SINGLE_PRECISION_REAL
     665     6389760 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
     666             : #endif
     667    20766720 :         h2 = _AVX512_MUL(h1, vs);
     668             : 
     669    41533440 :         y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
     670    41533440 :         y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
     671             : 
     672    20766720 :         q1 = _AVX512_LOAD(q);
     673    20766720 :         q1 = _AVX512_ADD(q1, y1);
     674             :         _AVX512_STORE(q,q1);
     675    41533440 :         q2 = _AVX512_LOAD(&q[offset]);
     676    20766720 :         q2 = _AVX512_ADD(q2, y2);
     677    20766720 :         _AVX512_STORE(&q[offset],q2);
     678             : 
     679    41533440 :         h2 = _AVX512_SET1(hh[ldh+1]);
     680             : 
     681    41533440 :         q1 = _AVX512_LOAD(&q[ldq]);
     682    41533440 :         q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
     683    20766720 :         _AVX512_STORE(&q[ldq],q1);
     684    41533440 :         q2 = _AVX512_LOAD(&q[ldq+offset]);
     685    41533440 :         q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
     686    20766720 :         _AVX512_STORE(&q[ldq+offset],q2);
     687             : 
     688  1308303360 :         for (i = 2; i < nb; i++)
     689             :         {
     690  2575073280 :                 h1 = _AVX512_SET1(hh[i-1]);
     691  2575073280 :                 h2 = _AVX512_SET1(hh[ldh+i]);
     692             : 
     693  2575073280 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     694  1287536640 :                 q1 = _AVX512_FMA(x1, h1, q1);
     695  1287536640 :                 q1 = _AVX512_FMA(y1, h2, q1);
     696  1287536640 :                 _AVX512_STORE(&q[i*ldq],q1);
     697  2575073280 :                 q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
     698  1287536640 :                 q2 = _AVX512_FMA(x2, h1, q2);
     699  1287536640 :                 q2 = _AVX512_FMA(y2, h2, q2);
     700  1287536640 :                 _AVX512_STORE(&q[(i*ldq)+offset],q2);
     701             :         }
     702             : 
     703    41533440 :         h1 = _AVX512_SET1(hh[nb-1]);
     704             : 
     705    41533440 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     706    20766720 :         q1 = _AVX512_FMA(x1, h1, q1);
     707    20766720 :         _AVX512_STORE(&q[nb*ldq],q1);
     708    41533440 :         q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
     709    20766720 :         q2 = _AVX512_FMA(x2, h1, q2);
     710    20766720 :         _AVX512_STORE(&q[(nb*ldq)+offset],q2);
     711             : 
     712             : }
     713             : 
     714             : /**
     715             :  * Unrolled kernel that computes
     716             : #ifdef DOUBLE_PRECISION_REAL
     717             :  * 8 rows of Q simultaneously, a
     718             : #endif
     719             : #ifdef SINGLE_PRECISION_REAL
     720             :  * 16 rows of Q simultaneously, a
     721             : #endif
     722             :  * matrix Vector product with two householder
     723             :  * vectors + a rank 2 update is performed
     724             :  */
     725             : #ifdef DOUBLE_PRECISION_REAL
     726             :  __forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
     727             : #endif
     728             : #ifdef SINGLE_PRECISION_REAL
     729             :  __forceinline void hh_trafo_kernel_16_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
     730             : #endif
     731             : {
     732             :         /////////////////////////////////////////////////////
     733             :         // Matrix Vector Multiplication, Q [8 x nb+1] * hh
     734             :         // hh contains two householder vectors, with offset 1
     735             :         /////////////////////////////////////////////////////
     736             :         int i;
     737             :         // Needed bit mask for floating point sign flip
     738             : #ifdef DOUBLE_PRECISION_REAL
     739     1190656 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
     740             : #endif
     741             : #ifdef SINGLE_PRECISION_REAL
     742      247872 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     743             : #endif
     744     2877056 :         __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
     745             : 
     746     2877056 :         __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
     747             :         __AVX512_DATATYPE h2;
     748             : 
     749     1438528 :         __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
     750     1438528 :         __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
     751             : 
     752    78784704 :         for(i = 2; i < nb; i++)
     753             :         {
     754   154692352 :                 h1 = _AVX512_SET1(hh[i-1]);
     755   154692352 :                 h2 = _AVX512_SET1(hh[ldh+i]);
     756             : 
     757   154692352 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     758    77346176 :                 x1 = _AVX512_FMA(q1, h1, x1);
     759    77346176 :                 y1 = _AVX512_FMA(q1, h2, y1);
     760             :         }
     761             : 
     762     2877056 :         h1 = _AVX512_SET1(hh[nb-1]);
     763             : 
     764     2877056 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     765     1438528 :         x1 = _AVX512_FMA(q1, h1, x1);
     766             : 
     767             :         /////////////////////////////////////////////////////
     768             :         // Rank-2 update of Q [8 x nb+1]
     769             :         /////////////////////////////////////////////////////
     770             : 
     771     2877056 :         __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
     772     2877056 :         __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
     773     1438528 :         __AVX512_DATATYPE vs = _AVX512_SET1(s);
     774             : 
     775             : #ifdef DOUBLE_PRECISION_REAL
     776     2381312 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
     777             : #endif
     778             : #ifdef SINGLE_PRECISION_REAL
     779      495744 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
     780             : #endif
     781             : 
     782     1438528 :         x1 = _AVX512_MUL(x1, h1);
     783             : 
     784             : #ifdef DOUBLE_PRECISION_REAL
     785     2381312 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
     786             : #endif
     787             : #ifdef SINGLE_PRECISION_REAL
     788      495744 :         h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
     789             : #endif
     790             : 
     791     1438528 :         h2 = _AVX512_MUL(h1, vs);
     792             : 
     793     2877056 :         y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
     794             : 
     795     1438528 :         q1 = _AVX512_LOAD(q);
     796     1438528 :         q1 = _AVX512_ADD(q1, y1);
     797             :         _AVX512_STORE(q,q1);
     798             : 
     799     2877056 :         h2 = _AVX512_SET1(hh[ldh+1]);
     800             : 
     801     2877056 :         q1 = _AVX512_LOAD(&q[ldq]);
     802     2877056 :         q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
     803     1438528 :         _AVX512_STORE(&q[ldq],q1);
     804             : 
     805    78784704 :         for (i = 2; i < nb; i++)
     806             :         {
     807   154692352 :                 h1 = _AVX512_SET1(hh[i-1]);
     808   154692352 :                 h2 = _AVX512_SET1(hh[ldh+i]);
     809             : 
     810   154692352 :                 q1 = _AVX512_LOAD(&q[i*ldq]);
     811    77346176 :                 q1 = _AVX512_FMA(x1, h1, q1);
     812    77346176 :                 q1 = _AVX512_FMA(y1, h2, q1);
     813    77346176 :                 _AVX512_STORE(&q[i*ldq],q1);
     814             :         }
     815             : 
     816     2877056 :         h1 = _AVX512_SET1(hh[nb-1]);
     817             : 
     818     2877056 :         q1 = _AVX512_LOAD(&q[nb*ldq]);
     819     1438528 :         q1 = _AVX512_FMA(x1, h1, q1);
     820     1438528 :         _AVX512_STORE(&q[nb*ldq],q1);
     821             : 
     822             : }
     823             : 

Generated by: LCOV version 1.12