LCOV - code coverage report
Current view: top level - src/elpa2/kernels - complex_avx512_2hv_template.c (source / functions) Hit Total Coverage
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info Lines: 428 610 70.2 %
Date: 2018-01-10 09:29:53 Functions: 2 2 100.0 %

          Line data    Source code
       1             : //    This file is part of ELPA.
       2             : //
       3             : //    The ELPA library was originally created by the ELPA consortium,
       4             : //    consisting of the following organizations:
       5             : //
       6             : //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : //      Informatik,
      10             : //    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : //      Schwerpunkt Wissenschaftliches Rechnen,
      12             : //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : //    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
      14             : //      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      15             : //      and
      16             : //    - IBM Deutschland GmbH
      17             : //
      18             : //    This particular source code file contains additions, changes and
      19             : //    enhancements authored by Intel Corporation which is not part of
      20             : //    the ELPA consortium.
      21             : //
      22             : //    More information can be found here:
      23             : //    http://elpa.mpcdf.mpg.de/
      24             : //
      25             : //    ELPA is free software: you can redistribute it and/or modify
      26             : //    it under the terms of the version 3 of the license of the
      27             : //    GNU Lesser General Public License as published by the Free
      28             : //    Software Foundation.
      29             : //
      30             : //    ELPA is distributed in the hope that it will be useful,
      31             : //    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : //    GNU Lesser General Public License for more details.
      34             : //
      35             : //    You should have received a copy of the GNU Lesser General Public License
      36             : //    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : //
      38             : //    ELPA reflects a substantial effort on the part of the original
      39             : //    ELPA consortium, and we ask you to respect the spirit of the
      40             : //    license that we chose: i.e., please contribute any changes you
      41             : //    may have back to the original ELPA library distribution, and keep
      42             : //    any derivatives of ELPA under the same license that we chose for
      43             : //    the original distribution, the GNU Lesser General Public License.
      44             : //
      45             : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
      46             : // --------------------------------------------------------------------------------------------------
      47             : #include "config-f90.h"
      48             : 
      49             : #include <complex.h>
      50             : #include <x86intrin.h>
      51             : #include <stdio.h>
      52             : #include <stdlib.h>
      53             : 
      54             : #define __forceinline __attribute__((always_inline)) // maps __forceinline onto the GCC-style always_inline attribute
      55             : // Precision-specific aliases: exactly one of DOUBLE_PRECISION_COMPLEX / SINGLE_PRECISION_COMPLEX is expected to be defined (NOTE(review): nothing here enforces that -- confirm in the build).
      56             : #ifdef DOUBLE_PRECISION_COMPLEX
      57             : #define offset 8 // scalars per 512-bit vector: 8 doubles, i.e. 4 complex values; used as the index step between consecutive vector loads
      58             : // Generic names for the double-precision (_pd) AVX-512 intrinsics used by the kernels.
      59             : #define __AVX512_DATATYPE __m512d
      60             : #define _AVX512_LOAD _mm512_load_pd
      61             : #define _AVX512_STORE _mm512_store_pd
      62             : #define _AVX512_SET1 _mm512_set1_pd
      63             : #define _AVX512_SET _mm512_set_pd
      64             : #define _AVX512_MUL _mm512_mul_pd
      65             : #define _AVX512_ADD _mm512_add_pd
      66             : #define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
      67             : #define _AVX512_SHUFFLE _mm512_shuffle_pd
      68             : #define _SHUFFLE 0x55 // shuffle_pd control; with both operands equal (as the kernels call it) it swaps the two doubles of each adjacent pair
      69             : 
      70             : #ifdef HAVE_AVX512
      71             : // NOTE(review): these wrappers exist only under HAVE_AVX512; without it the _AVX512_FMADDSUB/_AVX512_FMSUBADD aliases below name undefined symbols.
      72             : #define __ELPA_USE_FMA__
      73             : #define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
      74             : #define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
      75             : #endif
      76             : 
      77             : #define _AVX512_FMADDSUB _mm512_FMADDSUB_pd
      78             : #define _AVX512_FMSUBADD _mm512_FMSUBADD_pd
      79             : #endif /* DOUBLE_PRECISION_COMPLEX */
      80             : 
      81             : #ifdef SINGLE_PRECISION_COMPLEX
      82             : #define offset 16 // scalars per 512-bit vector: 16 floats, i.e. 8 complex values; used as the index step between consecutive vector loads
      83             : // Generic names for the single-precision (_ps) AVX-512 intrinsics used by the kernels.
      84             : #define __AVX512_DATATYPE __m512
      85             : #define _AVX512_LOAD _mm512_load_ps
      86             : #define _AVX512_STORE _mm512_store_ps
      87             : #define _AVX512_SET1 _mm512_set1_ps
      88             : #define _AVX512_SET _mm512_set_ps
      89             : #define _AVX512_MUL _mm512_mul_ps
      90             : #define _AVX512_ADD _mm512_add_ps
      91             : #define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
      92             : #define _AVX512_SHUFFLE _mm512_shuffle_ps
      93             : #define _SHUFFLE 0xb1 // shuffle_ps control; with both operands equal (as the kernels call it) it swaps the two floats of each adjacent pair
      94             : 
      95             : #ifdef HAVE_AVX512
      96             : // NOTE(review): these wrappers exist only under HAVE_AVX512; without it the _AVX512_FMADDSUB/_AVX512_FMSUBADD aliases below name undefined symbols.
      97             : #define __ELPA_USE_FMA__
      98             : #define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
      99             : #define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
     100             : #endif
     101             : 
     102             : #define _AVX512_FMADDSUB _mm512_FMADDSUB_ps
     103             : #define _AVX512_FMSUBADD _mm512_FMSUBADD_ps
     104             : #endif /* SINGLE_PRECISION_COMPLEX */
     105             : 
     106             : //Forward declaration
     107             : #ifdef DOUBLE_PRECISION_COMPLEX
     108             : static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
     109             : static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
     110             : static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
     111             : static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
     112             : #endif
     113             : 
     114             : #ifdef SINGLE_PRECISION_COMPLEX
     115             : static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
     116             : static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
     117             : static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
     118             : static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
     119             : #endif
     120             : 
     121             : /*
     122             : !f>#if defined(HAVE_AVX512)
     123             : !f> interface
     124             : !f>   subroutine double_hh_trafo_complex_avx512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
     125             : !f>                             bind(C, name="double_hh_trafo_complex_avx512_2hv_double")
     126             : !f>     use, intrinsic :: iso_c_binding
     127             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     128             : !f>     ! complex(kind=c_double_complex)     :: q(*)
     129             : !f>     type(c_ptr), value                   :: q
     130             : !f>     complex(kind=c_double_complex)     :: hh(pnb,2)
     131             : !f>   end subroutine
     132             : !f> end interface
     133             : !f>#endif
     134             : */
     135             : 
     136             : /*
     137             : !f>#if defined(HAVE_AVX512)
     138             : !f> interface
     139             : !f>   subroutine double_hh_trafo_complex_avx512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
     140             : !f>                             bind(C, name="double_hh_trafo_complex_avx512_2hv_single")
     141             : !f>     use, intrinsic :: iso_c_binding
     142             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
     143             : !f>     ! complex(kind=c_float_complex)     :: q(*)
     144             : !f>     type(c_ptr), value                  :: q
     145             : !f>     complex(kind=c_float_complex)     :: hh(pnb,2)
     146             : !f>   end subroutine
     147             : !f> end interface
     148             : !f>#endif
     149             : */
     150             : // Driver for the 2hv complex AVX-512 kernels: builds the scalar s from hh, then hands q to the fixed-width kernels slice by slice, starting each slice at &q[i]. pnb, pldq and pldh are dereferenced without NULL checks.
     151             : #ifdef DOUBLE_PRECISION_COMPLEX
     152      327168 : void double_hh_trafo_complex_avx512_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     153             : #endif
     154             : #ifdef SINGLE_PRECISION_COMPLEX
     155      163584 : void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
     156             : #endif
     157             : {
     158             :         int i;
     159      490752 :         int nb = *pnb; // upper bound of the index loop over hh below; forwarded to the kernels
     160      490752 :         int nq = *pldq; // NOTE(review): taken from *pldq, not *pnq -- pnq is never read in this function; confirm this is intended
     161      490752 :         int ldq = *pldq; // forwarded to the kernels unchanged
     162      490752 :         int ldh = *pldh; // index offset of the second hh column: hh[ldh + k]
     163             :         int worked_on; // running count of q elements handed to kernels; only inspected under WITH_DEBUG
     164             : 
     165      490752 :         worked_on = 0;
     166             : // s = conj(hh[ldh+1]) + sum over i = 2..nb-1 of hh[i-1] * conj(hh[ldh+i]); it is passed by value to every kernel call.
     167             : #ifdef DOUBLE_PRECISION_COMPLEX
     168      327168 :         double complex s = conj(hh[(ldh)+1])*1.0;
     169             : #endif
     170             : #ifdef SINGLE_PRECISION_COMPLEX
     171      163584 :         float complex s = conj(hh[(ldh)+1])*1.0f; // NOTE(review): conj() is the double-complex function (conjf() is the float one), so this goes float -> double -> float; confirm intended
     172             : #endif
     173    15213312 :         for (i = 2; i < nb; i++)
     174             :         {
     175    14722560 :                 s += hh[i-1] * conj(hh[(i+ldh)]);
     176             :         }
     177             : // Main sweep: 16 (double) or 32 (single) elements of q per kernel call, repeated while more than 12 (double) or 24 (single) elements remain.
     178             : #ifdef DOUBLE_PRECISION_COMPLEX
     179      981504 :         for (i = 0; i < nq-12; i+=16)
     180             :         {
     181      654336 :                 hh_trafo_complex_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     182      654336 :                 worked_on += 16;
     183             :         }
     184             : #endif
     185             : #ifdef SINGLE_PRECISION_COMPLEX
     186      327168 :         for (i = 0; i < nq-24; i+=32)
     187             :         {
     188      163584 :                 hh_trafo_complex_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     189      163584 :                 worked_on += 32;
     190             : 
     191             :         }
     192             : #endif
     193      490752 :         if (nq-i == 0) { // the main sweep consumed everything
     194           0 :                 return; // note: this exit skips the WITH_DEBUG check at the end of the function
     195             :         }
     196             : #ifdef DOUBLE_PRECISION_COMPLEX
     197      327168 :         if (nq-i == 12 ) // remainder dispatch: nq and i no longer change, so at most one of the following tests matches
     198             :         {
     199           0 :                 hh_trafo_complex_kernel_12_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     200           0 :                 worked_on += 12;
     201             :         }
     202             : #endif
     203             : #ifdef SINGLE_PRECISION_COMPLEX
     204      163584 :         if (nq-i == 24 ) // remainder dispatch: nq and i no longer change, so at most one of the following tests matches
     205             :         {
     206           0 :                 hh_trafo_complex_kernel_24_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     207           0 :                 worked_on += 24;
     208             :         }
     209             : #endif
     210             : #ifdef DOUBLE_PRECISION_COMPLEX
     211      327168 :         if (nq-i == 8 )
     212             :         {
     213      327168 :                 hh_trafo_complex_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     214      327168 :                 worked_on += 8;
     215             :         }
     216             : #endif
     217             : #ifdef SINGLE_PRECISION_COMPLEX
     218      163584 :         if (nq-i == 16 )
     219             :         {
     220       81792 :                 hh_trafo_complex_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     221       81792 :                 worked_on += 16;
     222             :         }
     223             : #endif
     224             : // NOTE(review): a remainder other than 12/8/4 (double) or 24/16/8 (single) matches no branch, so this function passes those elements to no kernel; only the WITH_DEBUG check below would notice.
     225             : #ifdef DOUBLE_PRECISION_COMPLEX
     226      327168 :         if (nq-i == 4 ) {
     227             : 
     228           0 :                 hh_trafo_complex_kernel_4_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
     229           0 :                 worked_on += 4;
     230             :         }
     231             : #endif
     232             : 
     233             : #ifdef SINGLE_PRECISION_COMPLEX
     234      163584 :         if (nq-i == 8 ) {
     235             : 
     236       81792 :                 hh_trafo_complex_kernel_8_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
     237       81792 :                 worked_on += 8;
     238             :         }
     239             : #endif
     240             : #ifdef WITH_DEBUG
     241             :         if (worked_on != nq) // debug-only sanity check: the element count handed to kernels must equal nq
     242             :         {
     243             :              printf("Error in complex AVX512 BLOCK 2 kernel \n");
     244             :              abort();
     245             :         }
     246             : #endif
     247             : }
     248             : 
     249             : #ifdef DOUBLE_PRECISION_COMPLEX
     250             : static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
     251             : #endif
     252             : #ifdef SINGLE_PRECISION_COMPLEX
     253             : static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
     254             : #endif
     255             : {
     256             : 
     257             : #ifdef DOUBLE_PRECISION_COMPLEX
     258      654336 :         double* q_dbl = (double*)q;
     259      654336 :         double* hh_dbl = (double*)hh;
     260      654336 :         double* s_dbl = (double*)(&s);
     261             : #endif
     262             : #ifdef SINGLE_PRECISION_COMPLEX
     263      163584 :         float* q_dbl = (float*)q;
     264      163584 :         float* hh_dbl = (float*)hh;
     265      163584 :         float* s_dbl = (float*)(&s);
     266             : #endif
     267             :         __AVX512_DATATYPE x1, x2, x3, x4;
     268             :         __AVX512_DATATYPE y1, y2, y3, y4;
     269             :         __AVX512_DATATYPE q1, q2, q3, q4;
     270             :         __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
     271             :         __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
     272      817920 :         int i=0;
     273             : 
     274             : #ifdef DOUBLE_PRECISION_COMPLEX
     275      654336 :        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
     276             : #endif
     277             : #ifdef SINGLE_PRECISION_COMPLEX
     278      163584 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     279             : #endif
     280             : 
     281     1635840 :         x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);  // q1, q2, q3, q4
     282     1635840 :         x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);  // q5, q6, q7, q8
     283     1635840 :         x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
     284     1635840 :         x4 = _AVX512_LOAD(&q_dbl[(2*ldq)+3*offset]); // q13, q14, q15, q16
     285             : 
     286     1635840 :         h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
     287     1635840 :         h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
     288             : 
     289      817920 :         y1 = _AVX512_LOAD(&q_dbl[0]);
     290     1635840 :         y2 = _AVX512_LOAD(&q_dbl[offset]);
     291     1635840 :         y3 = _AVX512_LOAD(&q_dbl[2*offset]);
     292     1635840 :         y4 = _AVX512_LOAD(&q_dbl[3*offset]);
     293             : 
     294      817920 :         tmp1 = _AVX512_MUL(h2_imag, x1);
     295             : 
     296     2453760 :         y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     297             : 
     298      817920 :         tmp2 = _AVX512_MUL(h2_imag, x2);
     299             : 
     300     2453760 :         y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     301             : 
     302      817920 :         tmp3 = _AVX512_MUL(h2_imag, x3);
     303             : 
     304     2453760 :         y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     305             : 
     306      817920 :         tmp4 = _AVX512_MUL(h2_imag, x4);
     307             : 
     308     2453760 :         y4 = _AVX512_ADD(y4, _AVX512_FMSUBADD(h2_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     309             : 
     310    25355520 :         for (i = 2; i < nb; i++)
     311             :         {
     312    49075200 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     313    49075200 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     314    49075200 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     315    49075200 :                 q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     316             : 
     317    49075200 :                 h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
     318    49075200 :                 h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
     319             : 
     320    24537600 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
     321             : 
     322    73612800 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     323             : 
     324    24537600 :                 tmp2 = _AVX512_MUL(h1_imag, q2);
     325             : 
     326    73612800 :                 x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     327             : 
     328    24537600 :                 tmp3 = _AVX512_MUL(h1_imag, q3);
     329             : 
     330    73612800 :                 x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     331             : 
     332    24537600 :                 tmp4 = _AVX512_MUL(h1_imag, q4);
     333             : 
     334    73612800 :                 x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     335             : 
     336    49075200 :                 h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
     337    49075200 :                 h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
     338             : 
     339    24537600 :                 tmp1 = _AVX512_MUL(h2_imag, q1);
     340             : 
     341    73612800 :                 y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     342             : 
     343    24537600 :                 tmp2 = _AVX512_MUL(h2_imag, q2);
     344             : 
     345    73612800 :                 y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     346             : 
     347    24537600 :                 tmp3 = _AVX512_MUL(h2_imag, q3);
     348             : 
     349    73612800 :                 y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     350             : 
     351    24537600 :                 tmp4 = _AVX512_MUL(h2_imag, q4);
     352             : 
     353    73612800 :                 y4 = _AVX512_ADD(y4, _AVX512_FMSUBADD(h2_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     354             :         }
     355             : 
     356     1635840 :         h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
     357     1635840 :         h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
     358             : 
     359     1635840 :         q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
     360     1635840 :         q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
     361     1635840 :         q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
     362     1635840 :         q4 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
     363             : 
     364      817920 :         tmp1 = _AVX512_MUL(h1_imag, q1);
     365             : 
     366     2453760 :         x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     367             : 
     368      817920 :         tmp2 = _AVX512_MUL(h1_imag, q2);
     369             : 
     370     2453760 :         x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     371             : 
     372      817920 :         tmp3 = _AVX512_MUL(h1_imag, q3);
     373             : 
     374     2453760 :         x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     375             : 
     376      817920 :         tmp4 = _AVX512_MUL(h1_imag, q4);
     377             : 
     378     2453760 :         x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     379             : 
     380     1635840 :         h1_real = _AVX512_SET1(hh_dbl[0]);
     381     1635840 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
     382             : 
     383             : #ifdef DOUBLE_PRECISION_COMPLEX
     384     1308672 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
     385     1308672 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
     386             : #endif
     387             : #ifdef SINGLE_PRECISION_COMPLEX
     388      327168 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
     389      327168 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
     390             : #endif
     391      817920 :         tmp1 = _AVX512_MUL(h1_imag, x1);
     392             : 
     393     1635840 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     394             : 
     395      817920 :         tmp2 = _AVX512_MUL(h1_imag, x2);
     396             : 
     397     1635840 :         x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     398             : 
     399      817920 :         tmp3 = _AVX512_MUL(h1_imag, x3);
     400             : 
     401     1635840 :         x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     402             : 
     403      817920 :         tmp4 = _AVX512_MUL(h1_imag, x4);
     404             : 
     405     1635840 :         x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     406             : 
     407     1635840 :         h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
     408     1635840 :         h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
     409     1635840 :         h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
     410     1635840 :         h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
     411             : 
     412             : #ifdef DOUBLE_PRECISION_COMPLEX
     413     1308672 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
     414     1308672 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
     415             : #endif
     416             : #ifdef SINGLE_PRECISION_COMPLEX
     417      327168 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
     418      327168 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
     419             : #endif
     420             : 
     421             : #ifdef DOUBLE_PRECISION_COMPLEX
     422     1308672 :         h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
     423     1308672 :         h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
     424             : #endif
     425             : #ifdef SINGLE_PRECISION_COMPLEX
     426      327168 :         h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
     427      327168 :         h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
     428             : #endif
     429             : 
     430             : #ifdef DOUBLE_PRECISION_COMPLEX
     431     3271680 :         tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
     432      654336 :                              s_dbl[1], s_dbl[0],
     433      654336 :                              s_dbl[1], s_dbl[0],
     434      654336 :                              s_dbl[1], s_dbl[0]);
     435             : #endif
     436             : #ifdef SINGLE_PRECISION_COMPLEX
     437      327168 :         tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
     438             : #endif
     439      817920 :         tmp1 = _AVX512_MUL(h2_imag, tmp2);
     440             : 
     441     1635840 :         tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     442             : 
     443             :         _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
     444             : 
     445     1635840 :         h2_real = _AVX512_SET1(s_dbl[0]);
     446     1635840 :         h2_imag = _AVX512_SET1(s_dbl[1]);
     447             : 
     448      817920 :         tmp1 = _AVX512_MUL(h1_imag, y1);
     449             : 
     450     1635840 :         y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     451             : 
     452      817920 :         tmp2 = _AVX512_MUL(h1_imag, y2);
     453             : 
     454     1635840 :         y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     455             : 
     456      817920 :         tmp3 = _AVX512_MUL(h1_imag, y3);
     457             : 
     458     1635840 :         y3 = _AVX512_FMADDSUB(h1_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     459             : 
     460      817920 :         tmp4 = _AVX512_MUL(h1_imag, y4);
     461             : 
     462     1635840 :         y4 = _AVX512_FMADDSUB(h1_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     463             : 
     464      817920 :         tmp1 = _AVX512_MUL(h2_imag, x1);
     465             : 
     466     2453760 :         y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     467             : 
     468      817920 :         tmp2 = _AVX512_MUL(h2_imag, x2);
     469             : 
     470     2453760 :         y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     471             : 
     472      817920 :         tmp3 = _AVX512_MUL(h2_imag, x3);
     473             : 
     474     2453760 :         y3 = _AVX512_ADD(y3, _AVX512_FMADDSUB(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     475             : 
     476      817920 :         tmp4 = _AVX512_MUL(h2_imag, x4);
     477             : 
     478     2453760 :         y4 = _AVX512_ADD(y4, _AVX512_FMADDSUB(h2_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     479             : 
     480      817920 :         q1 = _AVX512_LOAD(&q_dbl[0]);
     481     1635840 :         q2 = _AVX512_LOAD(&q_dbl[offset]);
     482     1635840 :         q3 = _AVX512_LOAD(&q_dbl[2*offset]);
     483     1635840 :         q4 = _AVX512_LOAD(&q_dbl[3*offset]);
     484             : 
     485      817920 :         q1 = _AVX512_ADD(q1, y1);
     486      817920 :         q2 = _AVX512_ADD(q2, y2);
     487      817920 :         q3 = _AVX512_ADD(q3, y3);
     488      817920 :         q4 = _AVX512_ADD(q4, y4);
     489             : 
     490             :         _AVX512_STORE(&q_dbl[0], q1);
     491      817920 :         _AVX512_STORE(&q_dbl[offset], q2);
     492      817920 :         _AVX512_STORE(&q_dbl[2*offset], q3);
     493      817920 :         _AVX512_STORE(&q_dbl[3*offset], q4);
     494             : 
     495     1635840 :         h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
     496     1635840 :         h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
     497             : 
     498     1635840 :         q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
     499     1635840 :         q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
     500     1635840 :         q3 = _AVX512_LOAD(&q_dbl[(ldq*2)+2*offset]);
     501     1635840 :         q4 = _AVX512_LOAD(&q_dbl[(ldq*2)+3*offset]);
     502             : 
     503      817920 :         q1 = _AVX512_ADD(q1, x1);
     504      817920 :         q2 = _AVX512_ADD(q2, x2);
     505      817920 :         q3 = _AVX512_ADD(q3, x3);
     506      817920 :         q4 = _AVX512_ADD(q4, x4);
     507             : 
     508      817920 :         tmp1 = _AVX512_MUL(h2_imag, y1);
     509             : 
     510     2453760 :         q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     511             : 
     512      817920 :         tmp2 = _AVX512_MUL(h2_imag, y2);
     513             : 
     514     2453760 :         q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     515             : 
     516      817920 :         tmp3 = _AVX512_MUL(h2_imag, y3);
     517             : 
     518     2453760 :         q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     519             : 
     520      817920 :         tmp4 = _AVX512_MUL(h2_imag, y4);
     521             : 
     522     2453760 :         q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h2_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     523             : 
     524      817920 :         _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
     525      817920 :         _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
     526      817920 :         _AVX512_STORE(&q_dbl[(ldq*2)+2*offset], q3);
     527      817920 :         _AVX512_STORE(&q_dbl[(ldq*2)+3*offset], q4);
     528             : 
     529    25355520 :         for (i = 2; i < nb; i++)
     530             :         {
     531    49075200 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     532    49075200 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     533    49075200 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     534    49075200 :                 q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     535             : 
     536    49075200 :                 h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
     537    49075200 :                 h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
     538             : 
     539    24537600 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
     540             : 
     541    73612800 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     542             : 
     543    24537600 :                 tmp2 = _AVX512_MUL(h1_imag, x2);
     544             : 
     545    73612800 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     546             : 
     547    24537600 :                 tmp3 = _AVX512_MUL(h1_imag, x3);
     548             : 
     549    73612800 :                 q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     550             : 
     551    24537600 :                 tmp4 = _AVX512_MUL(h1_imag, x4);
     552             : 
     553    73612800 :                 q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     554             : 
     555    49075200 :                 h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
     556    49075200 :                 h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
     557             : 
     558    24537600 :                 tmp1 = _AVX512_MUL(h2_imag, y1);
     559             : 
     560    73612800 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     561             : 
     562    24537600 :                 tmp2 = _AVX512_MUL(h2_imag, y2);
     563             : 
     564    73612800 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     565             : 
     566    24537600 :                 tmp3 = _AVX512_MUL(h2_imag, y3);
     567             : 
     568    73612800 :                 q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     569             : 
     570    24537600 :                 tmp4 = _AVX512_MUL(h2_imag, y4);
     571             : 
     572    73612800 :                 q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h2_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     573             : 
     574    24537600 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
     575    24537600 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     576    24537600 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
     577    24537600 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
     578             :         }
     579             : 
     580     1635840 :         h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
     581     1635840 :         h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
     582             : 
     583     1635840 :         q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
     584     1635840 :         q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
     585     1635840 :         q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
     586     1635840 :         q4 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
     587             : 
     588      817920 :         tmp1 = _AVX512_MUL(h1_imag, x1);
     589             : 
     590     2453760 :         q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     591             : 
     592      817920 :         tmp2 = _AVX512_MUL(h1_imag, x2);
     593             : 
     594     2453760 :         q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     595             : 
     596      817920 :         tmp3 = _AVX512_MUL(h1_imag, x3);
     597             : 
     598     2453760 :         q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     599             : 
     600      817920 :         tmp4 = _AVX512_MUL(h1_imag, x4);
     601             : 
     602     2453760 :         q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     603             : 
     604      817920 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
     605      817920 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
     606      817920 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
     607      817920 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
     608             : }
     609             : // Applies the coefficients of two hh columns (complex offsets 0 and ldh) to three vector registers of q per row, for rows 0..nb; the name suggests a two-Householder-vector transform -- confirm against callers.
     610             : // q is updated in place; s is passed by value, so only the local copy is overwritten. Row i of q starts at complex index i*ldq. 'offset' is defined outside this function (presumably scalars per register -- confirm).
     611             : #ifdef DOUBLE_PRECISION_COMPLEX
     612             : static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
     613             : #endif
     614             : #ifdef SINGLE_PRECISION_COMPLEX
     615             : static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
     616             : #endif
     617             : {
     618             :         // View the complex buffers as interleaved (real, imag) scalars: complex element k is read at scalar indices 2k and 2k+1.
     619             : #ifdef DOUBLE_PRECISION_COMPLEX
     620           0 :         double* q_dbl = (double*)q;
     621           0 :         double* hh_dbl = (double*)hh;
     622           0 :         double* s_dbl = (double*)(&s);
     623             : #endif
     624             : #ifdef SINGLE_PRECISION_COMPLEX
     625           0 :         float* q_dbl = (float*)q;
     626           0 :         float* hh_dbl = (float*)hh;
     627           0 :         float* s_dbl = (float*)(&s);
     628             : #endif
     629             :         __AVX512_DATATYPE x1, x2, x3, x4;
     630             :         __AVX512_DATATYPE y1, y2, y3, y4;
     631             :         __AVX512_DATATYPE q1, q2, q3, q4;
     632             :         __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
     633             :         __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
     634           0 :         int i=0;
     635             :         // NOTE: x4, y4, q4 and tmp4 declared above are never used in this 3-register kernel. 'sign' below has only the sign bit of each scalar set; XOR-ing with it negates a register.
     636             : #ifdef DOUBLE_PRECISION_COMPLEX
     637           0 :        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
     638             : #endif
     639             : #ifdef SINGLE_PRECISION_COMPLEX
     640           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     641             : #endif
     642             :         // Phase 1: build the accumulators. x is seeded with row 1 of q, y with row 0.
     643           0 :         x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);  // q1, q2, q3, q4
     644           0 :         x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);  // q5, q6, q7, q8
     645           0 :         x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
     646             : 
     647           0 :         h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
     648           0 :         h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
     649             : 
     650           0 :         y1 = _AVX512_LOAD(&q_dbl[0]);
     651           0 :         y2 = _AVX512_LOAD(&q_dbl[offset]);
     652           0 :         y3 = _AVX512_LOAD(&q_dbl[2*offset]);
     653             :         // y += hh[ldh+1] (*) row 1. MUL + SHUFFLE + FMSUBADD form one complex product per element; NOTE(review): FMSUBADD presumably gives the conjugated-coefficient product -- confirm against the macro definitions.
     654           0 :         tmp1 = _AVX512_MUL(h2_imag, x1);
     655             : 
     656           0 :         y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     657             : 
     658           0 :         tmp2 = _AVX512_MUL(h2_imag, x2);
     659             : 
     660           0 :         y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     661             : 
     662           0 :         tmp3 = _AVX512_MUL(h2_imag, x3);
     663             : 
     664           0 :         y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     665             :         // Rows 2..nb-1 feed both accumulators: x uses coefficient hh[i-1], y uses hh[ldh+i].
     666           0 :         for (i = 2; i < nb; i++)
     667             :         {
     668           0 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     669           0 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     670           0 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     671             : 
     672           0 :                 h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
     673           0 :                 h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
     674             : 
     675           0 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
     676             : 
     677           0 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     678             : 
     679           0 :                 tmp2 = _AVX512_MUL(h1_imag, q2);
     680             : 
     681           0 :                 x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     682             : 
     683           0 :                 tmp3 = _AVX512_MUL(h1_imag, q3);
     684             : 
     685           0 :                 x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     686             : 
     687           0 :                 h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
     688           0 :                 h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
     689             : 
     690           0 :                 tmp1 = _AVX512_MUL(h2_imag, q1);
     691             : 
     692           0 :                 y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     693             : 
     694           0 :                 tmp2 = _AVX512_MUL(h2_imag, q2);
     695             : 
     696           0 :                 y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     697             : 
     698           0 :                 tmp3 = _AVX512_MUL(h2_imag, q3);
     699             : 
     700           0 :                 y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     701             : 
     702             :         }
     703             :         // Row nb feeds x only, with coefficient hh[nb-1].
     704           0 :         h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
     705           0 :         h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
     706             : 
     707           0 :         q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
     708           0 :         q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
     709           0 :         q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
     710             : 
     711           0 :         tmp1 = _AVX512_MUL(h1_imag, q1);
     712             : 
     713           0 :         x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     714             : 
     715           0 :         tmp2 = _AVX512_MUL(h1_imag, q2);
     716             : 
     717           0 :         x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     718             : 
     719           0 :         tmp3 = _AVX512_MUL(h1_imag, q3);
     720             : 
     721           0 :         x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     722             :         // Scale x by -hh[0]: broadcast hh[0], flip the sign of both parts via XOR, then multiply (FMADDSUB form).
     723           0 :         h1_real = _AVX512_SET1(hh_dbl[0]);
     724           0 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
     725             : 
     726             : #ifdef DOUBLE_PRECISION_COMPLEX
     727           0 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
     728           0 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
     729             : #endif
     730             : #ifdef SINGLE_PRECISION_COMPLEX
     731           0 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
     732           0 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
     733             : #endif
     734           0 :         tmp1 = _AVX512_MUL(h1_imag, x1);
     735             : 
     736           0 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     737             : 
     738           0 :         tmp2 = _AVX512_MUL(h1_imag, x2);
     739             : 
     740           0 :         x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     741             : 
     742           0 :         tmp3 = _AVX512_MUL(h1_imag, x3);
     743             : 
     744           0 :         x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     745             :         // Both h1 and h2 are loaded from hh[ldh] and negated; h2 is used only to scale s, h1 later scales y.
     746           0 :         h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
     747           0 :         h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
     748           0 :         h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
     749           0 :         h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
     750             : 
     751             : #ifdef DOUBLE_PRECISION_COMPLEX
     752           0 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
     753           0 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
     754             : #endif
     755             : #ifdef SINGLE_PRECISION_COMPLEX
     756           0 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
     757           0 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
     758             : #endif
     759             : 
     760             : #ifdef DOUBLE_PRECISION_COMPLEX
     761           0 :         h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
     762           0 :         h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
     763             : #endif
     764             : #ifdef SINGLE_PRECISION_COMPLEX
     765           0 :         h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
     766           0 :         h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
     767             : #endif
     768             :         // Replicate the (real, imag) pair of s across the register, multiply it by -hh[ldh], and write the product back into the local s (mask 0x01 + 0x02 selects the first two scalars).
     769             : #ifdef DOUBLE_PRECISION_COMPLEX
     770           0 :         tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
     771           0 :                              s_dbl[1], s_dbl[0],
     772           0 :                              s_dbl[1], s_dbl[0],
     773           0 :                              s_dbl[1], s_dbl[0]);
     774             : #endif
     775             : #ifdef SINGLE_PRECISION_COMPLEX
     776           0 :         tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
     777             : #endif
     778           0 :         tmp1 = _AVX512_MUL(h2_imag, tmp2);
     779             : 
     780           0 :         tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     781             : 
     782             :         _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
     783             : 
     784           0 :         h2_real = _AVX512_SET1(s_dbl[0]);
     785           0 :         h2_imag = _AVX512_SET1(s_dbl[1]);
     786             :         // y = (-hh[ldh]) * y + (updated s) * x, with h1 = -hh[ldh] and h2 = the s value just written back.
     787           0 :         tmp1 = _AVX512_MUL(h1_imag, y1);
     788             : 
     789           0 :         y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     790             : 
     791           0 :         tmp2 = _AVX512_MUL(h1_imag, y2);
     792             : 
     793           0 :         y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     794             : 
     795           0 :         tmp3 = _AVX512_MUL(h1_imag, y3);
     796             : 
     797           0 :         y3 = _AVX512_FMADDSUB(h1_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     798             : 
     799           0 :         tmp1 = _AVX512_MUL(h2_imag, x1);
     800             : 
     801           0 :         y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     802             : 
     803           0 :         tmp2 = _AVX512_MUL(h2_imag, x2);
     804             : 
     805           0 :         y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     806             : 
     807           0 :         tmp3 = _AVX512_MUL(h2_imag, x3);
     808             : 
     809           0 :         y3 = _AVX512_ADD(y3, _AVX512_FMADDSUB(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     810             :         // Phase 2: write the update back into q. Row 0 += y.
     811           0 :         q1 = _AVX512_LOAD(&q_dbl[0]);
     812           0 :         q2 = _AVX512_LOAD(&q_dbl[offset]);
     813           0 :         q3 = _AVX512_LOAD(&q_dbl[2*offset]);
     814             : 
     815           0 :         q1 = _AVX512_ADD(q1, y1);
     816           0 :         q2 = _AVX512_ADD(q2, y2);
     817           0 :         q3 = _AVX512_ADD(q3, y3);
     818             : 
     819             :         _AVX512_STORE(&q_dbl[0], q1);
     820           0 :         _AVX512_STORE(&q_dbl[offset], q2);
     821           0 :         _AVX512_STORE(&q_dbl[2*offset], q3);
     822             :         // Row 1 += x + hh[ldh+1] * y (FMADDSUB product form from here on).
     823           0 :         h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
     824           0 :         h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
     825             : 
     826           0 :         q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
     827           0 :         q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
     828           0 :         q3 = _AVX512_LOAD(&q_dbl[(ldq*2)+2*offset]);
     829             : 
     830           0 :         q1 = _AVX512_ADD(q1, x1);
     831           0 :         q2 = _AVX512_ADD(q2, x2);
     832           0 :         q3 = _AVX512_ADD(q3, x3);
     833             : 
     834           0 :         tmp1 = _AVX512_MUL(h2_imag, y1);
     835             : 
     836           0 :         q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     837             : 
     838           0 :         tmp2 = _AVX512_MUL(h2_imag, y2);
     839             : 
     840           0 :         q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     841             : 
     842           0 :         tmp3 = _AVX512_MUL(h2_imag, y3);
     843             : 
     844           0 :         q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     845             : 
     846           0 :         _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
     847           0 :         _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
     848           0 :         _AVX512_STORE(&q_dbl[(ldq*2)+2*offset], q3);
     849             :         // Rows 2..nb-1: row i += hh[i-1] * x + hh[ldh+i] * y.
     850           0 :         for (i = 2; i < nb; i++)
     851             :         {
     852           0 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     853           0 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     854           0 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     855             : 
     856           0 :                 h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
     857           0 :                 h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
     858             : 
     859           0 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
     860             : 
     861           0 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     862             : 
     863           0 :                 tmp2 = _AVX512_MUL(h1_imag, x2);
     864             : 
     865           0 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     866             : 
     867           0 :                 tmp3 = _AVX512_MUL(h1_imag, x3);
     868             : 
     869           0 :                 q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     870             : 
     871           0 :                 h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
     872           0 :                 h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
     873             : 
     874           0 :                 tmp1 = _AVX512_MUL(h2_imag, y1);
     875             : 
     876           0 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     877             : 
     878           0 :                 tmp2 = _AVX512_MUL(h2_imag, y2);
     879             : 
     880           0 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     881             : 
     882           0 :                 tmp3 = _AVX512_MUL(h2_imag, y3);
     883             : 
     884           0 :                 q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     885             : 
     886           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
     887           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     888           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
     889             :         }
     890             :         // Row nb += hh[nb-1] * x (no y contribution for the last row).
     891           0 :         h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
     892           0 :         h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
     893             : 
     894           0 :         q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
     895           0 :         q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
     896           0 :         q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
     897             : 
     898           0 :         tmp1 = _AVX512_MUL(h1_imag, x1);
     899             : 
     900           0 :         q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     901             : 
     902           0 :         tmp2 = _AVX512_MUL(h1_imag, x2);
     903             : 
     904           0 :         q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     905             : 
     906           0 :         tmp3 = _AVX512_MUL(h1_imag, x3);
     907             : 
     908           0 :         q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     909             : 
     910           0 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
     911           0 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
     912           0 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
     913             : }
     914             : 
     915             : 
     916             : #ifdef DOUBLE_PRECISION_COMPLEX
     917             : static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
     918             : #endif
     919             : #ifdef SINGLE_PRECISION_COMPLEX
     920             : static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
     921             : #endif
     922             : {
     923             : 
     924             : #ifdef DOUBLE_PRECISION_COMPLEX
     925      327168 :         double* q_dbl = (double*)q;
     926      327168 :         double* hh_dbl = (double*)hh;
     927      327168 :         double* s_dbl = (double*)(&s);
     928             : #endif
     929             : #ifdef SINGLE_PRECISION_COMPLEX
     930       81792 :         float* q_dbl = (float*)q;
     931       81792 :         float* hh_dbl = (float*)hh;
     932       81792 :         float* s_dbl = (float*)(&s);
     933             : #endif
     934             : 
     935             :         __AVX512_DATATYPE x1, x2;
     936             :         __AVX512_DATATYPE y1, y2;
     937             :         __AVX512_DATATYPE q1, q2;
     938             :         __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
     939             :         __AVX512_DATATYPE tmp1, tmp2;
     940      408960 :         int i=0;
     941             : 
     942             : #ifdef DOUBLE_PRECISION_COMPLEX
     943      327168 :        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
     944             : #endif
     945             : #ifdef SINGLE_PRECISION_COMPLEX
     946       81792 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     947             : #endif
     948             : 
     949      817920 :         x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);
     950      817920 :         x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);
     951             : 
     952      817920 :         h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
     953      817920 :         h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
     954             : 
     955      408960 :         y1 = _AVX512_LOAD(&q_dbl[0]);
     956      817920 :         y2 = _AVX512_LOAD(&q_dbl[offset]);
     957             : 
     958      408960 :         tmp1 = _AVX512_MUL(h2_imag, x1);
     959             : 
     960     1226880 :         y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     961             : 
     962      408960 :         tmp2 = _AVX512_MUL(h2_imag, x2);
     963             : 
     964     1226880 :         y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     965             : 
     966    12677760 :         for (i = 2; i < nb; i++)
     967             :         {
     968    24537600 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     969    24537600 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     970             : 
     971    24537600 :                 h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
     972    24537600 :                 h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
     973             : 
     974    12268800 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
     975             : 
     976    36806400 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     977             : 
     978    12268800 :                 tmp2 = _AVX512_MUL(h1_imag, q2);
     979             : 
     980    36806400 :                 x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     981             : 
     982    24537600 :                 h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
     983    24537600 :                 h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
     984             : 
     985    12268800 :                 tmp1 = _AVX512_MUL(h2_imag, q1);
     986             : 
     987    36806400 :                 y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     988             : 
     989    12268800 :                 tmp2 = _AVX512_MUL(h2_imag, q2);
     990             : 
     991    36806400 :                 y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     992             :         }
     993             : 
     994      817920 :         h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
     995      817920 :         h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
     996             : 
     997      817920 :         q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
     998      817920 :         q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
     999             : 
    1000      408960 :         tmp1 = _AVX512_MUL(h1_imag, q1);
    1001             : 
    1002     1226880 :         x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1003             : 
    1004      408960 :         tmp2 = _AVX512_MUL(h1_imag, q2);
    1005             : 
    1006     1226880 :         x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1007             : 
    1008      817920 :         h1_real = _AVX512_SET1(hh_dbl[0]);
    1009      817920 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
    1010             : 
    1011             : #ifdef DOUBLE_PRECISION_COMPLEX
    1012      654336 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
    1013      654336 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
    1014             : #endif
    1015             : #ifdef SINGLE_PRECISION_COMPLEX
    1016      163584 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
    1017      163584 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
    1018             : #endif
    1019             : 
    1020             : 
    1021      408960 :         tmp1 = _AVX512_MUL(h1_imag, x1);
    1022             : 
    1023      817920 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1024             : 
    1025      408960 :         tmp2 = _AVX512_MUL(h1_imag, x2);
    1026             : 
    1027      817920 :         x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
    1028             : 
    1029      817920 :         h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
    1030      817920 :         h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
    1031      817920 :         h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
    1032      817920 :         h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
    1033             : 
    1034             : #ifdef DOUBLE_PRECISION_COMPLEX
    1035      654336 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
    1036      654336 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
    1037             : #endif
    1038             : #ifdef SINGLE_PRECISION_COMPLEX
    1039      163584 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
    1040      163584 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
    1041             : #endif
    1042             : 
    1043             : #ifdef DOUBLE_PRECISION_COMPLEX
    1044      654336 :         h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
    1045      654336 :         h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
    1046             : #endif
    1047             : #ifdef SINGLE_PRECISION_COMPLEX
    1048      163584 :         h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
    1049      163584 :         h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
    1050             : #endif
    1051             : 
    1052             : #ifdef DOUBLE_PRECISION_COMPLEX
    1053     1635840 :         tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
    1054      327168 :                              s_dbl[1], s_dbl[0],
    1055      327168 :                              s_dbl[1], s_dbl[0],
    1056      327168 :                              s_dbl[1], s_dbl[0]);
    1057             : #endif
    1058             : #ifdef SINGLE_PRECISION_COMPLEX
    1059      163584 :         tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
    1060             : #endif
    1061             : 
    1062      408960 :         tmp1 = _AVX512_MUL(h2_imag, tmp2);
    1063             : 
    1064      817920 :         tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1065             : 
    1066             :         _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
    1067             : 
    1068      817920 :         h2_real = _AVX512_SET1(s_dbl[0]);
    1069      817920 :         h2_imag = _AVX512_SET1(s_dbl[1]);
    1070             : 
    1071      408960 :         tmp1 = _AVX512_MUL(h1_imag, y1);
    1072             : 
    1073      817920 :         y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1074             : 
    1075      408960 :         tmp2 = _AVX512_MUL(h1_imag, y2);
    1076             : 
    1077      817920 :         y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
    1078             : 
    1079      408960 :         tmp1 = _AVX512_MUL(h2_imag, x1);
    1080             : 
    1081     1226880 :         y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1082             : 
    1083      408960 :         tmp2 = _AVX512_MUL(h2_imag, x2);
    1084             : 
    1085     1226880 :         y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1086             : 
    1087      408960 :         q1 = _AVX512_LOAD(&q_dbl[0]);
    1088      817920 :         q2 = _AVX512_LOAD(&q_dbl[offset]);
    1089             : 
    1090      408960 :         q1 = _AVX512_ADD(q1, y1);
    1091      408960 :         q2 = _AVX512_ADD(q2, y2);
    1092             : 
    1093             :         _AVX512_STORE(&q_dbl[0], q1);
    1094      408960 :         _AVX512_STORE(&q_dbl[offset], q2);
    1095             : 
    1096      817920 :         h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
    1097      817920 :         h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
    1098             : 
    1099      817920 :         q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
    1100      817920 :         q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
    1101             : 
    1102      408960 :         q1 = _AVX512_ADD(q1, x1);
    1103      408960 :         q2 = _AVX512_ADD(q2, x2);
    1104             : 
    1105      408960 :         tmp1 = _AVX512_MUL(h2_imag, y1);
    1106             : 
    1107     1226880 :         q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1108             : 
    1109      408960 :         tmp2 = _AVX512_MUL(h2_imag, y2);
    1110             : 
    1111     1226880 :         q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1112             : 
    1113      408960 :         _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
    1114      408960 :         _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
    1115             : 
    1116    12677760 :         for (i = 2; i < nb; i++)
    1117             :         {
    1118    24537600 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
    1119    24537600 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
    1120             : 
    1121    24537600 :                 h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
    1122    24537600 :                 h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
    1123             : 
    1124    12268800 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
    1125             : 
    1126    36806400 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1127             : 
    1128    12268800 :                 tmp2 = _AVX512_MUL(h1_imag, x2);
    1129             : 
    1130    36806400 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1131             : 
    1132    24537600 :                 h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
    1133    24537600 :                 h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
    1134             : 
    1135    12268800 :                 tmp1 = _AVX512_MUL(h2_imag, y1);
    1136             : 
    1137    36806400 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1138             : 
    1139    12268800 :                 tmp2 = _AVX512_MUL(h2_imag, y2);
    1140             : 
    1141    36806400 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1142             : 
    1143    12268800 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
    1144    12268800 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    1145             :         }
    1146      817920 :         h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
    1147      817920 :         h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
    1148             : 
    1149      817920 :         q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
    1150      817920 :         q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
    1151             : 
    1152      408960 :         tmp1 = _AVX512_MUL(h1_imag, x1);
    1153             : 
    1154     1226880 :         q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1155             : 
    1156      408960 :         tmp2 = _AVX512_MUL(h1_imag, x2);
    1157             : 
    1158     1226880 :         q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1159             : 
    1160      408960 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
    1161      408960 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
    1162             : }
    1163             : 
    1164             : #ifdef DOUBLE_PRECISION_COMPLEX
    1165             : static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
    1166             : #endif
    1167             : #ifdef SINGLE_PRECISION_COMPLEX
    1168             : static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
    1169             : #endif
    1170             : {
    1171             : 
    1172             : #ifdef DOUBLE_PRECISION_COMPLEX
    1173           0 :         double* q_dbl = (double*)q;
    1174           0 :         double* hh_dbl = (double*)hh;
    1175           0 :         double* s_dbl = (double*)(&s);
    1176             : #endif
    1177             : #ifdef SINGLE_PRECISION_COMPLEX
    1178       81792 :         float* q_dbl = (float*)q;
    1179       81792 :         float* hh_dbl = (float*)hh;
    1180       81792 :         float* s_dbl = (float*)(&s);
    1181             : #endif
    1182             : 
    1183             :         __AVX512_DATATYPE x1, x2;
    1184             :         __AVX512_DATATYPE y1, y2;
    1185             :         __AVX512_DATATYPE q1, q2;
    1186             :         __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
    1187             :         __AVX512_DATATYPE tmp1, tmp2;
    1188       81792 :         int i=0;
    1189             : 
    1190             : #ifdef DOUBLE_PRECISION_COMPLEX
    1191           0 :        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
    1192             : #endif
    1193             : #ifdef SINGLE_PRECISION_COMPLEX
    1194       81792 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
    1195             : #endif
    1196             : 
    1197      163584 :         x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);
    1198             : 
    1199      163584 :         h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
    1200      163584 :         h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
    1201             : 
    1202       81792 :         y1 = _AVX512_LOAD(&q_dbl[0]);
    1203             : 
    1204       81792 :         tmp1 = _AVX512_MUL(h2_imag, x1);
    1205             : 
    1206      245376 :         y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1207             : 
    1208     2535552 :         for (i = 2; i < nb; i++)
    1209             :         {
    1210     4907520 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
    1211     4907520 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
    1212             : 
    1213     4907520 :                 h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
    1214     4907520 :                 h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
    1215             : 
    1216     2453760 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
    1217             : 
    1218     7361280 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1219             : 
    1220     4907520 :                 h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
    1221     4907520 :                 h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
    1222             : 
    1223     2453760 :                 tmp1 = _AVX512_MUL(h2_imag, q1);
    1224             : 
    1225     7361280 :                 y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1226             : 
    1227             :         }
    1228             : 
    1229      163584 :         h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
    1230      163584 :         h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
    1231             : 
    1232      163584 :         q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
    1233             : 
    1234       81792 :         tmp1 = _AVX512_MUL(h1_imag, q1);
    1235             : 
    1236      245376 :         x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1237             : 
    1238      163584 :         h1_real = _AVX512_SET1(hh_dbl[0]);
    1239      163584 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
    1240             : 
    1241             : #ifdef DOUBLE_PRECISION_COMPLEX
    1242           0 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
    1243           0 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
    1244             : #endif
    1245             : #ifdef SINGLE_PRECISION_COMPLEX
    1246      163584 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
    1247      163584 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
    1248             : #endif
    1249             : 
    1250             : 
    1251       81792 :         tmp1 = _AVX512_MUL(h1_imag, x1);
    1252             : 
    1253      163584 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1254             : 
    1255      163584 :         h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
    1256      163584 :         h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
    1257      163584 :         h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
    1258      163584 :         h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
    1259             : 
    1260             : #ifdef DOUBLE_PRECISION_COMPLEX
    1261           0 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
    1262           0 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
    1263             : #endif
    1264             : #ifdef SINGLE_PRECISION_COMPLEX
    1265      163584 :         h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
    1266      163584 :         h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
    1267             : #endif
    1268             : 
    1269             : #ifdef DOUBLE_PRECISION_COMPLEX
    1270           0 :         h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
    1271           0 :         h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
    1272             : #endif
    1273             : #ifdef SINGLE_PRECISION_COMPLEX
    1274      163584 :         h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
    1275      163584 :         h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
    1276             : #endif
    1277             : 
    1278             : #ifdef DOUBLE_PRECISION_COMPLEX
    1279           0 :         tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
    1280           0 :                              s_dbl[1], s_dbl[0],
    1281           0 :                              s_dbl[1], s_dbl[0],
    1282           0 :                              s_dbl[1], s_dbl[0]);
    1283             : #endif
    1284             : #ifdef SINGLE_PRECISION_COMPLEX
    1285      163584 :         tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
    1286             : #endif
    1287             : 
    1288       81792 :         tmp1 = _AVX512_MUL(h2_imag, tmp2);
    1289             : 
    1290      163584 :         tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1291             : 
    1292             :         _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
    1293             : 
    1294      163584 :         h2_real = _AVX512_SET1(s_dbl[0]);
    1295      163584 :         h2_imag = _AVX512_SET1(s_dbl[1]);
    1296             : 
    1297       81792 :         tmp1 = _AVX512_MUL(h1_imag, y1);
    1298             : 
    1299      163584 :         y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1300             : 
    1301       81792 :         tmp1 = _AVX512_MUL(h2_imag, x1);
    1302             : 
    1303      245376 :         y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1304             : 
    1305       81792 :         q1 = _AVX512_LOAD(&q_dbl[0]);
    1306             : 
    1307       81792 :         q1 = _AVX512_ADD(q1, y1);
    1308             : 
    1309             :         _AVX512_STORE(&q_dbl[0], q1);
    1310             : 
    1311      163584 :         h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
    1312      163584 :         h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
    1313             : 
    1314      163584 :         q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
    1315             : 
    1316       81792 :         q1 = _AVX512_ADD(q1, x1);
    1317             : 
    1318       81792 :         tmp1 = _AVX512_MUL(h2_imag, y1);
    1319             : 
    1320      245376 :         q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1321             : 
    1322       81792 :         _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
    1323             : 
    1324     2535552 :         for (i = 2; i < nb; i++)
    1325             :         {
    1326     4907520 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
    1327             : 
    1328     4907520 :                 h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
    1329     4907520 :                 h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
    1330             : 
    1331     2453760 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
    1332             : 
    1333     7361280 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1334             : 
    1335     4907520 :                 h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
    1336     4907520 :                 h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
    1337             : 
    1338     2453760 :                 tmp1 = _AVX512_MUL(h2_imag, y1);
    1339             : 
    1340     7361280 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1341             : 
    1342     2453760 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
    1343             :         }
    1344      163584 :         h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
    1345      163584 :         h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
    1346             : 
    1347      163584 :         q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
    1348             : 
    1349       81792 :         tmp1 = _AVX512_MUL(h1_imag, x1);
    1350             : 
    1351      245376 :         q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1352             : 
    1353       81792 :         _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
    1354             : }
    1355             : 

Generated by: LCOV version 1.12