LCOV - code coverage report
Current view: top level - src/elpa2/kernels - complex_avx512_1hv_template.c (source / functions) Hit Total Coverage
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info Lines: 267 425 62.8 %
Date: 2018-01-10 09:29:53 Functions: 2 2 100.0 %

          Line data    Source code
       1             : //    This file is part of ELPA.
       2             : //
       3             : //    The ELPA library was originally created by the ELPA consortium,
       4             : //    consisting of the following organizations:
       5             : //
       6             : //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : //      Informatik,
      10             : //    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : //      Schwerpunkt Wissenschaftliches Rechnen ,
      12             : //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : //    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
      14             : //      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      15             : //      and
      16             : //    - IBM Deutschland GmbH
      17             : //
      18             : //    This particular source code file contains additions, changes and
      19             : //    enhancements authored by Intel Corporation which is not part of
      20             : //    the ELPA consortium.
      21             : //
      22             : //    More information can be found here:
      23             : //    http://elpa.mpcdf.mpg.de/
      24             : //
      25             : //    ELPA is free software: you can redistribute it and/or modify
      26             : //    it under the terms of the version 3 of the license of the
      27             : //    GNU Lesser General Public License as published by the Free
      28             : //    Software Foundation.
      29             : //
      30             : //    ELPA is distributed in the hope that it will be useful,
      31             : //    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : //    GNU Lesser General Public License for more details.
      34             : //
      35             : //    You should have received a copy of the GNU Lesser General Public License
      36             : //    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : //
      38             : //    ELPA reflects a substantial effort on the part of the original
      39             : //    ELPA consortium, and we ask you to respect the spirit of the
      40             : //    license that we chose: i.e., please contribute any changes you
      41             : //    may have back to the original ELPA library distribution, and keep
      42             : //    any derivatives of ELPA under the same license that we chose for
      43             : //    the original distribution, the GNU Lesser General Public License.
      44             : //
      45             : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
      46             : // --------------------------------------------------------------------------------------------------
      47             : 
      48             : 
      49             : #include "config-f90.h"
      50             : 
      51             : #include <complex.h>
      52             : #include <x86intrin.h>
      53             : #include <stdio.h>
      54             : #include <stdlib.h>
      55             : 
      56             : #define __forceinline __attribute__((always_inline))
      57             : 
      58             : #ifdef DOUBLE_PRECISION_COMPLEX
      59             : #define __AVX512_DATATYPE __m512d
      60             : #define _AVX512_LOAD _mm512_load_pd
      61             : #define _AVX512_STORE _mm512_store_pd
      62             : #define _AVX512_SET1 _mm512_set1_pd
      63             : #define _AVX512_MUL _mm512_mul_pd
      64             : #define _AVX512_ADD _mm512_add_pd
      65             : #define _AVX512_SHUFFLE _mm512_shuffle_pd
      66             : #define _AVX512_XOR _mm512_xor_pd
      67             : #define _AVX512_XOR_EPI _mm512_xor_epi64
      68             : #define _SHUFFLE 0x55
      69             : 
      70             : #ifdef HAVE_AVX512
      71             : 
      72             : #define __ELPA_USE_FMA__
      73             : #define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
      74             : #define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
      75             : 
      76             : #endif
      77             : 
      78             : #define _AVX512_FMADDSUB _mm512_FMADDSUB_pd
      79             : #define _AVX512_FMSUBADD _mm512_FMSUBADD_pd
      80             : #endif /* DOUBLE_PRECISION_COMPLEX */
      81             : 
      82             : #ifdef SINGLE_PRECISION_COMPLEX
      83             : #define __AVX512_DATATYPE __m512
      84             : #define _AVX512_LOAD _mm512_load_ps
      85             : #define _AVX512_STORE _mm512_store_ps
      86             : #define _AVX512_SET1 _mm512_set1_ps
      87             : #define _AVX512_MUL _mm512_mul_ps
      88             : #define _AVX512_ADD _mm512_add_ps
      89             : #define _AVX512_SHUFFLE _mm512_shuffle_ps
      90             : #define _AVX512_XOR _mm512_xor_ps
      91             : #define _AVX512_XOR_EPI _mm512_xor_epi32
      92             : #define _SHUFFLE 0xb1
      93             : 
      94             : #ifdef HAVE_AVX512
      95             : 
      96             : #define __ELPA_USE_FMA__
      97             : #define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
      98             : #define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
      99             : 
     100             : #endif
     101             : 
     102             : #define _AVX512_FMADDSUB _mm512_FMADDSUB_ps
     103             : #define _AVX512_FMSUBADD _mm512_FMSUBADD_ps
     104             : #endif /* SINGLE_PRECISION_COMPLEX */
     105             : 
     106             : 
     107             : //Forward declaration
     108             : #ifdef DOUBLE_PRECISION_COMPLEX
     109             : static  __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     110             : static  __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     111             : static  __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     112             : static  __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     113             : static  __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     114             : static  __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     115             : #endif
     116             : 
     117             : #ifdef SINGLE_PRECISION_COMPLEX
     118             : static  __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     119             : static  __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     120             : static  __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     121             : static  __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     122             : static  __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     123             : static  __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     124             : #endif
     125             : 
     126             : 
     127             : /*
     128             : !f>#if defined(HAVE_AVX512)
     129             : !f> interface
     130             : !f>   subroutine single_hh_trafo_complex_avx512_1hv_double(q, hh, pnb, pnq, pldq) &
     131             : !f>                             bind(C, name="single_hh_trafo_complex_avx512_1hv_double")
     132             : !f>     use, intrinsic :: iso_c_binding
     133             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq
     134             : !f>     ! complex(kind=c_double_complex)     :: q(*)
     135             : !f>     type(c_ptr), value                 :: q
     136             : !f>     complex(kind=c_double_complex)     :: hh(pnb,2)
     137             : !f>   end subroutine
     138             : !f> end interface
     139             : !f>#endif
     140             : */
     141             : /*
     142             : !f>#if defined(HAVE_AVX512)
     143             : !f> interface
     144             : !f>   subroutine single_hh_trafo_complex_avx512_1hv_single(q, hh, pnb, pnq, pldq) &
     145             : !f>                             bind(C, name="single_hh_trafo_complex_avx512_1hv_single")
     146             : !f>     use, intrinsic :: iso_c_binding
     147             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq
     148             : !f>     ! complex(kind=c_float_complex)     :: q(*)
     149             : !f>     type(c_ptr), value                  :: q
     150             : !f>     complex(kind=c_float_complex)     :: hh(pnb,2)
     151             : !f>   end subroutine
     152             : !f> end interface
     153             : !f>#endif
     154             : */
     155             : 
     156             : #ifdef DOUBLE_PRECISION_COMPLEX
     157    70951936 : void single_hh_trafo_complex_avx512_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
     158             : #endif
     159             : #ifdef SINGLE_PRECISION_COMPLEX
     160    24121856 : void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
     161             : #endif
     162             : {
     163             :         int i;
     164    95073792 :         int nb = *pnb;
     165    95073792 :         int nq = *pldq;
     166    95073792 :         int ldq = *pldq;
     167             :         int worked_on;
     168             :         //int ldh = *pldh;
     169             : 
     170    95073792 :         worked_on = 0;
     171             : 
     172             : #ifdef DOUBLE_PRECISION_COMPLEX
     173   210028544 :         for (i = 0; i < nq-20; i+=24)
     174             :         {
     175   139076608 :                 hh_trafo_complex_kernel_24_AVX512_1hv_double(&q[i], hh, nb, ldq);
     176   139076608 :                 worked_on += 24;
     177             :         }
     178             : #endif
     179             : 
     180             : #ifdef SINGLE_PRECISION_COMPLEX
     181    47536896 :         for (i = 0; i < nq-40; i+=48)
     182             :         {
     183    23415040 :                 hh_trafo_complex_kernel_48_AVX512_1hv_single(&q[i], hh, nb, ldq);
     184    23415040 :                 worked_on += 48;
     185             :         }
     186             : #endif
     187    95073792 :         if (nq == i)
     188             :         {
     189    91539712 :                 return;
     190             :         }
     191             : 
     192             : #ifdef DOUBLE_PRECISION_COMPLEX
     193     2827264 :         if (nq-i == 20)
     194             :         {
     195           0 :                 hh_trafo_complex_kernel_20_AVX512_1hv_double(&q[i], hh, nb, ldq);
     196           0 :                 worked_on += 20;
     197             :         }
     198             : #endif
     199             : 
     200             : #ifdef SINGLE_PRECISION_COMPLEX
     201      706816 :         if (nq-i == 40)
     202             :         {
     203      706816 :                 hh_trafo_complex_kernel_40_AVX512_1hv_single(&q[i], hh, nb, ldq);
     204      706816 :                 worked_on += 40;
     205             :         }
     206             : #endif
     207             : 
     208             : #ifdef DOUBLE_PRECISION_COMPLEX
     209     2827264 :         if (nq-i == 16)
     210             :         {
     211     2827264 :                 hh_trafo_complex_kernel_16_AVX512_1hv_double(&q[i], hh, nb, ldq);
     212     2827264 :                 worked_on += 16;
     213             :         }
     214             : #endif
     215             : 
     216             : #ifdef SINGLE_PRECISION_COMPLEX
     217      706816 :         if (nq-i == 32)
     218             :         {
     219           0 :                 hh_trafo_complex_kernel_32_AVX512_1hv_single(&q[i], hh, nb, ldq);
     220           0 :                 worked_on += 32;
     221             :         }
     222             : #endif
     223             : 
     224             : #ifdef DOUBLE_PRECISION_COMPLEX
     225     2827264 :         if (nq-i == 12)
     226             :         {
     227           0 :                 hh_trafo_complex_kernel_12_AVX512_1hv_double(&q[i], hh, nb, ldq);
     228           0 :                 worked_on += 12;
     229             :         }
     230             : #endif
     231             : 
     232             : #ifdef SINGLE_PRECISION_COMPLEX
     233      706816 :         if (nq-i == 24)
     234             :         {
     235           0 :                 hh_trafo_complex_kernel_24_AVX512_1hv_single(&q[i], hh, nb, ldq);
     236           0 :                 worked_on += 24;
     237             :         }
     238             : #endif
     239             : 
     240             : #ifdef DOUBLE_PRECISION_COMPLEX
     241     2827264 :         if (nq-i == 8)
     242             :         {
     243           0 :                 hh_trafo_complex_kernel_8_AVX512_1hv_double(&q[i], hh, nb, ldq);
     244           0 :                 worked_on += 8;
     245             :         }
     246             : #endif
     247             : 
     248             : #ifdef SINGLE_PRECISION_COMPLEX
     249      706816 :         if (nq-i == 16)
     250             :         {
     251           0 :                 hh_trafo_complex_kernel_16_AVX512_1hv_single(&q[i], hh, nb, ldq);
     252           0 :                 worked_on += 16;
     253             :         }
     254             : #endif
     255             : 
     256             : #ifdef DOUBLE_PRECISION_COMPLEX
     257     2827264 :         if (nq-i == 4)
     258             :         {
     259           0 :                 hh_trafo_complex_kernel_4_AVX512_1hv_double(&q[i], hh, nb, ldq);
     260           0 :                 worked_on += 4;
     261             :         }
     262             : #endif
     263             : 
     264             : #ifdef SINGLE_PRECISION_COMPLEX
     265      706816 :         if (nq-i == 8)
     266             :         {
     267           0 :                 hh_trafo_complex_kernel_8_AVX512_1hv_single(&q[i], hh, nb, ldq);
     268           0 :                 worked_on += 8;
     269             :         }
     270             : #endif
     271             : #ifdef WITH_DEBUG
     272             :         if (worked_on != nq)
     273             :         {
     274             :              printf("Error in complex AVX512 BLOCK 1 kernel \n");
     275             :              abort();
     276             :         }
     277             : #endif
     278             : }
     279             : 
// Widest 1-Householder-vector kernel: updates 24 double-complex (or 48
// single-complex) columns of q with the reflector stored in hh (length nb).
// Two passes over the rows of q:
//   1) accumulate x = sum_i conj(hh[i]) * q_row(i)  (row 0 enters x directly
//      via the initial loads — presumably hh[0] is normalized to 1; confirm),
//   2) scale x by the sign-flipped hh[0] (i.e. -tau — TODO confirm against
//      caller), then apply the rank-1 update q_row(i) += hh[i] * x.
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // Reinterpret the complex arrays as interleaved real/imag scalars so the
        // AVX-512 loads see plain re,im,re,im,... lanes.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif
        __AVX512_DATATYPE x1, x2, x3, x4, x5, x6;   // accumulators, 6 vectors = full row strip
        __AVX512_DATATYPE q1, q2, q3, q4, q5, q6;   // current row of q
        __AVX512_DATATYPE h1_real, h1_imag;         // broadcast hh[i] components
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
        int i=0;

        // Mask with only the sign bit set in every lane; XOR-ing with it negates
        // all lanes of a vector.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // One 512-bit vector holds 4 double-complex (8 doubles) or 8
        // single-complex (16 floats) values; offset is that stride in scalars.
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 16
#endif


        // Row 0 of q seeds the accumulators directly.
        x1 = _AVX512_LOAD(&q_dbl[0]);    // complex 1, 2, 3, 4
        x2 = _AVX512_LOAD(&q_dbl[offset]);    // complex 5, 6, 7, 8
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);   // complex 9, 10, 11, 12
        x4 = _AVX512_LOAD(&q_dbl[3*offset]);   // complex 13, 14, 15, 16
        x5 = _AVX512_LOAD(&q_dbl[4*offset]);   // complex 17, 18, 19, 20
        x6 = _AVX512_LOAD(&q_dbl[5*offset]);   // complex 21, 22, 23, 24

        // Pass 1: x += conj(hh[i]) * q_row(i). The MUL/SHUFFLE/FMSUBADD triple
        // is the vectorized complex multiply-by-conjugate: SHUFFLE swaps the
        // re/im pair within each complex lane (_SHUFFLE pattern), FMSUBADD
        // produces (re*re + im*im, re*im - im*re) per lane pair.
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                // 2*i*ldq scalars = row i of q (ldq complex elements per row).
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

                tmp1 = _AVX512_MUL(h1_imag, q1);

                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);

                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);

                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);

                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, q5);

                x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));

                tmp6 = _AVX512_MUL(h1_imag, q6);

                x6 = _AVX512_ADD(x6, _AVX512_FMSUBADD(h1_real, q6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
        }

        // Broadcast hh[0] and flip the sign of both components via the XOR mask.
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif

        // x *= (negated hh[0]); MUL/SHUFFLE/FMADDSUB is the plain (non-conjugate)
        // complex multiply counterpart of the pattern used in pass 1.
        tmp1 = _AVX512_MUL(h1_imag, x1);

        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);

        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);

        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);

        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        tmp5 = _AVX512_MUL(h1_imag, x5);

        x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE));

        tmp6 = _AVX512_MUL(h1_imag, x6);

        x6 = _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE));

        // Row 0 update: q_row(0) += x (hh[0] contributes with implicit factor 1).
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);
        q5 = _AVX512_LOAD(&q_dbl[4*offset]);
        q6 = _AVX512_LOAD(&q_dbl[5*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);
        q5 = _AVX512_ADD(q5, x5);
        q6 = _AVX512_ADD(q6, x6);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);
        _AVX512_STORE(&q_dbl[4*offset], q5);
        _AVX512_STORE(&q_dbl[5*offset], q6);

        // Pass 2: rank-1 update of the remaining rows, q_row(i) += hh[i] * x.
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

                tmp1 = _AVX512_MUL(h1_imag, x1);

                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);

                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);

                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);

                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, x5);

                q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));

                tmp6 = _AVX512_MUL(h1_imag, x6);

                q6 = _AVX512_ADD(q6, _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
        }
}
     462             : 
     463             : #ifdef DOUBLE_PRECISION_COMPLEX
     464             : static __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     465             : #endif
     466             : #ifdef SINGLE_PRECISION_COMPLEX
     467             : static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     468             : #endif
     469             : {
     470             : 
     471             : #ifdef DOUBLE_PRECISION_COMPLEX
     472           0 :         double* q_dbl = (double*)q;
     473           0 :         double* hh_dbl = (double*)hh;
     474             : #endif
     475             : #ifdef SINGLE_PRECISION_COMPLEX
     476      706816 :         float* q_dbl = (float*)q;
     477      706816 :         float* hh_dbl = (float*)hh;
     478             : #endif
     479             :         __AVX512_DATATYPE x1, x2, x3, x4, x5, x6;
     480             :         __AVX512_DATATYPE q1, q2, q3, q4, q5, q6;
     481             :         __AVX512_DATATYPE h1_real, h1_imag;
     482             :         __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
     483      706816 :         int i=0;
     484             : 
     485             : #ifdef DOUBLE_PRECISION_COMPLEX
     486           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
     487             : #endif
     488             : #ifdef SINGLE_PRECISION_COMPLEX
     489      706816 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     490             : #endif
     491             : 
     492             : #ifdef DOUBLE_PRECISION_COMPLEX
     493             : #define offset 8
     494             : #endif
     495             : #ifdef SINGLE_PRECISION_COMPLEX
     496             : #define offset 16
     497             : #endif
     498             : 
     499             : 
     500      706816 :         x1 = _AVX512_LOAD(&q_dbl[0]);    // complex 1, 2, 3, 4
     501     1413632 :         x2 = _AVX512_LOAD(&q_dbl[offset]);    // complex 5, 6, 7, 8
     502     1413632 :         x3 = _AVX512_LOAD(&q_dbl[2*offset]);   // complex 9, 10, 11, 12
     503     1413632 :         x4 = _AVX512_LOAD(&q_dbl[3*offset]);   // complex 13, 14, 15, 16
     504     1413632 :         x5 = _AVX512_LOAD(&q_dbl[4*offset]);   // complex 17, 18, 19, 20
     505             : 
     506    21020672 :         for (i = 1; i < nb; i++)
     507             :         {
     508    40627712 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
     509    40627712 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
     510             : 
     511    40627712 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     512    40627712 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     513    40627712 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     514    40627712 :                 q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     515    40627712 :                 q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
     516             : 
     517    20313856 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
     518             : 
     519    60941568 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     520             : 
     521    20313856 :                 tmp2 = _AVX512_MUL(h1_imag, q2);
     522             : 
     523    60941568 :                 x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     524             : 
     525    20313856 :                 tmp3 = _AVX512_MUL(h1_imag, q3);
     526             : 
     527    60941568 :                 x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     528             : 
     529    20313856 :                 tmp4 = _AVX512_MUL(h1_imag, q4);
     530             : 
     531    60941568 :                 x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     532             : 
     533    20313856 :                 tmp5 = _AVX512_MUL(h1_imag, q5);
     534             : 
     535    60941568 :                 x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     536             : 
     537             :         }
     538             : 
     539     1413632 :         h1_real = _AVX512_SET1(hh_dbl[0]);
     540     1413632 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
     541             : 
     542             : #ifdef DOUBLE_PRECISION_COMPLEX
     543           0 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
     544           0 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
     545             : #endif
     546             : #ifdef SINGLE_PRECISION_COMPLEX
     547     1413632 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
     548     1413632 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
     549             : #endif
     550             : 
     551      706816 :         tmp1 = _AVX512_MUL(h1_imag, x1);
     552             : 
     553     1413632 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     554             : 
     555      706816 :         tmp2 = _AVX512_MUL(h1_imag, x2);
     556             : 
     557     1413632 :         x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     558             : 
     559      706816 :         tmp3 = _AVX512_MUL(h1_imag, x3);
     560             : 
     561     1413632 :         x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     562             : 
     563      706816 :         tmp4 = _AVX512_MUL(h1_imag, x4);
     564             : 
     565     1413632 :         x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     566             : 
     567      706816 :         tmp5 = _AVX512_MUL(h1_imag, x5);
     568             : 
     569     1413632 :         x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE));
     570             : 
     571      706816 :         q1 = _AVX512_LOAD(&q_dbl[0]);
     572     1413632 :         q2 = _AVX512_LOAD(&q_dbl[offset]);
     573     1413632 :         q3 = _AVX512_LOAD(&q_dbl[2*offset]);
     574     1413632 :         q4 = _AVX512_LOAD(&q_dbl[3*offset]);
     575     1413632 :         q5 = _AVX512_LOAD(&q_dbl[4*offset]);
     576             : 
     577      706816 :         q1 = _AVX512_ADD(q1, x1);
     578      706816 :         q2 = _AVX512_ADD(q2, x2);
     579      706816 :         q3 = _AVX512_ADD(q3, x3);
     580      706816 :         q4 = _AVX512_ADD(q4, x4);
     581      706816 :         q5 = _AVX512_ADD(q5, x5);
     582             : 
     583             :         _AVX512_STORE(&q_dbl[0], q1);
     584      706816 :         _AVX512_STORE(&q_dbl[offset], q2);
     585      706816 :         _AVX512_STORE(&q_dbl[2*offset], q3);
     586      706816 :         _AVX512_STORE(&q_dbl[3*offset], q4);
     587      706816 :         _AVX512_STORE(&q_dbl[4*offset], q5);
     588             : 
     589    21020672 :         for (i = 1; i < nb; i++)
     590             :         {
     591    40627712 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
     592    40627712 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
     593             : 
     594    40627712 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     595    40627712 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     596    40627712 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     597    40627712 :                 q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     598    40627712 :                 q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
     599             : 
     600    20313856 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
     601             : 
     602    60941568 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     603             : 
     604    20313856 :                 tmp2 = _AVX512_MUL(h1_imag, x2);
     605             : 
     606    60941568 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     607             : 
     608    20313856 :                 tmp3 = _AVX512_MUL(h1_imag, x3);
     609             : 
     610    60941568 :                 q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     611             : 
     612    20313856 :                 tmp4 = _AVX512_MUL(h1_imag, x4);
     613             : 
     614    60941568 :                 q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     615             : 
     616    20313856 :                 tmp5 = _AVX512_MUL(h1_imag, x5);
     617             : 
     618    60941568 :                 q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     619             : 
     620    20313856 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
     621    20313856 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     622    20313856 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
     623    20313856 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
     624    20313856 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
     625             :         }
     626             : }
     627             : 
     628             : 
     629             : #ifdef DOUBLE_PRECISION_COMPLEX
     630             : static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     631             : #endif
     632             : #ifdef SINGLE_PRECISION_COMPLEX
     633             : static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     634             : #endif
     635             : {
     636             : 
     637             : #ifdef DOUBLE_PRECISION_COMPLEX
     638     2827264 :         double* q_dbl = (double*)q;
     639     2827264 :         double* hh_dbl = (double*)hh;
     640             : #endif
     641             : #ifdef SINGLE_PRECISION_COMPLEX
     642           0 :         float* q_dbl = (float*)q;
     643           0 :         float* hh_dbl = (float*)hh;
     644             : #endif
     645             : 
     646             :         __AVX512_DATATYPE x1, x2, x3, x4;
     647             :         __AVX512_DATATYPE q1, q2, q3, q4;
     648             :         __AVX512_DATATYPE h1_real, h1_imag;
     649             :         __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
     650     2827264 :         int i=0;
     651             : 
     652             : #ifdef DOUBLE_PRECISION_COMPLEX
     653     2827264 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
     654             : #endif
     655             : #ifdef SINGLE_PRECISION_COMPLEX
     656           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     657             : #endif
     658             : 
     659     2827264 :         x1 = _AVX512_LOAD(&q_dbl[0]);   // complex 1 2 3 4
     660     5654528 :         x2 = _AVX512_LOAD(&q_dbl[offset]);
     661     5654528 :         x3 = _AVX512_LOAD(&q_dbl[2*offset]);
     662     5654528 :         x4 = _AVX512_LOAD(&q_dbl[3*offset]);  // comlex 13 14 15 16
     663             : 
     664    84082688 :         for (i = 1; i < nb; i++)
     665             :         {
     666   162510848 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
     667   162510848 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
     668             : 
     669   162510848 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     670   162510848 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     671   162510848 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     672   162510848 :                 q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     673             : 
     674    81255424 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
     675             : 
     676   243766272 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     677             : 
     678    81255424 :                 tmp2 = _AVX512_MUL(h1_imag, q2);
     679             : 
     680   243766272 :                 x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     681             : 
     682    81255424 :                 tmp3 = _AVX512_MUL(h1_imag, q3);
     683             : 
     684   243766272 :                 x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     685             : 
     686    81255424 :                 tmp4 = _AVX512_MUL(h1_imag, q4);
     687             : 
     688   243766272 :                 x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     689             :         }
     690             : 
     691     5654528 :         h1_real = _AVX512_SET1(hh_dbl[0]);
     692     5654528 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
     693             : 
     694             : #ifdef DOUBLE_PRECISION_COMPLEX
     695     5654528 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
     696     5654528 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
     697             : #endif
     698             : #ifdef SINGLE_PRECISION_COMPLEX
     699           0 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
     700           0 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
     701             : #endif
     702             : 
     703     2827264 :         tmp1 = _AVX512_MUL(h1_imag, x1);
     704             : 
     705     5654528 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     706             : 
     707     2827264 :         tmp2 = _AVX512_MUL(h1_imag, x2);
     708             : 
     709     5654528 :         x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     710             : 
     711     2827264 :         tmp3 = _AVX512_MUL(h1_imag, x3);
     712             : 
     713     5654528 :         x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     714             : 
     715     2827264 :         tmp4 = _AVX512_MUL(h1_imag, x4);
     716             : 
     717     5654528 :         x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     718             : 
     719     2827264 :         q1 = _AVX512_LOAD(&q_dbl[0]);
     720     5654528 :         q2 = _AVX512_LOAD(&q_dbl[offset]);
     721     5654528 :         q3 = _AVX512_LOAD(&q_dbl[2*offset]);
     722     5654528 :         q4 = _AVX512_LOAD(&q_dbl[3*offset]);
     723             : 
     724     2827264 :         q1 = _AVX512_ADD(q1, x1);
     725     2827264 :         q2 = _AVX512_ADD(q2, x2);
     726     2827264 :         q3 = _AVX512_ADD(q3, x3);
     727     2827264 :         q4 = _AVX512_ADD(q4, x4);
     728             : 
     729             :         _AVX512_STORE(&q_dbl[0], q1);
     730     2827264 :         _AVX512_STORE(&q_dbl[offset], q2);
     731     2827264 :         _AVX512_STORE(&q_dbl[2*offset], q3);
     732     2827264 :         _AVX512_STORE(&q_dbl[3*offset], q4);
     733             : 
     734    84082688 :         for (i = 1; i < nb; i++)
     735             :         {
     736   162510848 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
     737   162510848 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
     738             : 
     739   162510848 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     740   162510848 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     741   162510848 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     742   162510848 :                 q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     743             : 
     744    81255424 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
     745             : 
     746   243766272 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     747             : 
     748    81255424 :                 tmp2 = _AVX512_MUL(h1_imag, x2);
     749             : 
     750   243766272 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     751             : 
     752    81255424 :                 tmp3 = _AVX512_MUL(h1_imag, x3);
     753             : 
     754   243766272 :                 q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     755             : 
     756    81255424 :                 tmp4 = _AVX512_MUL(h1_imag, x4);
     757             : 
     758   243766272 :                 q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     759             : 
     760    81255424 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
     761    81255424 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     762    81255424 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
     763    81255424 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
     764             :         }
     765             : }
     766             : 
     767             : #ifdef DOUBLE_PRECISION_COMPLEX
     768             : static __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     769             : #endif
     770             : #ifdef SINGLE_PRECISION_COMPLEX
     771             : static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     772             : #endif
     773             : {
     774             : 
     775             : #ifdef DOUBLE_PRECISION_COMPLEX
     776           0 :         double* q_dbl = (double*)q;
     777           0 :         double* hh_dbl = (double*)hh;
     778             : #endif
     779             : #ifdef SINGLE_PRECISION_COMPLEX
     780           0 :         float* q_dbl = (float*)q;
     781           0 :         float* hh_dbl = (float*)hh;
     782             : #endif
     783             : 
     784             :         __AVX512_DATATYPE x1, x2, x3, x4;
     785             :         __AVX512_DATATYPE q1, q2, q3, q4;
     786             :         __AVX512_DATATYPE h1_real, h1_imag;
     787             :         __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
     788           0 :         int i=0;
     789             : 
     790             : #ifdef DOUBLE_PRECISION_COMPLEX
     791           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
     792             : #endif
     793             : #ifdef SINGLE_PRECISION_COMPLEX
     794           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     795             : #endif
     796             : 
     797           0 :         x1 = _AVX512_LOAD(&q_dbl[0]);   // complex 1 2 3 4
     798           0 :         x2 = _AVX512_LOAD(&q_dbl[offset]);
     799           0 :         x3 = _AVX512_LOAD(&q_dbl[2*offset]);
     800             : 
     801           0 :         for (i = 1; i < nb; i++)
     802             :         {
     803           0 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
     804           0 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
     805             : 
     806           0 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     807           0 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     808           0 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     809             : 
     810           0 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
     811             : 
     812           0 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     813             : 
     814           0 :                 tmp2 = _AVX512_MUL(h1_imag, q2);
     815             : 
     816           0 :                 x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     817             : 
     818           0 :                 tmp3 = _AVX512_MUL(h1_imag, q3);
     819             : 
     820           0 :                 x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     821             : 
     822             :         }
     823             : 
     824           0 :         h1_real = _AVX512_SET1(hh_dbl[0]);
     825           0 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
     826             : 
     827             : #ifdef DOUBLE_PRECISION_COMPLEX
     828           0 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
     829           0 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
     830             : #endif
     831             : #ifdef SINGLE_PRECISION_COMPLEX
     832           0 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
     833           0 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
     834             : #endif
     835             : 
     836           0 :         tmp1 = _AVX512_MUL(h1_imag, x1);
     837             : 
     838           0 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     839             : 
     840           0 :         tmp2 = _AVX512_MUL(h1_imag, x2);
     841             : 
     842           0 :         x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     843             : 
     844           0 :         tmp3 = _AVX512_MUL(h1_imag, x3);
     845             : 
     846           0 :         x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     847             : 
     848           0 :         q1 = _AVX512_LOAD(&q_dbl[0]);
     849           0 :         q2 = _AVX512_LOAD(&q_dbl[offset]);
     850           0 :         q3 = _AVX512_LOAD(&q_dbl[2*offset]);
     851             : 
     852           0 :         q1 = _AVX512_ADD(q1, x1);
     853           0 :         q2 = _AVX512_ADD(q2, x2);
     854           0 :         q3 = _AVX512_ADD(q3, x3);
     855             : 
     856             :         _AVX512_STORE(&q_dbl[0], q1);
     857           0 :         _AVX512_STORE(&q_dbl[offset], q2);
     858           0 :         _AVX512_STORE(&q_dbl[2*offset], q3);
     859             : 
     860           0 :         for (i = 1; i < nb; i++)
     861             :         {
     862           0 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
     863           0 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
     864             : 
     865           0 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     866           0 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     867           0 :                 q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     868             : 
     869           0 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
     870             : 
     871           0 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     872             : 
     873           0 :                 tmp2 = _AVX512_MUL(h1_imag, x2);
     874             : 
     875           0 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     876             : 
     877           0 :                 tmp3 = _AVX512_MUL(h1_imag, x3);
     878             : 
     879           0 :                 q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     880             : 
     881           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
     882           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     883           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
     884             :         }
     885             : }
     886             : 
     887             : 
     888             : #ifdef DOUBLE_PRECISION_COMPLEX
     889             : static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     890             : #endif
     891             : #ifdef SINGLE_PRECISION_COMPLEX
     892             : static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     893             : #endif
     894             : {
     895             : 
     896             : #ifdef DOUBLE_PRECISION_COMPLEX
     897           0 :         double* q_dbl = (double*)q;
     898           0 :         double* hh_dbl = (double*)hh;
     899             : #endif
     900             : #ifdef SINGLE_PRECISION_COMPLEX
     901           0 :         float* q_dbl = (float*)q;
     902           0 :         float* hh_dbl = (float*)hh;
     903             : #endif
     904             :         __AVX512_DATATYPE x1, x2;
     905             :         __AVX512_DATATYPE q1, q2;
     906             :         __AVX512_DATATYPE h1_real, h1_imag;
     907             :         __AVX512_DATATYPE tmp1, tmp2;
     908           0 :         int i=0;
     909             : 
     910             : #ifdef DOUBLE_PRECISION_COMPLEX
     911           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
     912             : #endif
     913             : #ifdef SINGLE_PRECISION_COMPLEX
     914           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
     915             : #endif
     916             : 
     917           0 :         x1 = _AVX512_LOAD(&q_dbl[0]);
     918           0 :         x2 = _AVX512_LOAD(&q_dbl[offset]);
     919             : 
     920           0 :         for (i = 1; i < nb; i++)
     921             :         {
     922           0 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
     923           0 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
     924             : 
     925           0 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     926           0 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     927             : 
     928           0 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
     929           0 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     930           0 :                 tmp2 = _AVX512_MUL(h1_imag, q2);
     931           0 :                 x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     932             :         }
     933             : 
     934           0 :         h1_real = _AVX512_SET1(hh_dbl[0]);
     935           0 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
     936             : 
     937             : #ifdef DOUBLE_PRECISION_COMPLEX
     938           0 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
     939           0 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
     940             : #endif
     941             : #ifdef SINGLE_PRECISION_COMPLEX
     942           0 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
     943           0 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
     944             : #endif
     945             : 
     946           0 :         tmp1 = _AVX512_MUL(h1_imag, x1);
     947           0 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     948             : 
     949           0 :         tmp2 = _AVX512_MUL(h1_imag, x2);
     950             : 
     951           0 :         x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     952             : 
     953           0 :         q1 = _AVX512_LOAD(&q_dbl[0]);
     954           0 :         q2 = _AVX512_LOAD(&q_dbl[offset]);
     955             : 
     956           0 :         q1 = _AVX512_ADD(q1, x1);
     957           0 :         q2 = _AVX512_ADD(q2, x2);
     958             : 
     959             :         _AVX512_STORE(&q_dbl[0], q1);
     960           0 :         _AVX512_STORE(&q_dbl[offset], q2);
     961             : 
     962           0 :         for (i = 1; i < nb; i++)
     963             :         {
     964           0 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
     965           0 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
     966             : 
     967           0 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
     968           0 :                 q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
     969             : 
     970           0 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
     971           0 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     972             : 
     973           0 :                 tmp2 = _AVX512_MUL(h1_imag, x2);
     974             : 
     975           0 :                 q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     976             : 
     977           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
     978           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     979             :         }
     980             : }
     981             : 
     982             : 
     983             : #ifdef DOUBLE_PRECISION_COMPLEX
     984             : static __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     985             : #endif
     986             : #ifdef SINGLE_PRECISION_COMPLEX
     987             : static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     988             : #endif
     989             : {
     990             : 
     991             : #ifdef DOUBLE_PRECISION_COMPLEX
     992           0 :         double* q_dbl = (double*)q;
     993           0 :         double* hh_dbl = (double*)hh;
     994             : #endif
     995             : #ifdef SINGLE_PRECISION_COMPLEX
     996           0 :         float* q_dbl = (float*)q;
     997           0 :         float* hh_dbl = (float*)hh;
     998             : #endif
     999             :         __AVX512_DATATYPE x1, x2;
    1000             :         __AVX512_DATATYPE q1, q2;
    1001             :         __AVX512_DATATYPE h1_real, h1_imag;
    1002             :         __AVX512_DATATYPE tmp1, tmp2;
    1003           0 :         int i=0;
    1004             : 
    1005             : #ifdef DOUBLE_PRECISION_COMPLEX
    1006           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
    1007             : #endif
    1008             : #ifdef SINGLE_PRECISION_COMPLEX
    1009           0 :         __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
    1010             : #endif
    1011             : 
    1012           0 :         x1 = _AVX512_LOAD(&q_dbl[0]);
    1013             : 
    1014           0 :         for (i = 1; i < nb; i++)
    1015             :         {
    1016           0 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
    1017           0 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
    1018             : 
    1019           0 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
    1020             : 
    1021           0 :                 tmp1 = _AVX512_MUL(h1_imag, q1);
    1022           0 :                 x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1023             :         }
    1024             : 
    1025           0 :         h1_real = _AVX512_SET1(hh_dbl[0]);
    1026           0 :         h1_imag = _AVX512_SET1(hh_dbl[1]);
    1027             : 
    1028             : #ifdef DOUBLE_PRECISION_COMPLEX
    1029           0 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
    1030           0 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
    1031             : #endif
    1032             : #ifdef SINGLE_PRECISION_COMPLEX
    1033           0 :         h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
    1034           0 :         h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
    1035             : #endif
    1036             : 
    1037           0 :         tmp1 = _AVX512_MUL(h1_imag, x1);
    1038           0 :         x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1039             : 
    1040           0 :         q1 = _AVX512_LOAD(&q_dbl[0]);
    1041             : 
    1042           0 :         q1 = _AVX512_ADD(q1, x1);
    1043             : 
    1044             :         _AVX512_STORE(&q_dbl[0], q1);
    1045             : 
    1046           0 :         for (i = 1; i < nb; i++)
    1047             :         {
    1048           0 :                 h1_real = _AVX512_SET1(hh_dbl[i*2]);
    1049           0 :                 h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
    1050             : 
    1051           0 :                 q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
    1052             : 
    1053           0 :                 tmp1 = _AVX512_MUL(h1_imag, x1);
    1054           0 :                 q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1055             : 
    1056           0 :                 _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
    1057             :         }
    1058             : }
    1059             : 

Generated by: LCOV version 1.12