LCOV - code coverage report
Current view: top level - src/elpa2/kernels - complex_avx-avx2_1hv_template.c (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53
                     Hit    Total    Coverage
Lines:               226      413      54.7 %
Functions:             2        2     100.0 %

          Line data    Source code
       1             : //    This file is part of ELPA.
       2             : //
       3             : //    The ELPA library was originally created by the ELPA consortium,
       4             : //    consisting of the following organizations:
       5             : //
       6             : //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : //      Informatik,
      10             : //    - Technische Universität München, Lehrstuhl für Informatik mit
      11             : //      Schwerpunkt Wissenschaftliches Rechnen,
      12             : //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
      13             : //    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
      14             : //      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      15             : //      and
      16             : //    - IBM Deutschland GmbH
      17             : //
      18             : //    This particular source code file contains additions, changes and
      19             : //    enhancements authored by Intel Corporation, which is not part of
      20             : //    the ELPA consortium.
      21             : //
      22             : //    More information can be found here:
      23             : //    http://elpa.mpcdf.mpg.de/
      24             : //
      25             : //    ELPA is free software: you can redistribute it and/or modify
      26             : //    it under the terms of the version 3 of the license of the
      27             : //    GNU Lesser General Public License as published by the Free
      28             : //    Software Foundation.
      29             : //
      30             : //    ELPA is distributed in the hope that it will be useful,
      31             : //    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : //    GNU Lesser General Public License for more details.
      34             : //
      35             : //    You should have received a copy of the GNU Lesser General Public License
      36             : //    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : //
      38             : //    ELPA reflects a substantial effort on the part of the original
      39             : //    ELPA consortium, and we ask you to respect the spirit of the
      40             : //    license that we chose: i.e., please contribute any changes you
      41             : //    may have back to the original ELPA library distribution, and keep
      42             : //    any derivatives of ELPA under the same license that we chose for
      43             : //    the original distribution, the GNU Lesser General Public License.
      44             : //
      45             : //
      46             : // --------------------------------------------------------------------------------------------------
      47             : //
      48             : // This file contains the compute intensive kernels for the Householder transformations.
      49             : // It should be compiled with the highest possible optimization level.
      50             : //
      51             : // On Intel Nehalem, Intel Westmere, or AMD Magny-Cours use -O3 -msse3
      52             : // On Intel Sandy Bridge use -O3 -mavx
      53             : //
      54             : // Copyright of the original code rests with the authors inside the ELPA
      55             : // consortium. The copyright of any additional modifications shall rest
      56             : // with their original authors, but shall adhere to the licensing terms
      57             : // distributed along with the original code in the file "COPYING".
      58             : //
      59             : // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
      60             : // Adapted for building a shared library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
      61             : // --------------------------------------------------------------------------------------------------
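
// An illustrative sketch, not part of the ELPA sources: the compiler flags named in the
// header comment above (-msse3, -mavx, -mavx2) set predefined feature macros, which is how
// a translation unit can check at compile time which instruction set it was built for.
// The function name print_simd_build_flags is a placeholder chosen here for illustration.

#include <stdio.h>

void print_simd_build_flags(void)
{
#if defined(__AVX2__)
        puts("built with AVX2 (-mavx2)");
#elif defined(__AVX__)
        puts("built with AVX (-mavx)");
#elif defined(__SSE3__)
        puts("built with SSE3 (-msse3)");
#else
        puts("no SSE3/AVX/AVX2 compiler flags detected");
#endif
}
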
      62             : #include "config-f90.h"
      63             : 
      64             : #include <complex.h>
      65             : #include <x86intrin.h>
      66             : #include <stdio.h>
      67             : #include <stdlib.h>
      68             : 
      69             : #define __forceinline __attribute__((always_inline))
      70             : 
      71             : #ifdef DOUBLE_PRECISION_COMPLEX
      72             : #define offset 4
      73             : #define __AVX_DATATYPE __m256d
      74             : #define _AVX_LOAD _mm256_load_pd
      75             : #define _AVX_STORE _mm256_store_pd
      76             : #define _AVX_ADD _mm256_add_pd
      77             : #define _AVX_MUL _mm256_mul_pd
      78             : #define _AVX_ADDSUB _mm256_addsub_pd
      79             : #define _AVX_XOR _mm256_xor_pd
      80             : #define _AVX_BROADCAST _mm256_broadcast_sd
      81             : #define _AVX_SHUFFLE _mm256_shuffle_pd
      82             : #define _SHUFFLE 0x5
      83             : 
      84             : #ifdef HAVE_AVX2
      85             : 
      86             : #ifdef __FMA4__
      87             : #define __ELPA_USE_FMA__
      88             : #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
      89             : #define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
      90             : #endif
      91             : 
      92             : #ifdef __AVX2__
      93             : #define __ELPA_USE_FMA__
      94             : #define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
      95             : #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
      96             : #endif
      97             : 
      98             : #endif
      99             : 
     100             : #define _AVX_FMADDSUB _mm256_FMADDSUB_pd
     101             : #define _AVX_FMSUBADD _mm256_FMSUBADD_pd
     102             : 
     103             : #endif /* DOUBLE_PRECISION_COMPLEX */
     104             : 
     105             : #ifdef SINGLE_PRECISION_COMPLEX
     106             : #define offset 8
     107             : #define __AVX_DATATYPE __m256
     108             : #define _AVX_LOAD _mm256_load_ps
     109             : #define _AVX_STORE _mm256_store_ps
     110             : #define _AVX_ADD _mm256_add_ps
     111             : #define _AVX_MUL _mm256_mul_ps
     112             : #define _AVX_ADDSUB _mm256_addsub_ps
     113             : #define _AVX_XOR _mm256_xor_ps
     114             : #define _AVX_BROADCAST _mm256_broadcast_ss
     115             : #define _AVX_SHUFFLE _mm256_shuffle_ps
     116             : #define _SHUFFLE 0xb1
     117             : 
     118             : #ifdef HAVE_AVX2
     119             : 
     120             : #ifdef __FMA4__
     121             : #define __ELPA_USE_FMA__
     122             : #define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
     123             : #define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
     124             : #endif
     125             : 
     126             : #ifdef __AVX2__
     127             : #define __ELPA_USE_FMA__
     128             : #define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
     129             : #define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
     130             : #endif
     131             : 
     132             : #endif
     133             : 
     134             : #define _AVX_FMADDSUB _mm256_FMADDSUB_ps
     135             : #define _AVX_FMSUBADD _mm256_FMSUBADD_ps
     136             : #endif /* SINGLE_PRECISION_COMPLEX */
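
// An illustrative scalar sketch, not part of the ELPA sources: per packed complex element,
// the _AVX_SHUFFLE of h_imag*q followed by _AVX_FMSUBADD / _AVX_FMADDSUB (or _AVX_ADDSUB on
// the non-FMA path, after negating h_imag) evaluates the two complex products used in the
// kernels below.  The helper names cmul and cmul_conj are placeholders for illustration.

#include <complex.h>

/* h * x, as in the update loops that add the reflected vector back onto q */
static double complex cmul(double complex h, double complex x)
{
        double re = creal(h) * creal(x) - cimag(h) * cimag(x);
        double im = creal(h) * cimag(x) + cimag(h) * creal(x);
        return re + im * I;
}

/* conj(h) * q, as in the accumulation loops that build up x */
static double complex cmul_conj(double complex h, double complex q)
{
        double re = creal(h) * creal(q) + cimag(h) * cimag(q);
        double im = creal(h) * cimag(q) - cimag(h) * creal(q);
        return re + im * I;
}
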
     137             : 
     138             : #ifdef DOUBLE_PRECISION_COMPLEX
     139             : //Forward declaration
     140             : static  __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     141             : static  __forceinline void hh_trafo_complex_kernel_10_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     142             : static  __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     143             : static  __forceinline void hh_trafo_complex_kernel_6_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     144             : static  __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     145             : static  __forceinline void hh_trafo_complex_kernel_2_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
     146             : #endif
     147             : #ifdef SINGLE_PRECISION_COMPLEX
     148             : //Forward declaration
     149             : static  __forceinline void hh_trafo_complex_kernel_24_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     150             : static  __forceinline void hh_trafo_complex_kernel_20_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     151             : static  __forceinline void hh_trafo_complex_kernel_16_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     152             : static  __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     153             : static  __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     154             : static  __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
     155             : #endif
     156             : 
     157             : #ifdef DOUBLE_PRECISION_COMPLEX
     158             : /*
     159             : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
     160             : !f> interface
     161             : !f>   subroutine single_hh_trafo_complex_avx_avx2_1hv_double(q, hh, pnb, pnq, pldq) &
     162             : !f>                             bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_double")
     163             : !f>     use, intrinsic :: iso_c_binding
     164             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq
     165             : !f>     ! complex(kind=c_double_complex)     :: q(*)
     166             : !f>     type(c_ptr), value                   :: q
     167             : !f>     complex(kind=c_double_complex)       :: hh(pnb,2)
     168             : !f>   end subroutine
     169             : !f> end interface
     170             : !f>#endif
     171             : */
     172             : #endif
     173             : #ifdef SINGLE_PRECISION_COMPLEX
     174             : /*
     175             : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
     176             : !f> interface
     177             : !f>   subroutine single_hh_trafo_complex_avx_avx2_1hv_single(q, hh, pnb, pnq, pldq) &
     178             : !f>                             bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_single")
     179             : !f>     use, intrinsic :: iso_c_binding
     180             : !f>     integer(kind=c_int)     :: pnb, pnq, pldq
     181             : !f>     ! complex(kind=c_float_complex)   :: q(*)
     182             : !f>     type(c_ptr), value              :: q
     183             : !f>     complex(kind=c_float_complex)   :: hh(pnb,2)
     184             : !f>   end subroutine
     185             : !f> end interface
     186             : !f>#endif
     187             : */
     188             : #endif
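
// A hypothetical caller sketch, not taken from ELPA's own tests, assuming the
// double-precision instantiation of this template has been compiled: it allocates a
// 32-byte aligned panel q of ldq rows and nb columns (the kernels use aligned AVX loads
// and stores) and a Householder array hh of size nb*2, then calls the entry point
// declared in the interface above.  All variable names and sizes are illustrative; note
// that the routine takes the number of rows to transform from *pldq.

#include <complex.h>
#include <stdlib.h>

void single_hh_trafo_complex_avx_avx2_1hv_double(double complex* q, double complex* hh,
                                                 int* pnb, int* pnq, int* pldq);

int main(void)
{
        int nb = 16, nq = 24, ldq = 24;   /* 24 rows: two chunks of 12 for the double kernel */
        double complex *q  = aligned_alloc(32, (size_t)ldq * nb * sizeof(double complex));
        double complex *hh = aligned_alloc(32, (size_t)nb * 2 * sizeof(double complex));
        if (q == NULL || hh == NULL) return 1;

        for (int k = 0; k < ldq * nb; k++) q[k] = 1.0 + 0.0*I;   /* placeholder panel data */
        for (int k = 0; k < nb * 2;  k++) hh[k] = 0.0;
        hh[0] = 0.5;   /* hh[0] plays the role of tau: the kernel scales by -hh[0] */
        /* hh[1..nb-1]: Householder vector entries (placeholder zeros here) */

        single_hh_trafo_complex_avx_avx2_1hv_double(q, hh, &nb, &nq, &ldq);

        free(q);
        free(hh);
        return 0;
}
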
     189             : 
     190             : #ifdef DOUBLE_PRECISION_COMPLEX
     191    72951808 : void single_hh_trafo_complex_avx_avx2_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
     192             : #endif
     193             : #ifdef SINGLE_PRECISION_COMPLEX
     194    25121792 : void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
     195             : #endif
     196             : {
     197             :         int i;
     198    98073600 :         int nb = *pnb;
     199    98073600 :         int nq = *pldq;
     200    98073600 :         int ldq = *pldq;
     201             :         //int ldh = *pldh;
     202             :         int worked_on;
     203             : 
     204    98073600 :         worked_on = 0;
     205             : 
     206             : #ifdef DOUBLE_PRECISION_COMPLEX
     207   359931904 :         for (i = 0; i < nq-10; i+=12)
     208             :         {
     209   286980096 :                 hh_trafo_complex_kernel_12_AVX_1hv_double(&q[i], hh, nb, ldq);
     210   286980096 :                 worked_on += 12;
     211             :         }
     212             : #endif
     213             : #ifdef SINGLE_PRECISION_COMPLEX
     214    72951808 :         for (i = 0; i < nq-20; i+=24)
     215             :         {
     216    47830016 :                 hh_trafo_complex_kernel_24_AVX_1hv_single(&q[i], hh, nb, ldq);
     217    47830016 :                 worked_on += 24;
     218             :         }
     219             : #endif
     220    98073600 :         if (nq == i)
     221             :         {
     222    90832896 :                 return;
     223             :         }
     224             : #ifdef DOUBLE_PRECISION_COMPLEX
     225     4827136 :         if (nq-i == 10)
     226             :         {
     227           0 :                 hh_trafo_complex_kernel_10_AVX_1hv_double(&q[i], hh, nb, ldq);
     228           0 :                 worked_on += 10;
     229             :         }
     230             : #endif
     231             : #ifdef SINGLE_PRECISION_COMPLEX
     232     2413568 :         if (nq-i == 20)
     233             :         {
     234           0 :                 hh_trafo_complex_kernel_20_AVX_1hv_single(&q[i], hh, nb, ldq);
     235           0 :                 worked_on += 20;
     236             :         }
     237             : #endif
     238             : 
     239             : #ifdef DOUBLE_PRECISION_COMPLEX
     240     4827136 :         if (nq-i == 8)
     241             :         {
     242           0 :                 hh_trafo_complex_kernel_8_AVX_1hv_double(&q[i], hh, nb, ldq);
     243           0 :                 worked_on += 8;
     244             :         }
     245             : #endif
     246             : #ifdef SINGLE_PRECISION_COMPLEX
     247     2413568 :         if (nq-i == 16)
     248             :         {
     249     2413568 :                 hh_trafo_complex_kernel_16_AVX_1hv_single(&q[i], hh, nb, ldq);
     250     2413568 :                 worked_on += 16;
     251             :         }
     252             : #endif
     253             : 
     254             : #ifdef DOUBLE_PRECISION_COMPLEX
     255     4827136 :         if (nq-i == 6)
     256             :         {
     257           0 :                 hh_trafo_complex_kernel_6_AVX_1hv_double(&q[i], hh, nb, ldq);
     258           0 :                 worked_on += 6;
     259             :         }
     260             : #endif
     261             : #ifdef SINGLE_PRECISION_COMPLEX
     262     2413568 :         if (nq-i == 12)
     263             :         {
     264           0 :                 hh_trafo_complex_kernel_12_AVX_1hv_single(&q[i], hh, nb, ldq);
     265           0 :                 worked_on += 12;
     266             :         }
     267             : #endif
     268             : #ifdef DOUBLE_PRECISION_COMPLEX
     269     4827136 :        if (nq-i == 4)
     270             :        {
     271     4827136 :                 hh_trafo_complex_kernel_4_AVX_1hv_double(&q[i], hh, nb, ldq);
     272     4827136 :                 worked_on += 4;
     273             :        }
     274             : #endif
     275             : #ifdef SINGLE_PRECISION_COMPLEX
     276     2413568 :        if (nq-i == 8)
     277             :        {
     278           0 :                 hh_trafo_complex_kernel_8_AVX_1hv_single(&q[i], hh, nb, ldq);
     279           0 :                 worked_on += 8;
     280             :        }
     281             : #endif
     282             : 
     283             : #ifdef DOUBLE_PRECISION_COMPLEX
     284     4827136 :        if (nq-i == 2)
     285             :        {
     286           0 :                 hh_trafo_complex_kernel_2_AVX_1hv_double(&q[i], hh, nb, ldq);
     287           0 :                 worked_on += 2;
     288             :        }
     289             : #endif
     290             : #ifdef SINGLE_PRECISION_COMPLEX
     291     2413568 :        if (nq-i == 4)
     292             :        {
     293           0 :                 hh_trafo_complex_kernel_4_AVX_1hv_single(&q[i], hh, nb, ldq);
     294           0 :                 worked_on += 4;
     295             :        }
     296             : #endif
     297             : #ifdef WITH_DEBUG
     298             :        if (worked_on != nq) {
     299             :          printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
     300             :          abort();
     301             :         }
     302             : #endif
     303             : }
     304             : 
     305             : 
     306             : #ifdef DOUBLE_PRECISION_COMPLEX
     307             : static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     308             : #endif
     309             : #ifdef SINGLE_PRECISION_COMPLEX
     310             : static __forceinline void hh_trafo_complex_kernel_24_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     311             : #endif
     312             : {
     313             : #ifdef DOUBLE_PRECISION_COMPLEX
     314   286980096 :         double* q_dbl = (double*)q;
     315   286980096 :         double* hh_dbl = (double*)hh;
     316             : #endif
     317             : #ifdef SINGLE_PRECISION_COMPLEX
     318    47830016 :         float* q_dbl = (float*)q;
     319    47830016 :         float* hh_dbl = (float*)hh;
     320             : #endif
     321             :         __AVX_DATATYPE x1, x2, x3, x4, x5, x6;
     322             :         __AVX_DATATYPE q1, q2, q3, q4, q5, q6;
     323             :         __AVX_DATATYPE h1_real, h1_imag;
     324             :         __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
     325   334810112 :         int i=0;
     326             : 
     327             : #ifdef DOUBLE_PRECISION_COMPLEX
     328   286980096 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
     329             : #endif
     330             : #ifdef SINGLE_PRECISION_COMPLEX
     331    47830016 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
     332             : #endif
     333             : 
     334   334810112 :         x1 = _AVX_LOAD(&q_dbl[0]);
     335   669620224 :         x2 = _AVX_LOAD(&q_dbl[offset]);
     336   669620224 :         x3 = _AVX_LOAD(&q_dbl[2*offset]);
     337   669620224 :         x4 = _AVX_LOAD(&q_dbl[3*offset]);
     338   669620224 :         x5 = _AVX_LOAD(&q_dbl[4*offset]);
     339   669620224 :         x6 = _AVX_LOAD(&q_dbl[5*offset]);
     340 10691559424 :         for (i = 1; i < nb; i++)
     341             :         {
     342 20713498624 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
     343 20713498624 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
     344             : #ifndef __ELPA_USE_FMA__
     345             :                 // conjugate
     346             :                 h1_imag = _AVX_XOR(h1_imag, sign);
     347             : #endif
     348             : 
     349 20713498624 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
     350 20713498624 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
     351 20713498624 :                 q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     352 20713498624 :                 q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     353 20713498624 :                 q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
     354 20713498624 :                 q6 = _AVX_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
     355             : 
     356 10356749312 :                 tmp1 = _AVX_MUL(h1_imag, q1);
     357             : #ifdef __ELPA_USE_FMA__
     358 31070247936 :                 x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     359             : #else
     360             :                 x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     361             : #endif
     362 10356749312 :                 tmp2 = _AVX_MUL(h1_imag, q2);
     363             : #ifdef __ELPA_USE_FMA__
     364 31070247936 :                 x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     365             : #else
     366             :                 x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     367             : #endif
     368 10356749312 :                 tmp3 = _AVX_MUL(h1_imag, q3);
     369             : #ifdef __ELPA_USE_FMA__
     370 31070247936 :                 x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     371             : #else
     372             :                 x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     373             : #endif
     374             : 
     375 10356749312 :                 tmp4 = _AVX_MUL(h1_imag, q4);
     376             : #ifdef __ELPA_USE_FMA__
     377 31070247936 :                 x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     378             : #else
     379             :                 x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     380             : #endif
     381 10356749312 :                 tmp5 = _AVX_MUL(h1_imag, q5);
     382             : #ifdef __ELPA_USE_FMA__
     383 31070247936 :                 x5 = _AVX_ADD(x5, _AVX_FMSUBADD(h1_real, q5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     384             : #else
     385             :                 x5 = _AVX_ADD(x5, _AVX_ADDSUB( _AVX_MUL(h1_real, q5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     386             : #endif
     387 10356749312 :                 tmp6 = _AVX_MUL(h1_imag, q6);
     388             : #ifdef __ELPA_USE_FMA__
     389 31070247936 :                 x6 = _AVX_ADD(x6, _AVX_FMSUBADD(h1_real, q6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
     390             : #else
     391             :                 x6 = _AVX_ADD(x6, _AVX_ADDSUB( _AVX_MUL(h1_real, q6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
     392             : #endif
     393             :         }
     394             : 
     395   334810112 :         h1_real = _AVX_BROADCAST(&hh_dbl[0]);
     396   669620224 :         h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
     397   334810112 :         h1_real = _AVX_XOR(h1_real, sign);
     398   334810112 :         h1_imag = _AVX_XOR(h1_imag, sign);
     399             : 
     400   334810112 :         tmp1 = _AVX_MUL(h1_imag, x1);
     401             : #ifdef __ELPA_USE_FMA__
     402   669620224 :         x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     403             : #else
     404             :         x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     405             : #endif
     406   334810112 :         tmp2 = _AVX_MUL(h1_imag, x2);
     407             : #ifdef __ELPA_USE_FMA__
     408   669620224 :         x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     409             : #else
     410             :         x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     411             : #endif
     412   334810112 :         tmp3 = _AVX_MUL(h1_imag, x3);
     413             : #ifdef __ELPA_USE_FMA__
     414   669620224 :         x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     415             : #else
     416             :         x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     417             : #endif
     418             : 
     419   334810112 :         tmp4 = _AVX_MUL(h1_imag, x4);
     420             : #ifdef __ELPA_USE_FMA__
     421   669620224 :         x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     422             : #else
     423             :         x4 = _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     424             : #endif
     425   334810112 :         tmp5 = _AVX_MUL(h1_imag, x5);
     426             : #ifdef __ELPA_USE_FMA__
     427   669620224 :         x5 = _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
     428             : #else
     429             :         x5 = _AVX_ADDSUB( _AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
     430             : #endif
     431   334810112 :         tmp6 = _AVX_MUL(h1_imag, x6);
     432             : #ifdef __ELPA_USE_FMA__
     433   669620224 :         x6 = _AVX_FMADDSUB(h1_real, x6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE));
     434             : #else
     435             :         x6 = _AVX_ADDSUB( _AVX_MUL(h1_real, x6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE));
     436             : #endif
     437             : 
     438   334810112 :         q1 = _AVX_LOAD(&q_dbl[0]);
     439   669620224 :         q2 = _AVX_LOAD(&q_dbl[offset]);
     440   669620224 :         q3 = _AVX_LOAD(&q_dbl[2*offset]);
     441   669620224 :         q4 = _AVX_LOAD(&q_dbl[3*offset]);
     442   669620224 :         q5 = _AVX_LOAD(&q_dbl[4*offset]);
     443   669620224 :         q6 = _AVX_LOAD(&q_dbl[5*offset]);
     444             : 
     445   334810112 :         q1 = _AVX_ADD(q1, x1);
     446   334810112 :         q2 = _AVX_ADD(q2, x2);
     447   334810112 :         q3 = _AVX_ADD(q3, x3);
     448   334810112 :         q4 = _AVX_ADD(q4, x4);
     449   334810112 :         q5 = _AVX_ADD(q5, x5);
     450   334810112 :         q6 = _AVX_ADD(q6, x6);
     451             : 
     452             :         _AVX_STORE(&q_dbl[0], q1);
     453   334810112 :         _AVX_STORE(&q_dbl[offset], q2);
     454   334810112 :         _AVX_STORE(&q_dbl[2*offset], q3);
     455   334810112 :         _AVX_STORE(&q_dbl[3*offset], q4);
     456   334810112 :         _AVX_STORE(&q_dbl[4*offset], q5);
     457   334810112 :         _AVX_STORE(&q_dbl[5*offset], q6);
     458             : 
     459 10691559424 :         for (i = 1; i < nb; i++)
     460             :         {
     461 20713498624 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
     462 20713498624 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
     463             : 
     464 20713498624 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
     465 20713498624 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
     466 20713498624 :                 q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     467 20713498624 :                 q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     468 20713498624 :                 q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
     469 20713498624 :                 q6 = _AVX_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
     470             : 
     471 10356749312 :                 tmp1 = _AVX_MUL(h1_imag, x1);
     472             : #ifdef __ELPA_USE_FMA__
     473 31070247936 :                 q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     474             : #else
     475             :                 q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     476             : #endif
     477 10356749312 :                 tmp2 = _AVX_MUL(h1_imag, x2);
     478             : #ifdef __ELPA_USE_FMA__
     479 31070247936 :                 q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     480             : #else
     481             :                 q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     482             : #endif
     483 10356749312 :                 tmp3 = _AVX_MUL(h1_imag, x3);
     484             : #ifdef __ELPA_USE_FMA__
     485 31070247936 :                 q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     486             : #else
     487             :                 q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     488             : #endif
     489             : 
     490 10356749312 :                 tmp4 = _AVX_MUL(h1_imag, x4);
     491             : #ifdef __ELPA_USE_FMA__
     492 31070247936 :                 q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     493             : #else
     494             :                 q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     495             : #endif
     496 10356749312 :                 tmp5 = _AVX_MUL(h1_imag, x5);
     497             : #ifdef __ELPA_USE_FMA__
     498 31070247936 :                 q5 = _AVX_ADD(q5, _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     499             : #else
     500             :                 q5 = _AVX_ADD(q5, _AVX_ADDSUB( _AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     501             : #endif
     502 10356749312 :                 tmp6 = _AVX_MUL(h1_imag, x6);
     503             : #ifdef __ELPA_USE_FMA__
     504 31070247936 :                 q6 = _AVX_ADD(q6, _AVX_FMADDSUB(h1_real, x6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
     505             : #else
     506             :                 q6 = _AVX_ADD(q6, _AVX_ADDSUB( _AVX_MUL(h1_real, x6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
     507             : #endif
     508             : 
     509 10356749312 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
     510 10356749312 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     511 10356749312 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
     512 10356749312 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
     513 10356749312 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
     514 10356749312 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
     515             :         }
     516             : }
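
// An illustrative scalar reference, not part of the ELPA sources, of the update the
// vectorized kernel above performs: with v = (1, hh[1], ..., hh[nb-1]) and tau = hh[0],
// each row j of the nrows x nb panel (leading dimension ldq, counted in complex elements)
// is updated as q(j,:) += x * v, where x = -tau * sum_i conj(v[i]) * q(j,i).  The function
// and parameter names below are placeholders.

#include <complex.h>

static void hh_trafo_complex_scalar_1hv_double(double complex *q, const double complex *hh,
                                               int nb, int ldq, int nrows)
{
        for (int j = 0; j < nrows; j++)
        {
                /* first pass: accumulate x = sum_i conj(v[i]) * q(j,i), with implicit v[0] = 1 */
                double complex x = q[j];
                for (int i = 1; i < nb; i++)
                        x += conj(hh[i]) * q[j + (size_t)i * ldq];

                /* scale by -tau = -hh[0] */
                x *= -hh[0];

                /* second pass: add the rank-one update x * v back onto the row */
                q[j] += x;
                for (int i = 1; i < nb; i++)
                        q[j + (size_t)i * ldq] += hh[i] * x;
        }
}
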
     517             : 
     518             : #ifdef DOUBLE_PRECISION_COMPLEX
     519             : static __forceinline void hh_trafo_complex_kernel_10_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     520             : #endif
     521             : #ifdef SINGLE_PRECISION_COMPLEX
     522             : static __forceinline void hh_trafo_complex_kernel_20_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     523             : #endif
     524             : {
     525             : #ifdef DOUBLE_PRECISION_COMPLEX
     526           0 :         double* q_dbl = (double*)q;
     527           0 :         double* hh_dbl = (double*)hh;
     528             : #endif
     529             : #ifdef SINGLE_PRECISION_COMPLEX
     530           0 :         float* q_dbl = (float*)q;
     531           0 :         float* hh_dbl = (float*)hh;
     532             : #endif
     533             :         __AVX_DATATYPE x1, x2, x3, x4, x5, x6;
     534             :         __AVX_DATATYPE q1, q2, q3, q4, q5, q6;
     535             :         __AVX_DATATYPE h1_real, h1_imag;
     536             :         __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
     537           0 :         int i=0;
     538             : 
     539             : #ifdef DOUBLE_PRECISION_COMPLEX
     540           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
     541             : #endif
     542             : #ifdef SINGLE_PRECISION_COMPLEX
     543           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
     544             : #endif
     545             : 
     546           0 :         x1 = _AVX_LOAD(&q_dbl[0]);
     547           0 :         x2 = _AVX_LOAD(&q_dbl[offset]);
     548           0 :         x3 = _AVX_LOAD(&q_dbl[2*offset]);
     549           0 :         x4 = _AVX_LOAD(&q_dbl[3*offset]);
     550           0 :         x5 = _AVX_LOAD(&q_dbl[4*offset]);
     551           0 :         for (i = 1; i < nb; i++)
     552             :         {
     553           0 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
     554           0 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
     555             : #ifndef __ELPA_USE_FMA__
     556             :                 // conjugate
     557             :                 h1_imag = _AVX_XOR(h1_imag, sign);
     558             : #endif
     559             : 
     560           0 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
     561           0 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
     562           0 :                 q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     563           0 :                 q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     564           0 :                 q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
     565             : 
     566           0 :                 tmp1 = _AVX_MUL(h1_imag, q1);
     567             : #ifdef __ELPA_USE_FMA__
     568           0 :                 x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     569             : #else
     570             :                 x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     571             : #endif
     572           0 :                 tmp2 = _AVX_MUL(h1_imag, q2);
     573             : #ifdef __ELPA_USE_FMA__
     574           0 :                 x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     575             : #else
     576             :                 x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     577             : #endif
     578           0 :                 tmp3 = _AVX_MUL(h1_imag, q3);
     579             : #ifdef __ELPA_USE_FMA__
     580           0 :                 x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     581             : #else
     582             :                 x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     583             : #endif
     584             : 
     585           0 :                 tmp4 = _AVX_MUL(h1_imag, q4);
     586             : #ifdef __ELPA_USE_FMA__
     587           0 :                 x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     588             : #else
     589             :                 x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     590             : #endif
     591           0 :                 tmp5 = _AVX_MUL(h1_imag, q5);
     592             : #ifdef __ELPA_USE_FMA__
     593           0 :                 x5 = _AVX_ADD(x5, _AVX_FMSUBADD(h1_real, q5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     594             : #else
     595             :                 x5 = _AVX_ADD(x5, _AVX_ADDSUB( _AVX_MUL(h1_real, q5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     596             : #endif
     597             :         }
     598             : 
     599           0 :         h1_real = _AVX_BROADCAST(&hh_dbl[0]);
     600           0 :         h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
     601           0 :         h1_real = _AVX_XOR(h1_real, sign);
     602           0 :         h1_imag = _AVX_XOR(h1_imag, sign);
     603             : 
     604           0 :         tmp1 = _AVX_MUL(h1_imag, x1);
     605             : #ifdef __ELPA_USE_FMA__
     606           0 :         x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     607             : #else
     608             :         x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     609             : #endif
     610           0 :         tmp2 = _AVX_MUL(h1_imag, x2);
     611             : #ifdef __ELPA_USE_FMA__
     612           0 :         x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     613             : #else
     614             :         x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     615             : #endif
     616           0 :         tmp3 = _AVX_MUL(h1_imag, x3);
     617             : #ifdef __ELPA_USE_FMA__
     618           0 :         x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     619             : #else
     620             :         x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     621             : #endif
     622             : 
     623           0 :         tmp4 = _AVX_MUL(h1_imag, x4);
     624             : #ifdef __ELPA_USE_FMA__
     625           0 :         x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     626             : #else
     627             :         x4 = _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     628             : #endif
     629           0 :         tmp5 = _AVX_MUL(h1_imag, x5);
     630             : #ifdef __ELPA_USE_FMA__
     631           0 :         x5 = _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
     632             : #else
     633             :         x5 = _AVX_ADDSUB( _AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
     634             : #endif
     635             : 
     636           0 :         q1 = _AVX_LOAD(&q_dbl[0]);
     637           0 :         q2 = _AVX_LOAD(&q_dbl[offset]);
     638           0 :         q3 = _AVX_LOAD(&q_dbl[2*offset]);
     639           0 :         q4 = _AVX_LOAD(&q_dbl[3*offset]);
     640           0 :         q5 = _AVX_LOAD(&q_dbl[4*offset]);
     641             : 
     642           0 :         q1 = _AVX_ADD(q1, x1);
     643           0 :         q2 = _AVX_ADD(q2, x2);
     644           0 :         q3 = _AVX_ADD(q3, x3);
     645           0 :         q4 = _AVX_ADD(q4, x4);
     646           0 :         q5 = _AVX_ADD(q5, x5);
     647             : 
     648             :         _AVX_STORE(&q_dbl[0], q1);
     649           0 :         _AVX_STORE(&q_dbl[offset], q2);
     650           0 :         _AVX_STORE(&q_dbl[2*offset], q3);
     651           0 :         _AVX_STORE(&q_dbl[3*offset], q4);
     652           0 :         _AVX_STORE(&q_dbl[4*offset], q5);
     653             : 
     654           0 :         for (i = 1; i < nb; i++)
     655             :         {
     656           0 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
     657           0 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
     658             : 
     659           0 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
     660           0 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
     661           0 :                 q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     662           0 :                 q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     663           0 :                 q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
     664             : 
     665           0 :                 tmp1 = _AVX_MUL(h1_imag, x1);
     666             : #ifdef __ELPA_USE_FMA__
     667           0 :                 q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     668             : #else
     669             :                 q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     670             : #endif
     671           0 :                 tmp2 = _AVX_MUL(h1_imag, x2);
     672             : #ifdef __ELPA_USE_FMA__
     673           0 :                 q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     674             : #else
     675             :                 q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     676             : #endif
     677           0 :                 tmp3 = _AVX_MUL(h1_imag, x3);
     678             : #ifdef __ELPA_USE_FMA__
     679           0 :                 q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     680             : #else
     681             :                 q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     682             : #endif
     683             : 
     684           0 :                 tmp4 = _AVX_MUL(h1_imag, x4);
     685             : #ifdef __ELPA_USE_FMA__
     686           0 :                 q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     687             : #else
     688             :                 q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     689             : #endif
     690           0 :                 tmp5 = _AVX_MUL(h1_imag, x5);
     691             : #ifdef __ELPA_USE_FMA__
     692           0 :                 q5 = _AVX_ADD(q5, _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     693             : #else
     694             :                 q5 = _AVX_ADD(q5, _AVX_ADDSUB( _AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
     695             : #endif
     696             : 
     697           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
     698           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     699           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
     700           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
     701           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
     702             :         }
     703             : }
     704             : 
     705             : 
     706             : #ifdef DOUBLE_PRECISION_COMPLEX
     707             : static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     708             : #endif
     709             : #ifdef SINGLE_PRECISION_COMPLEX
     710             : static __forceinline void hh_trafo_complex_kernel_16_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     711             : #endif
     712             : {
     713             : 
     714             : #ifdef DOUBLE_PRECISION_COMPLEX
     715           0 :         double* q_dbl = (double*)q;
     716           0 :         double* hh_dbl = (double*)hh;
     717             : #endif
     718             : #ifdef SINGLE_PRECISION_COMPLEX
     719     2413568 :         float* q_dbl = (float*)q;
     720     2413568 :         float* hh_dbl = (float*)hh;
     721             : #endif
     722             :         __AVX_DATATYPE x1, x2, x3, x4;
     723             :         __AVX_DATATYPE q1, q2, q3, q4;
     724             :         __AVX_DATATYPE h1_real, h1_imag;
     725             :         __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
     726     2413568 :         int i=0;
     727             : 
     728             : #ifdef DOUBLE_PRECISION_COMPLEX
     729           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
     730             : #endif
     731             : #ifdef SINGLE_PRECISION_COMPLEX
     732     2413568 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
     733             : #endif
     734             : 
     735     2413568 :         x1 = _AVX_LOAD(&q_dbl[0]);
     736     4827136 :         x2 = _AVX_LOAD(&q_dbl[offset]);
     737     4827136 :         x3 = _AVX_LOAD(&q_dbl[2*offset]);
     738     4827136 :         x4 = _AVX_LOAD(&q_dbl[3*offset]);
     739             : 
     740    74039296 :         for (i = 1; i < nb; i++)
     741             :         {
     742   143251456 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
     743   143251456 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
     744             : #ifndef __ELPA_USE_FMA__
     745             :                 // conjugate
     746             :                 h1_imag = _AVX_XOR(h1_imag, sign);
     747             : #endif
     748             : 
     749             : 
     750   143251456 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
     751   143251456 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
     752   143251456 :                 q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     753   143251456 :                 q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     754             : 
     755    71625728 :                 tmp1 = _AVX_MUL(h1_imag, q1);
     756             : #ifdef __ELPA_USE_FMA__
     757   214877184 :                 x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     758             : #else
     759             :                 x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     760             : #endif
     761    71625728 :                 tmp2 = _AVX_MUL(h1_imag, q2);
     762             : #ifdef __ELPA_USE_FMA__
     763   214877184 :                 x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     764             : #else
     765             :                 x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     766             : #endif
     767             : 
     768    71625728 :                 tmp3 = _AVX_MUL(h1_imag, q3);
     769             : #ifdef __ELPA_USE_FMA__
     770   214877184 :                 x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     771             : #else
     772             :                 x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     773             : #endif
     774    71625728 :                 tmp4 = _AVX_MUL(h1_imag, q4);
     775             : #ifdef __ELPA_USE_FMA__
     776   214877184 :                 x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     777             : #else
     778             :                 x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     779             : #endif
     780             :         }
     781             : 
     782     2413568 :         h1_real = _AVX_BROADCAST(&hh_dbl[0]);
     783     4827136 :         h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
     784     2413568 :         h1_real = _AVX_XOR(h1_real, sign);
     785     2413568 :         h1_imag = _AVX_XOR(h1_imag, sign);
     786             : 
     787     2413568 :         tmp1 = _AVX_MUL(h1_imag, x1);
     788             : #ifdef __ELPA_USE_FMA__
     789     4827136 :         x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     790             : #else
     791             :         x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     792             : #endif
     793     2413568 :         tmp2 = _AVX_MUL(h1_imag, x2);
     794             : #ifdef __ELPA_USE_FMA__
     795     4827136 :         x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     796             : #else
     797             :         x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     798             : #endif
     799             : 
     800     2413568 :         tmp3 = _AVX_MUL(h1_imag, x3);
     801             : #ifdef __ELPA_USE_FMA__
     802     4827136 :         x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     803             : #else
     804             :         x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     805             : #endif
     806     2413568 :         tmp4 = _AVX_MUL(h1_imag, x4);
     807             : #ifdef __ELPA_USE_FMA__
     808     4827136 :         x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     809             : #else
     810             :         x4 = _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
     811             : #endif
     812             : 
     813     2413568 :         q1 = _AVX_LOAD(&q_dbl[0]);
     814     4827136 :         q2 = _AVX_LOAD(&q_dbl[offset]);
     815     4827136 :         q3 = _AVX_LOAD(&q_dbl[2*offset]);
     816     4827136 :         q4 = _AVX_LOAD(&q_dbl[3*offset]);
     817             : 
     818     2413568 :         q1 = _AVX_ADD(q1, x1);
     819     2413568 :         q2 = _AVX_ADD(q2, x2);
     820     2413568 :         q3 = _AVX_ADD(q3, x3);
     821     2413568 :         q4 = _AVX_ADD(q4, x4);
     822             : 
     823             :         _AVX_STORE(&q_dbl[0], q1);
     824     2413568 :         _AVX_STORE(&q_dbl[offset], q2);
     825     2413568 :         _AVX_STORE(&q_dbl[2*offset], q3);
     826     2413568 :         _AVX_STORE(&q_dbl[3*offset], q4);
     827             : 
     828    74039296 :         for (i = 1; i < nb; i++)
     829             :         {
     830   143251456 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
     831   143251456 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
     832             : 
     833   143251456 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
     834   143251456 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
     835   143251456 :                 q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     836   143251456 :                 q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
     837             : 
     838    71625728 :                 tmp1 = _AVX_MUL(h1_imag, x1);
     839             : #ifdef __ELPA_USE_FMA__
     840   214877184 :                 q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     841             : #else
     842             :                 q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     843             : #endif
     844    71625728 :                 tmp2 = _AVX_MUL(h1_imag, x2);
     845             : #ifdef __ELPA_USE_FMA__
     846   214877184 :                 q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     847             : #else
     848             :                 q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     849             : #endif
     850             : 
     851    71625728 :                 tmp3 = _AVX_MUL(h1_imag, x3);
     852             : #ifdef __ELPA_USE_FMA__
     853   214877184 :                 q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     854             : #else
     855             :                 q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     856             : #endif
     857    71625728 :                 tmp4 = _AVX_MUL(h1_imag, x4);
     858             : #ifdef __ELPA_USE_FMA__
     859   214877184 :                 q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     860             : #else
     861             :                 q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
     862             : #endif
     863             : 
     864    71625728 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
     865    71625728 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
     866    71625728 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
     867    71625728 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
     868             :         }
     869             : }
     870             : 
     871             : #ifdef DOUBLE_PRECISION_COMPLEX
     872             : static __forceinline void hh_trafo_complex_kernel_6_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
     873             : #endif
     874             : #ifdef SINGLE_PRECISION_COMPLEX
     875             : static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
     876             : #endif
     877             : {
     878             : 
     879             : #ifdef DOUBLE_PRECISION_COMPLEX
     880           0 :         double* q_dbl = (double*)q;
     881           0 :         double* hh_dbl = (double*)hh;
     882             : #endif
     883             : #ifdef SINGLE_PRECISION_COMPLEX
     884           0 :         float* q_dbl = (float*)q;
     885           0 :         float* hh_dbl = (float*)hh;
     886             : #endif
     887             :         __AVX_DATATYPE x1, x2, x3, x4;
     888             :         __AVX_DATATYPE q1, q2, q3, q4;
     889             :         __AVX_DATATYPE h1_real, h1_imag;
     890             :         __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
     891           0 :         int i=0;
     892             : 
     893             : #ifdef DOUBLE_PRECISION_COMPLEX
     894           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
     895             : #endif
     896             : #ifdef SINGLE_PRECISION_COMPLEX
     897           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
     898             : #endif
     899             : 
     900           0 :         x1 = _AVX_LOAD(&q_dbl[0]);
     901           0 :         x2 = _AVX_LOAD(&q_dbl[offset]);
     902           0 :         x3 = _AVX_LOAD(&q_dbl[2*offset]);
     903             : 
     904           0 :         for (i = 1; i < nb; i++)
     905             :         {
     906           0 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
     907           0 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
     908             : #ifndef __ELPA_USE_FMA__
     909             :                 // conjugate
     910             :                 h1_imag = _AVX_XOR(h1_imag, sign);
     911             : #endif
     912             : 
     913             : 
     914           0 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
     915           0 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
     916           0 :                 q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     917             : 
     918           0 :                 tmp1 = _AVX_MUL(h1_imag, q1);
     919             : #ifdef __ELPA_USE_FMA__
     920           0 :                 x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     921             : #else
     922             :                 x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     923             : #endif
     924           0 :                 tmp2 = _AVX_MUL(h1_imag, q2);
     925             : #ifdef __ELPA_USE_FMA__
     926           0 :                 x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     927             : #else
     928             :                 x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     929             : #endif
     930             : 
     931           0 :                 tmp3 = _AVX_MUL(h1_imag, q3);
     932             : #ifdef __ELPA_USE_FMA__
     933           0 :                 x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     934             : #else
     935             :                 x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
     936             : #endif
     937             :         }
     938             : 
     939           0 :         h1_real = _AVX_BROADCAST(&hh_dbl[0]);
     940           0 :         h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
     941           0 :         h1_real = _AVX_XOR(h1_real, sign);
     942           0 :         h1_imag = _AVX_XOR(h1_imag, sign);
     943             : 
     944           0 :         tmp1 = _AVX_MUL(h1_imag, x1);
     945             : #ifdef __ELPA_USE_FMA__
     946           0 :         x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     947             : #else
     948             :         x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
     949             : #endif
     950           0 :         tmp2 = _AVX_MUL(h1_imag, x2);
     951             : #ifdef __ELPA_USE_FMA__
     952           0 :         x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     953             : #else
     954             :         x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
     955             : #endif
     956             : 
     957           0 :         tmp3 = _AVX_MUL(h1_imag, x3);
     958             : #ifdef __ELPA_USE_FMA__
     959           0 :         x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     960             : #else
     961             :         x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
     962             : #endif
     963             : 
     964           0 :         q1 = _AVX_LOAD(&q_dbl[0]);
     965           0 :         q2 = _AVX_LOAD(&q_dbl[offset]);
     966           0 :         q3 = _AVX_LOAD(&q_dbl[2*offset]);
     967             : 
     968           0 :         q1 = _AVX_ADD(q1, x1);
     969           0 :         q2 = _AVX_ADD(q2, x2);
     970           0 :         q3 = _AVX_ADD(q3, x3);
     971             : 
     972             :         _AVX_STORE(&q_dbl[0], q1);
     973           0 :         _AVX_STORE(&q_dbl[offset], q2);
     974           0 :         _AVX_STORE(&q_dbl[2*offset], q3);
     975             : 
     976           0 :         for (i = 1; i < nb; i++)
     977             :         {
     978           0 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
     979           0 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
     980             : 
     981           0 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
     982           0 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
     983           0 :                 q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
     984             : 
     985           0 :                 tmp1 = _AVX_MUL(h1_imag, x1);
     986             : #ifdef __ELPA_USE_FMA__
     987           0 :                 q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     988             : #else
     989             :                 q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
     990             : #endif
     991           0 :                 tmp2 = _AVX_MUL(h1_imag, x2);
     992             : #ifdef __ELPA_USE_FMA__
     993           0 :                 q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     994             : #else
     995             :                 q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
     996             : #endif
     997             : 
     998           0 :                 tmp3 = _AVX_MUL(h1_imag, x3);
     999             : #ifdef __ELPA_USE_FMA__
    1000           0 :                 q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
    1001             : #else
    1002             :                 q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
    1003             : #endif
    1004             : 
    1005           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    1006           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    1007           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
    1008             :         }
    1009             : }
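The core arithmetic idiom in the FMA branches is a complex multiply in two vector instructions: multiply the broadcast imaginary part of h into the interleaved (re, im) data, swap re and im within each complex pair with a shuffle, and combine with FMADDSUB (FMSUBADD gives the conjugated product used in the accumulation loops). The stand-alone sketch below reproduces the FMADDSUB case with plain AVX2/FMA intrinsics and checks it against C99 complex arithmetic; it assumes the file's _AVX_MUL/_AVX_SHUFFLE/_AVX_FMADDSUB macros expand to the corresponding _mm256_*_pd intrinsics and that _SHUFFLE is the in-lane re/im swap (0x5 for double precision). Compile with, e.g., gcc -O2 -mavx2 -mfma.

#include <immintrin.h>
#include <complex.h>
#include <stdio.h>

int main(void)
{
        double complex h = 1.5 - 0.25*I;
        double complex q[2] = { 2.0 + 1.0*I, -0.5 + 3.0*I };

        __m256d h_real = _mm256_set1_pd(creal(h));
        __m256d h_imag = _mm256_set1_pd(cimag(h));
        __m256d qv     = _mm256_loadu_pd((const double*)q);   /* re0 im0 re1 im1 */

        /* tmp = h_imag * q; swapping re/im inside each complex pair and feeding
         * it to fmaddsub yields (h_re*re - h_im*im, h_re*im + h_im*re) = h*q   */
        __m256d tmp = _mm256_mul_pd(h_imag, qv);
        __m256d res = _mm256_fmaddsub_pd(h_real, qv, _mm256_shuffle_pd(tmp, tmp, 0x5));

        double complex out[2];
        _mm256_storeu_pd((double*)out, res);

        for (int k = 0; k < 2; k++)                            /* compare with h*q */
                printf("%g%+gi  vs  %g%+gi\n",
                       creal(out[k]), cimag(out[k]),
                       creal(h*q[k]), cimag(h*q[k]));
        return 0;
}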
    1010             : 
    1011             : #ifdef DOUBLE_PRECISION_COMPLEX
    1012             : static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
    1013             : #endif
    1014             : #ifdef SINGLE_PRECISION_COMPLEX
    1015             : static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
    1016             : #endif
    1017             : {
    1018             : 
    1019             : #ifdef DOUBLE_PRECISION_COMPLEX
    1020     4827136 :         double* q_dbl = (double*)q;
    1021     4827136 :         double* hh_dbl = (double*)hh;
    1022             : #endif
    1023             : #ifdef SINGLE_PRECISION_COMPLEX
    1024           0 :         float* q_dbl = (float*)q;
    1025           0 :         float* hh_dbl = (float*)hh;
    1026             : #endif
    1027             :         __AVX_DATATYPE x1, x2;
    1028             :         __AVX_DATATYPE q1, q2;
    1029             :         __AVX_DATATYPE h1_real, h1_imag;
    1030             :         __AVX_DATATYPE tmp1, tmp2;
    1031     4827136 :         int i=0;
    1032             : 
    1033             : #ifdef DOUBLE_PRECISION_COMPLEX
    1034     4827136 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
    1035             : #endif
    1036             : #ifdef SINGLE_PRECISION_COMPLEX
    1037           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
    1038             : #endif
    1039             : 
    1040     4827136 :         x1 = _AVX_LOAD(&q_dbl[0]);
    1041     9654272 :         x2 = _AVX_LOAD(&q_dbl[offset]);
    1042   148078592 :         for (i = 1; i < nb; i++)
    1043             :         {
    1044   286502912 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    1045   286502912 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
    1046             : #ifndef __ELPA_USE_FMA__
    1047             :                 // conjugate
    1048             :                 h1_imag = _AVX_XOR(h1_imag, sign);
    1049             : #endif
    1050             : 
    1051   286502912 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    1052   286502912 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    1053             : 
    1054   143251456 :                 tmp1 = _AVX_MUL(h1_imag, q1);
    1055             : #ifdef __ELPA_USE_FMA__
    1056   429754368 :                 x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1057             : #else
    1058             :                 x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1059             : #endif
    1060             : 
    1061   143251456 :                 tmp2 = _AVX_MUL(h1_imag, q2);
    1062             : #ifdef __ELPA_USE_FMA__
    1063   429754368 :                 x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1064             : #else
    1065             :                 x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1066             : #endif
    1067             :         }
    1068             : 
    1069     4827136 :         h1_real = _AVX_BROADCAST(&hh_dbl[0]);
    1070     9654272 :         h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
    1071     4827136 :         h1_real = _AVX_XOR(h1_real, sign);
    1072     4827136 :         h1_imag = _AVX_XOR(h1_imag, sign);
    1073             : 
    1074     4827136 :         tmp1 = _AVX_MUL(h1_imag, x1);
    1075             : #ifdef __ELPA_USE_FMA__
    1076     9654272 :         x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1077             : #else
    1078             :         x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1079             : #endif
    1080             : 
    1081     4827136 :         tmp2 = _AVX_MUL(h1_imag, x2);
    1082             : #ifdef __ELPA_USE_FMA__
    1083     9654272 :         x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
    1084             : #else
    1085             :         x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
    1086             : #endif
    1087             : 
    1088     4827136 :         q1 = _AVX_LOAD(&q_dbl[0]);
    1089     9654272 :         q2 = _AVX_LOAD(&q_dbl[offset]);
    1090             : 
    1091     4827136 :         q1 = _AVX_ADD(q1, x1);
    1092     4827136 :         q2 = _AVX_ADD(q2, x2);
    1093             :         _AVX_STORE(&q_dbl[0], q1);
    1094     4827136 :         _AVX_STORE(&q_dbl[offset], q2);
    1095   148078592 :         for (i = 1; i < nb; i++)
    1096             :         {
    1097   286502912 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    1098   286502912 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
    1099             : 
    1100   286502912 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    1101   286502912 :                 q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    1102             : 
    1103   143251456 :                 tmp1 = _AVX_MUL(h1_imag, x1);
    1104             : #ifdef __ELPA_USE_FMA__
    1105   429754368 :                 q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1106             : #else
    1107             :                 q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1108             : #endif
    1109             : 
    1110   143251456 :                 tmp2 = _AVX_MUL(h1_imag, x2);
    1111             : #ifdef __ELPA_USE_FMA__
    1112   429754368 :                 q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1113             : #else
    1114             :                 q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    1115             : #endif
    1116             : 
    1117   143251456 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    1118   143251456 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    1119             :         }
    1120             : }
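Without FMA the kernels reach the conjugated product conj(h)*q, which the first loop of every kernel accumulates, by first negating the broadcast imaginary part with the sign mask (the _AVX_XOR lines) and then combining the two partial products with ADDSUB. A minimal double-precision sketch of that fallback, again checked against C99 complex arithmetic (plain AVX, no FMA needed; the sign mask is written here as a broadcast -0.0, which has the same bit pattern as the constant used above):

#include <immintrin.h>
#include <complex.h>
#include <stdio.h>

int main(void)
{
        double complex h = 0.75 + 2.0*I;
        double complex q[2] = { 1.0 - 1.0*I, 4.0 + 0.5*I };

        __m256d sign   = _mm256_set1_pd(-0.0);                 /* 0x8000... in each lane */
        __m256d h_real = _mm256_set1_pd(creal(h));
        __m256d h_imag = _mm256_set1_pd(cimag(h));
        __m256d qv     = _mm256_loadu_pd((const double*)q);    /* re0 im0 re1 im1 */

        h_imag = _mm256_xor_pd(h_imag, sign);                  /* conjugate: -h_im */
        __m256d tmp = _mm256_mul_pd(h_imag, qv);               /* (-h_im*re, -h_im*im) */
        __m256d res = _mm256_addsub_pd(_mm256_mul_pd(h_real, qv),
                                       _mm256_shuffle_pd(tmp, tmp, 0x5));

        double complex out[2];
        _mm256_storeu_pd((double*)out, res);

        for (int k = 0; k < 2; k++)                            /* compare with conj(h)*q */
                printf("%g%+gi  vs  %g%+gi\n",
                       creal(out[k]), cimag(out[k]),
                       creal(conj(h)*q[k]), cimag(conj(h)*q[k]));
        return 0;
}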
    1121             : 
    1122             : 
    1123             : #ifdef DOUBLE_PRECISION_COMPLEX
    1124             : static __forceinline void hh_trafo_complex_kernel_2_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
    1125             : #endif
    1126             : #ifdef SINGLE_PRECISION_COMPLEX
    1127             : static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
    1128             : #endif
    1129             : {
    1130             : 
    1131             : #ifdef DOUBLE_PRECISION_COMPLEX
    1132           0 :         double* q_dbl = (double*)q;
    1133           0 :         double* hh_dbl = (double*)hh;
    1134             : #endif
    1135             : #ifdef SINGLE_PRECISION_COMPLEX
    1136           0 :         float* q_dbl = (float*)q;
    1137           0 :         float* hh_dbl = (float*)hh;
    1138             : #endif
    1139             :         __AVX_DATATYPE x1, x2;
    1140             :         __AVX_DATATYPE q1, q2;
    1141             :         __AVX_DATATYPE h1_real, h1_imag;
    1142             :         __AVX_DATATYPE tmp1, tmp2;
    1143           0 :         int i=0;
    1144             : 
    1145             : #ifdef DOUBLE_PRECISION_COMPLEX
    1146           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
    1147             : #endif
    1148             : #ifdef SINGLE_PRECISION_COMPLEX
    1149           0 :         __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
    1150             : #endif
    1151             : 
    1152           0 :         x1 = _AVX_LOAD(&q_dbl[0]);
    1153           0 :         for (i = 1; i < nb; i++)
    1154             :         {
    1155           0 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    1156           0 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
    1157             : #ifndef __ELPA_USE_FMA__
    1158             :                 // conjugate
    1159             :                 h1_imag = _AVX_XOR(h1_imag, sign);
    1160             : #endif
    1161             : 
    1162           0 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    1163             : 
    1164           0 :                 tmp1 = _AVX_MUL(h1_imag, q1);
    1165             : #ifdef __ELPA_USE_FMA__
    1166           0 :                 x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1167             : #else
    1168             :                 x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1169             : #endif
    1170             : 
    1171             :         }
    1172             : 
    1173           0 :         h1_real = _AVX_BROADCAST(&hh_dbl[0]);
    1174           0 :         h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
    1175           0 :         h1_real = _AVX_XOR(h1_real, sign);
    1176           0 :         h1_imag = _AVX_XOR(h1_imag, sign);
    1177             : 
    1178           0 :         tmp1 = _AVX_MUL(h1_imag, x1);
    1179             : #ifdef __ELPA_USE_FMA__
    1180           0 :         x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1181             : #else
    1182             :         x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
    1183             : #endif
    1184             : 
    1185           0 :         q1 = _AVX_LOAD(&q_dbl[0]);
    1186             : 
    1187           0 :         q1 = _AVX_ADD(q1, x1);
    1188             :         _AVX_STORE(&q_dbl[0], q1);
    1189           0 :         for (i = 1; i < nb; i++)
    1190             :         {
    1191           0 :                 h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    1192           0 :                 h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
    1193             : 
    1194           0 :                 q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    1195             : 
    1196           0 :                 tmp1 = _AVX_MUL(h1_imag, x1);
    1197             : #ifdef __ELPA_USE_FMA__
    1198           0 :                 q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1199             : #else
    1200             :                 q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    1201             : #endif
    1202             : 
    1203           0 :                 _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    1204             :         }
    1205             : }
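For orientation on the kernel names: a 256-bit register holds two double-precision or four single-precision complex numbers, so the same code body serves as the _6/_4/_2 variant in double precision (3, 2, 1 registers per q row) and as the _12/_8/_4 variant in single precision. The offset used to index q_dbl is presumably the number of real elements per register (4 for double, 8 for single; it is defined earlier in this file, outside this excerpt), and rows of q are 2*ldq real elements apart because each complex entry occupies two reals. A tiny compile-and-run check of the lane counts:

#include <immintrin.h>
#include <complex.h>
#include <stdio.h>

int main(void)
{
        /* complex values per 256-bit register: 2 in double, 4 in single precision */
        printf("double complex per __m256d: %zu\n", sizeof(__m256d) / sizeof(double complex));
        printf("float  complex per __m256 : %zu\n", sizeof(__m256)  / sizeof(float complex));
        return 0;
}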
    1206             : 
    1207             : 
    1208             : 
    1209             : 

Generated by: LCOV version 1.12