// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
//   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//   Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
//   Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//   and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation, which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute-intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem, Intel Westmere, or AMD Magny-Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
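// On AVX2-capable CPUs (e.g. Intel Haswell) the FMA code paths below also
// need FMA enabled at compile time, e.g. -O3 -mavx2 -mfma with GCC
// (or -mavx -mfma4 for the AMD FMA4 variant).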
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

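// Map the MSVC-style __forceinline keyword onto the GCC/Clang always_inline
// attribute, so the kernels below are forced inline into the dispatch routine.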
#define __forceinline __attribute__((always_inline))

#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_MUL _mm256_mul_pd
#define _AVX_ADDSUB _mm256_addsub_pd
#define _AVX_XOR _mm256_xor_pd
#define _AVX_BROADCAST _mm256_broadcast_sd
#define _AVX_SHUFFLE _mm256_shuffle_pd
#define _SHUFFLE 0x5

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif

#endif

#define _AVX_FMADDSUB _mm256_FMADDSUB_pd
#define _AVX_FMSUBADD _mm256_FMSUBADD_pd

#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef SINGLE_PRECISION_COMPLEX
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_ADDSUB _mm256_addsub_ps
#define _AVX_XOR _mm256_xor_ps
#define _AVX_BROADCAST _mm256_broadcast_ss
#define _AVX_SHUFFLE _mm256_shuffle_ps
#define _SHUFFLE 0xb1

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif

#endif

#define _AVX_FMADDSUB _mm256_FMADDSUB_ps
#define _AVX_FMSUBADD _mm256_FMSUBADD_ps
#endif /* SINGLE_PRECISION_COMPLEX */
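
/*
   Note on the complex arithmetic below: the kernels keep complex numbers
   packed as (re, im, re, im, ...). For h = hr + i*hi broadcast into
   h1_real/h1_imag and a packed vector q, the sequence

     tmp = _AVX_MUL(h1_imag, q);              // (hi*re, hi*im, ...)
     _AVX_SHUFFLE(tmp, tmp, _SHUFFLE)         // (hi*im, hi*re, ...)
     _AVX_FMSUBADD(h1_real, q, shuffled tmp)  // conj(h) * q
     _AVX_FMADDSUB(h1_real, q, shuffled tmp)  // h * q

   performs one complex multiplication per element pair. _SHUFFLE (0x5 for
   doubles, 0xb1 for floats) swaps the real and imaginary part of every
   complex element within each 128-bit lane. Without FMA, the same result
   is produced by _AVX_MUL plus _AVX_ADDSUB; the conjugated variant then
   flips the sign of h1_imag beforehand (the "conjugate" XOR in the loops).
*/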

#ifdef DOUBLE_PRECISION_COMPLEX
// Forward declarations
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_10_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_6_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
// Forward declarations
static __forceinline void hh_trafo_complex_kernel_24_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_20_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_16_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine single_hh_trafo_complex_avx_avx2_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine single_hh_trafo_complex_avx_avx2_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
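
/*
   Illustrative call from C (a sketch for orientation only, mirroring the
   Fortran interfaces above): q points to an AVX-aligned column-major block
   with leading dimension ldq, hh holds the nb Householder coefficients with
   hh[0] acting as tau (see the kernels below), and the sizes are passed by
   address for Fortran interoperability:

     int nb = ..., nq = ..., ldq = ...;
     single_hh_trafo_complex_avx_avx2_1hv_double(q, hh, &nb, &nq, &ldq);
*/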

#ifdef DOUBLE_PRECISION_COMPLEX
void single_hh_trafo_complex_avx_avx2_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
#endif
{
  int i;
  int nb = *pnb;
  int nq = *pldq;
  int ldq = *pldq;
  //int ldh = *pldh;
  int worked_on;

  worked_on = 0;

  // Process the bulk of the nq columns with the widest kernel.
#ifdef DOUBLE_PRECISION_COMPLEX
  for (i = 0; i < nq-10; i+=12)
  {
    hh_trafo_complex_kernel_12_AVX_1hv_double(&q[i], hh, nb, ldq);
    worked_on += 12;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  for (i = 0; i < nq-20; i+=24)
  {
    hh_trafo_complex_kernel_24_AVX_1hv_single(&q[i], hh, nb, ldq);
    worked_on += 24;
  }
#endif
  if (nq == i)
  {
    return;
  }
  // Handle the remaining columns with the matching narrower kernel.
#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 10)
  {
    hh_trafo_complex_kernel_10_AVX_1hv_double(&q[i], hh, nb, ldq);
    worked_on += 10;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 20)
  {
    hh_trafo_complex_kernel_20_AVX_1hv_single(&q[i], hh, nb, ldq);
    worked_on += 20;
  }
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 8)
  {
    hh_trafo_complex_kernel_8_AVX_1hv_double(&q[i], hh, nb, ldq);
    worked_on += 8;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 16)
  {
    hh_trafo_complex_kernel_16_AVX_1hv_single(&q[i], hh, nb, ldq);
    worked_on += 16;
  }
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 6)
  {
    hh_trafo_complex_kernel_6_AVX_1hv_double(&q[i], hh, nb, ldq);
    worked_on += 6;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 12)
  {
    hh_trafo_complex_kernel_12_AVX_1hv_single(&q[i], hh, nb, ldq);
    worked_on += 12;
  }
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 4)
  {
    hh_trafo_complex_kernel_4_AVX_1hv_double(&q[i], hh, nb, ldq);
    worked_on += 4;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 8)
  {
    hh_trafo_complex_kernel_8_AVX_1hv_single(&q[i], hh, nb, ldq);
    worked_on += 8;
  }
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 2)
  {
    hh_trafo_complex_kernel_2_AVX_1hv_double(&q[i], hh, nb, ldq);
    worked_on += 2;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 4)
  {
    hh_trafo_complex_kernel_4_AVX_1hv_single(&q[i], hh, nb, ldq);
    worked_on += 4;
  }
#endif
#ifdef WITH_DEBUG
  if (worked_on != nq)
  {
    printf("Error in complex avx-avx2 BLOCK 1 kernel\n");
    abort();
  }
#endif
}
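
/*
   For reference, each kernel below applies a single Householder transform
   q := (I - tau * v * v^H) * q, with v = (1, hh[1], ..., hh[nb-1]) and
   tau = hh[0], to a fixed number of adjacent columns. A scalar sketch of
   the per-column computation (illustration only, not part of the library):

     x = q[0];
     for (i = 1; i < nb; i++)
       x += conj(hh[i]) * q[i*ldq];
     x *= -hh[0];
     q[0] += x;
     for (i = 1; i < nb; i++)
       q[i*ldq] += hh[i] * x;

   The AVX variants keep one to six vector registers of columns (x1..x6)
   live at once, which is why the dispatcher above picks the widest kernel
   first and falls back to narrower ones for the remainder.
*/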

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_24_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
#endif
  __AVX_DATATYPE x1, x2, x3, x4, x5, x6;
  __AVX_DATATYPE q1, q2, q3, q4, q5, q6;
  __AVX_DATATYPE h1_real, h1_imag;
  __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

  // x1..x6 accumulate x = v^H * q over the rows; row 0 enters with v0 = 1.
  x1 = _AVX_LOAD(&q_dbl[0]);
  x2 = _AVX_LOAD(&q_dbl[offset]);
  x3 = _AVX_LOAD(&q_dbl[2*offset]);
  x4 = _AVX_LOAD(&q_dbl[3*offset]);
  x5 = _AVX_LOAD(&q_dbl[4*offset]);
  x6 = _AVX_LOAD(&q_dbl[5*offset]);
  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
    q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
    q6 = _AVX_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

    // x += conj(hh[i]) * q(i,:) for each vector-wide column block
    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB(_AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
    x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    x2 = _AVX_ADD(x2, _AVX_ADDSUB(_AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
    x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    x3 = _AVX_ADD(x3, _AVX_ADDSUB(_AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

    tmp4 = _AVX_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
    x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    x4 = _AVX_ADD(x4, _AVX_ADDSUB(_AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
    tmp5 = _AVX_MUL(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
    x5 = _AVX_ADD(x5, _AVX_FMSUBADD(h1_real, q5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
    x5 = _AVX_ADD(x5, _AVX_ADDSUB(_AVX_MUL(h1_real, q5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
    tmp6 = _AVX_MUL(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
    x6 = _AVX_ADD(x6, _AVX_FMSUBADD(h1_real, q6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
    x6 = _AVX_ADD(x6, _AVX_ADDSUB(_AVX_MUL(h1_real, q6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
  }

  // x = -tau * x: hh[0] holds tau; negating both parts via XOR with the
  // sign mask makes the multiplication below one by -hh[0].
  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  x2 = _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  x3 = _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

  tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
  x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
  x4 = _AVX_ADDSUB(_AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
  tmp5 = _AVX_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
  x5 = _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#else
  x5 = _AVX_ADDSUB(_AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#endif
  tmp6 = _AVX_MUL(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
  x6 = _AVX_FMADDSUB(h1_real, x6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE));
#else
  x6 = _AVX_ADDSUB(_AVX_MUL(h1_real, x6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE));
#endif

  // Rank-1 update q := q + v * x; row 0 uses v0 = 1.
  q1 = _AVX_LOAD(&q_dbl[0]);
  q2 = _AVX_LOAD(&q_dbl[offset]);
  q3 = _AVX_LOAD(&q_dbl[2*offset]);
  q4 = _AVX_LOAD(&q_dbl[3*offset]);
  q5 = _AVX_LOAD(&q_dbl[4*offset]);
  q6 = _AVX_LOAD(&q_dbl[5*offset]);

  q1 = _AVX_ADD(q1, x1);
  q2 = _AVX_ADD(q2, x2);
  q3 = _AVX_ADD(q3, x3);
  q4 = _AVX_ADD(q4, x4);
  q5 = _AVX_ADD(q5, x5);
  q6 = _AVX_ADD(q6, x6);

  _AVX_STORE(&q_dbl[0], q1);
  _AVX_STORE(&q_dbl[offset], q2);
  _AVX_STORE(&q_dbl[2*offset], q3);
  _AVX_STORE(&q_dbl[3*offset], q4);
  _AVX_STORE(&q_dbl[4*offset], q5);
  _AVX_STORE(&q_dbl[5*offset], q6);

  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
    q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
    q6 = _AVX_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

    // q(i,:) += hh[i] * x
    tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
    q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    q3 = _AVX_ADD(q3, _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

    tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
    q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    q4 = _AVX_ADD(q4, _AVX_ADDSUB(_AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
    tmp5 = _AVX_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
    q5 = _AVX_ADD(q5, _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
    q5 = _AVX_ADD(q5, _AVX_ADDSUB(_AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
    tmp6 = _AVX_MUL(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
    q6 = _AVX_ADD(q6, _AVX_FMADDSUB(h1_real, x6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
    q6 = _AVX_ADD(q6, _AVX_ADDSUB(_AVX_MUL(h1_real, x6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif

    _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
    _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
    _AVX_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
    _AVX_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
  }
}

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_10_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_20_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
#endif
  __AVX_DATATYPE x1, x2, x3, x4, x5, x6;
  __AVX_DATATYPE q1, q2, q3, q4, q5, q6;
  __AVX_DATATYPE h1_real, h1_imag;
  __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

  x1 = _AVX_LOAD(&q_dbl[0]);
  x2 = _AVX_LOAD(&q_dbl[offset]);
  x3 = _AVX_LOAD(&q_dbl[2*offset]);
  x4 = _AVX_LOAD(&q_dbl[3*offset]);
  x5 = _AVX_LOAD(&q_dbl[4*offset]);
  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
    q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);

    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB(_AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
    x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    x2 = _AVX_ADD(x2, _AVX_ADDSUB(_AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
    x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    x3 = _AVX_ADD(x3, _AVX_ADDSUB(_AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

    tmp4 = _AVX_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
    x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    x4 = _AVX_ADD(x4, _AVX_ADDSUB(_AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
    tmp5 = _AVX_MUL(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
    x5 = _AVX_ADD(x5, _AVX_FMSUBADD(h1_real, q5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
    x5 = _AVX_ADD(x5, _AVX_ADDSUB(_AVX_MUL(h1_real, q5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
  }

  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  x2 = _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  x3 = _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

  tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
  x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
  x4 = _AVX_ADDSUB(_AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
  tmp5 = _AVX_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
  x5 = _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#else
  x5 = _AVX_ADDSUB(_AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#endif

  q1 = _AVX_LOAD(&q_dbl[0]);
  q2 = _AVX_LOAD(&q_dbl[offset]);
  q3 = _AVX_LOAD(&q_dbl[2*offset]);
  q4 = _AVX_LOAD(&q_dbl[3*offset]);
  q5 = _AVX_LOAD(&q_dbl[4*offset]);

  q1 = _AVX_ADD(q1, x1);
  q2 = _AVX_ADD(q2, x2);
  q3 = _AVX_ADD(q3, x3);
  q4 = _AVX_ADD(q4, x4);
  q5 = _AVX_ADD(q5, x5);

  _AVX_STORE(&q_dbl[0], q1);
  _AVX_STORE(&q_dbl[offset], q2);
  _AVX_STORE(&q_dbl[2*offset], q3);
  _AVX_STORE(&q_dbl[3*offset], q4);
  _AVX_STORE(&q_dbl[4*offset], q5);

  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
    q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);

    tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
    q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    q3 = _AVX_ADD(q3, _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

    tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
    q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    q4 = _AVX_ADD(q4, _AVX_ADDSUB(_AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
    tmp5 = _AVX_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
    q5 = _AVX_ADD(q5, _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
    q5 = _AVX_ADD(q5, _AVX_ADDSUB(_AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif

    _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
    _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
    _AVX_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
  }
}


#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
#endif
  __AVX_DATATYPE x1, x2, x3, x4;
  __AVX_DATATYPE q1, q2, q3, q4;
  __AVX_DATATYPE h1_real, h1_imag;
  __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

  x1 = _AVX_LOAD(&q_dbl[0]);
  x2 = _AVX_LOAD(&q_dbl[offset]);
  x3 = _AVX_LOAD(&q_dbl[2*offset]);
  x4 = _AVX_LOAD(&q_dbl[3*offset]);

  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB(_AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
    x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    x2 = _AVX_ADD(x2, _AVX_ADDSUB(_AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
    x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    x3 = _AVX_ADD(x3, _AVX_ADDSUB(_AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _AVX_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
    x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    x4 = _AVX_ADD(x4, _AVX_ADDSUB(_AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
  }

  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  x2 = _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  x3 = _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
  tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
  x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
  x4 = _AVX_ADDSUB(_AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif

  q1 = _AVX_LOAD(&q_dbl[0]);
  q2 = _AVX_LOAD(&q_dbl[offset]);
  q3 = _AVX_LOAD(&q_dbl[2*offset]);
  q4 = _AVX_LOAD(&q_dbl[3*offset]);

  q1 = _AVX_ADD(q1, x1);
  q2 = _AVX_ADD(q2, x2);
  q3 = _AVX_ADD(q3, x3);
  q4 = _AVX_ADD(q4, x4);

  _AVX_STORE(&q_dbl[0], q1);
  _AVX_STORE(&q_dbl[offset], q2);
  _AVX_STORE(&q_dbl[2*offset], q3);
  _AVX_STORE(&q_dbl[3*offset], q4);

  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

    tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
    q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    q3 = _AVX_ADD(q3, _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
    q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    q4 = _AVX_ADD(q4, _AVX_ADDSUB(_AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

    _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
    _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
  }
}

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_6_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
#endif
  __AVX_DATATYPE x1, x2, x3, x4;
  __AVX_DATATYPE q1, q2, q3, q4;
  __AVX_DATATYPE h1_real, h1_imag;
  __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

  x1 = _AVX_LOAD(&q_dbl[0]);
  x2 = _AVX_LOAD(&q_dbl[offset]);
  x3 = _AVX_LOAD(&q_dbl[2*offset]);

  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);

    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB(_AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
    x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    x2 = _AVX_ADD(x2, _AVX_ADDSUB(_AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
    x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    x3 = _AVX_ADD(x3, _AVX_ADDSUB(_AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  }

  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  x2 = _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  x3 = _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

  q1 = _AVX_LOAD(&q_dbl[0]);
  q2 = _AVX_LOAD(&q_dbl[offset]);
  q3 = _AVX_LOAD(&q_dbl[2*offset]);

  q1 = _AVX_ADD(q1, x1);
  q2 = _AVX_ADD(q2, x2);
  q3 = _AVX_ADD(q3, x3);

  _AVX_STORE(&q_dbl[0], q1);
  _AVX_STORE(&q_dbl[offset], q2);
  _AVX_STORE(&q_dbl[2*offset], q3);

  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);

    tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
    q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    q3 = _AVX_ADD(q3, _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

    _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
  }
}

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
#endif
  __AVX_DATATYPE x1, x2;
  __AVX_DATATYPE q1, q2;
  __AVX_DATATYPE h1_real, h1_imag;
  __AVX_DATATYPE tmp1, tmp2;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

  x1 = _AVX_LOAD(&q_dbl[0]);
  x2 = _AVX_LOAD(&q_dbl[offset]);
  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);

    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB(_AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

    tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
    x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    x2 = _AVX_ADD(x2, _AVX_ADDSUB(_AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
  }

  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif

  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  x2 = _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

  q1 = _AVX_LOAD(&q_dbl[0]);
  q2 = _AVX_LOAD(&q_dbl[offset]);

  q1 = _AVX_ADD(q1, x1);
  q2 = _AVX_ADD(q2, x2);
  _AVX_STORE(&q_dbl[0], q1);
  _AVX_STORE(&q_dbl[offset], q2);
  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);

    tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

    tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
  }
}


#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_2_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
#endif
  __AVX_DATATYPE x1, x2;
  __AVX_DATATYPE q1, q2;
  __AVX_DATATYPE h1_real, h1_imag;
  __AVX_DATATYPE tmp1, tmp2;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

  x1 = _AVX_LOAD(&q_dbl[0]);
  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);

    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB(_AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  }

  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif

  q1 = _AVX_LOAD(&q_dbl[0]);

  q1 = _AVX_ADD(q1, x1);
  _AVX_STORE(&q_dbl[0], q1);
  for (i = 1; i < nb; i++)
  {
    h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);

    tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

    _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
  }
}