// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
//   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//   Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
//   Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//   and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of version 3 of the GNU Lesser General Public
// License as published by the Free Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute-intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny-Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
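//
// Editorial build sketch (not part of the original header; file name and
// hand-set macros are stand-ins for what ELPA's configure machinery normally
// provides via config-f90.h): a standalone compile of the double-precision
// variant might look like
//
//   gcc -O3 -mavx  -DDOUBLE_PRECISION_COMPLEX -c elpa2_kernels_complex_avx-avx2_2hv.c
//   gcc -O3 -mavx2 -DHAVE_AVX2 -DDOUBLE_PRECISION_COMPLEX -c elpa2_kernels_complex_avx-avx2_2hv.c
//
// the second form enabling the fused-multiply-add paths guarded below.
//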
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

#define __forceinline __attribute__((always_inline))

#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_MUL _mm256_mul_pd
#define _AVX_ADDSUB _mm256_addsub_pd
#define _AVX_XOR _mm256_xor_pd
#define _AVX_BROADCAST _mm256_broadcast_sd
#define _AVX_SET1 _mm256_set1_pd
#define _AVX_SHUFFLE _mm256_shuffle_pd
#define _SHUFFLE 0x5
#define _CAST _mm256_castpd256_pd128

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif

#define _AVX_FMADDSUB _mm256_FMADDSUB_pd
#define _AVX_FMSUBADD _mm256_FMSUBADD_pd
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef SINGLE_PRECISION_COMPLEX
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_ADDSUB _mm256_addsub_ps
#define _AVX_XOR _mm256_xor_ps
#define _AVX_BROADCAST _mm256_broadcast_ss
#define _AVX_SET1 _mm256_set1_ps
#define _AVX_SHUFFLE _mm256_shuffle_ps
#define _SHUFFLE 0xb1
#define _CAST _mm256_castps256_ps128
#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif

#define _AVX_FMADDSUB _mm256_FMADDSUB_ps
#define _AVX_FMSUBADD _mm256_FMSUBADD_ps
#endif
#endif /* SINGLE_PRECISION_COMPLEX */
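
// Editorial note on the arithmetic used throughout (inferred from the code,
// not part of the original source): a packed complex product h*x, with
// h = a+bi broadcast and x holding [c, d] pairs, is built from two real
// vector operations,
//
//   tmp = h_imag * x;                              // [b*c, b*d]
//   res = _AVX_ADDSUB(h_real * x, SHUF(tmp));      // [a*c - b*d, a*d + b*c]
//
// where SHUF swaps the two halves of each complex lane (_AVX_SHUFFLE with
// _SHUFFLE). ADDSUB subtracts in even lanes and adds in odd lanes, which is
// exactly (a+bi)(c+di). On FMA hardware _AVX_FMADDSUB fuses the multiply with
// the add/sub, and _AVX_FMSUBADD (add in even lanes, subtract in odd) yields
// conj(h)*x directly; that is why the non-FMA paths below instead flip the
// sign bit of h_imag with the `sign` mask before using ADDSUB.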

#ifdef DOUBLE_PRECISION_COMPLEX
// Forward declaration
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
// Forward declaration
static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_12_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_double_complex) :: q(*)
!f>     type(c_ptr), value :: q
!f>     complex(kind=c_double_complex) :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex) :: q(*)
!f>     type(c_ptr), value :: q
!f>     complex(kind=c_float_complex) :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
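
// Editorial usage sketch (illustrative values, not part of the original
// source): the !f> blocks above are scanned by ELPA's build system to
// generate the Fortran interface; from C the driver below could be called as
//
//   int nb = 64, nq = 64, ldq = 64, ldh = 64;   /* made-up sizes */
//   double_hh_trafo_complex_avx_avx2_2hv_double(q, hh, &nb, &nq, &ldq, &ldh);
//
// with q pointing to at least (nb+1)*ldq 32-byte-aligned complex entries
// (the kernels use aligned loads) and hh holding the two Householder vectors
// of length nb, stored ldh apart.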

#ifdef DOUBLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx_avx2_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
  int i;
  int nb = *pnb;
  int nq = *pldq;
  int ldq = *pldq;
  int ldh = *pldh;
  int worked_on;

  worked_on = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float complex s = conj(hh[(ldh)+1])*1.0f;
#endif

  for (i = 2; i < nb; i++)
  {
    s += hh[i-1] * conj(hh[(i+ldh)]);
  }
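
  // s now holds the coupling of the two Householder vectors, i.e. the scalar
  //
  //   s = conj(hh[ldh+1]) + sum_{i=2}^{nb-1} hh[i-1] * conj(hh[ldh+i]),
  //
  // which each kernel later multiplies by the (negated) second tau factor
  // hh[ldh] when combining y with x.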

#ifdef DOUBLE_PRECISION_COMPLEX
  for (i = 0; i < nq-6; i+=8)
  {
    hh_trafo_complex_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 8;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  for (i = 0; i < nq-12; i+=16)
  {
    hh_trafo_complex_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
    worked_on += 16;
  }
#endif
  if (nq-i == 0) {
    return;
  }
#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 6) {
    hh_trafo_complex_kernel_6_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 6;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 12) {
    hh_trafo_complex_kernel_12_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
    worked_on += 12;
  }
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 4) {
    hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 4;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 8) {
    hh_trafo_complex_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
    worked_on += 8;
  }
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 2) {
    hh_trafo_complex_kernel_2_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 2;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 4) {
    hh_trafo_complex_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
    worked_on += 4;
  }
#endif
#ifdef WITH_DEBUG
  if (worked_on != nq) {
    printf("Error in complex avx-avx2 BLOCK 2 kernel\n");
    abort();
  }
#endif
}
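
#if 0
/* Editorial reference sketch (never compiled, not part of the original ELPA
 * source): a plain-C version of the transformation the AVX kernels below
 * implement, applying two Householder vectors to nq rows of q. Variable roles
 * are inferred from the kernel code; treat this as documentation only. */
static void hh_trafo_complex_reference_double(double complex* q, double complex* hh,
                                              int nb, int nq, int ldq, int ldh,
                                              double complex s)
{
  int i, j;
  for (j = 0; j < nq; j++)
  {
    /* Phase 1: project the q row onto conj(v1) and conj(v2); v1 and v2 carry
     * an implicit leading 1 at rows 1 and 0 respectively. */
    double complex x = q[ldq + j];
    double complex y = q[j] + q[ldq + j] * conj(hh[ldh + 1]);
    for (i = 2; i < nb; i++)
    {
      x += q[i * ldq + j] * conj(hh[i - 1]);
      y += q[i * ldq + j] * conj(hh[ldh + i]);
    }
    x += q[nb * ldq + j] * conj(hh[nb - 1]);

    /* Phase 2: scale by the negated tau factors; s couples the two vectors. */
    x = -hh[0] * x;
    y = -hh[ldh] * y + (-hh[ldh] * s) * x;

    /* Phase 3: rank-2 update of the q row. */
    q[j]       += y;
    q[ldq + j] += x + y * hh[ldh + 1];
    for (i = 2; i < nb; i++)
      q[i * ldq + j] += x * hh[i - 1] + y * hh[ldh + i];
    q[nb * ldq + j] += x * hh[nb - 1];
  }
}
#endif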

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
#endif
{

#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
  double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
  float* s_dbl = (float*)(&s);
#endif
  __AVX_DATATYPE x1, x2, x3, x4;
  __AVX_DATATYPE y1, y2, y3, y4;
  __AVX_DATATYPE q1, q2, q3, q4;
  __AVX_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
  __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

  x1 = _AVX_LOAD(&q_dbl[(2*ldq)+0]);
  x2 = _AVX_LOAD(&q_dbl[(2*ldq)+offset]);
  x3 = _AVX_LOAD(&q_dbl[(2*ldq)+2*offset]);
  x4 = _AVX_LOAD(&q_dbl[(2*ldq)+3*offset]);
  h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
  h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
  // conjugate
  h2_imag = _AVX_XOR(h2_imag, sign);
#endif

  y1 = _AVX_LOAD(&q_dbl[0]);
  y2 = _AVX_LOAD(&q_dbl[offset]);
  y3 = _AVX_LOAD(&q_dbl[2*offset]);
  y4 = _AVX_LOAD(&q_dbl[3*offset]);

  tmp1 = _AVX_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
  y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

  tmp3 = _AVX_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
  y3 = _AVX_ADD(y3, _AVX_FMSUBADD(h2_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  tmp4 = _AVX_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
  y4 = _AVX_ADD(y4, _AVX_FMSUBADD(h2_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
  y4 = _AVX_ADD(y4, _AVX_ADDSUB( _AVX_MUL(h2_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
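
  // The FMSUBADD form above computes y += conj(h2)*x lane-wise; on the
  // non-FMA path the same conjugation was already folded into h2_imag via
  // the sign mask.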

  for (i = 2; i < nb; i++)
  {
    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

    h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
    x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
    x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _AVX_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
    x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

    h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
    h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h2_imag = _AVX_XOR(h2_imag, sign);
#endif

    tmp1 = _AVX_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
    y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
    y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    tmp3 = _AVX_MUL(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
    y3 = _AVX_ADD(y3, _AVX_FMSUBADD(h2_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _AVX_MUL(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
    y4 = _AVX_ADD(y4, _AVX_FMSUBADD(h2_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    y4 = _AVX_ADD(y4, _AVX_ADDSUB( _AVX_MUL(h2_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
  }
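
  // x and y now hold the running projections onto the two Householder
  // vectors for columns 2..nb-1 of q; the tail below adds the last
  // column's contribution to x.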

  h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
  // conjugate
  h1_imag = _AVX_XOR(h1_imag, sign);
#endif

  q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
  q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
  q3 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
  q4 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);

  tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

  tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  tmp4 = _AVX_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
  x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
  x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
  tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
  x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
  x4 = _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif

  h1_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
  h2_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
  h2_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);

  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);
  h2_real = _AVX_XOR(h2_real, sign);
  h2_imag = _AVX_XOR(h2_imag, sign);

#ifdef DOUBLE_PRECISION_COMPLEX
  tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
                       s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif

  tmp1 = _AVX_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
  tmp2 = _AVX_FMADDSUB(h2_real, tmp2, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif

  h2_real = _AVX_SET1(tmp2[0]);
  h2_imag = _AVX_SET1(tmp2[1]);
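
  // tmp2 now carries (-tau2)*s in every complex lane (the sign flips above
  // folded the minus into h2), and the element accesses tmp2[0]/tmp2[1] rely
  // on the GCC/Clang vector-extension subscript operator.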

  tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_FMADDSUB(h1_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  y1 = _AVX_ADDSUB( _AVX_MUL(h1_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
  y2 = _AVX_FMADDSUB(h1_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  y2 = _AVX_ADDSUB( _AVX_MUL(h1_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

  tmp3 = _AVX_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
  y3 = _AVX_FMADDSUB(h1_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  y3 = _AVX_ADDSUB( _AVX_MUL(h1_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
  tmp4 = _AVX_MUL(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
  y4 = _AVX_FMADDSUB(h1_real, y4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
  y4 = _AVX_ADDSUB( _AVX_MUL(h1_real, y4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif

  tmp1 = _AVX_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_ADD(y1, _AVX_FMADDSUB(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
  y2 = _AVX_ADD(y2, _AVX_FMADDSUB(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

  tmp3 = _AVX_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
  y3 = _AVX_ADD(y3, _AVX_FMADDSUB(h2_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  tmp4 = _AVX_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
  y4 = _AVX_ADD(y4, _AVX_FMADDSUB(h2_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
  y4 = _AVX_ADD(y4, _AVX_ADDSUB( _AVX_MUL(h2_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

  q1 = _AVX_LOAD(&q_dbl[0]);
  q2 = _AVX_LOAD(&q_dbl[offset]);
  q3 = _AVX_LOAD(&q_dbl[2*offset]);
  q4 = _AVX_LOAD(&q_dbl[3*offset]);

  q1 = _AVX_ADD(q1, y1);
  q2 = _AVX_ADD(q2, y2);
  q3 = _AVX_ADD(q3, y3);
  q4 = _AVX_ADD(q4, y4);

  _AVX_STORE(&q_dbl[0], q1);
  _AVX_STORE(&q_dbl[offset], q2);
  _AVX_STORE(&q_dbl[2*offset], q3);
  _AVX_STORE(&q_dbl[3*offset], q4);

  h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
  h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);

  q1 = _AVX_LOAD(&q_dbl[(ldq*2)+0]);
  q2 = _AVX_LOAD(&q_dbl[(ldq*2)+offset]);
  q3 = _AVX_LOAD(&q_dbl[(ldq*2)+2*offset]);
  q4 = _AVX_LOAD(&q_dbl[(ldq*2)+3*offset]);

  q1 = _AVX_ADD(q1, x1);
  q2 = _AVX_ADD(q2, x2);
  q3 = _AVX_ADD(q3, x3);
  q4 = _AVX_ADD(q4, x4);

  tmp1 = _AVX_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
  q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h2_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h2_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

  tmp3 = _AVX_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
  q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h2_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h2_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  tmp4 = _AVX_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
  q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h2_real, y4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
  q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h2_real, y4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

  _AVX_STORE(&q_dbl[(ldq*2)+0], q1);
  _AVX_STORE(&q_dbl[(ldq*2)+offset], q2);
  _AVX_STORE(&q_dbl[(ldq*2)+2*offset], q3);
  _AVX_STORE(&q_dbl[(ldq*2)+3*offset], q4);
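
  // Columns 0 and 1 of q are updated at this point; the loop below applies
  // the same rank-2 update, q(:,i) += hh[i-1]*x + hh[ldh+i]*y, to the
  // remaining columns.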

  for (i = 2; i < nb; i++)
  {
    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

    h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);

    tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
    q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
    q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

    h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
    h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);

    tmp1 = _AVX_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h2_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h2_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

    tmp3 = _AVX_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
    q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h2_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h2_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _AVX_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
    q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h2_real, y4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h2_real, y4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

    _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
    _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
  }
  h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);

  q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
  q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
  q3 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
  q4 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
  q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
  q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

  _AVX_STORE(&q_dbl[(2*nb*ldq)+0], q1);
  _AVX_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
  _AVX_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
  _AVX_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
}
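
// The narrower kernels below repeat the same three phases (accumulate,
// scale, rank-2 update) with 3, 2 and finally 1 AVX register per row block;
// only the register blocking differs from the kernel above.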

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_12_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
#endif

{
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
  double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
  float* s_dbl = (float*)(&s);
#endif
  __AVX_DATATYPE x1, x2, x3;
  __AVX_DATATYPE y1, y2, y3;
  __AVX_DATATYPE q1, q2, q3;
  __AVX_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
  __AVX_DATATYPE tmp1, tmp2, tmp3;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif
  x1 = _AVX_LOAD(&q_dbl[(2*ldq)+0]);
  x2 = _AVX_LOAD(&q_dbl[(2*ldq)+offset]);
  x3 = _AVX_LOAD(&q_dbl[(2*ldq)+2*offset]);

  h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
  h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
  // conjugate
  h2_imag = _AVX_XOR(h2_imag, sign);
#endif

  y1 = _AVX_LOAD(&q_dbl[0]);
  y2 = _AVX_LOAD(&q_dbl[offset]);
  y3 = _AVX_LOAD(&q_dbl[2*offset]);

  tmp1 = _AVX_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
  y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
  tmp3 = _AVX_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
  y3 = _AVX_ADD(y3, _AVX_FMSUBADD(h2_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

  for (i = 2; i < nb; i++)
  {
    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);

    h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
    x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
    x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

    h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
    h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h2_imag = _AVX_XOR(h2_imag, sign);
#endif

    tmp1 = _AVX_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
    y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
    y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
    y3 = _AVX_ADD(y3, _AVX_FMSUBADD(h2_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  }

  h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
  // conjugate
  h1_imag = _AVX_XOR(h1_imag, sign);
#endif

  q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
  q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
  q3 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);

  tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
  tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

  h1_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
  h2_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
  h2_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);

  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);
  h2_real = _AVX_XOR(h2_real, sign);
  h2_imag = _AVX_XOR(h2_imag, sign);

#ifdef DOUBLE_PRECISION_COMPLEX
  tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
                       s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif

  tmp1 = _AVX_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
  tmp2 = _AVX_FMADDSUB(h2_real, tmp2, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  h2_real = _AVX_SET1(tmp2[0]);
  h2_imag = _AVX_SET1(tmp2[1]);

  tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_FMADDSUB(h1_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  y1 = _AVX_ADDSUB( _AVX_MUL(h1_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
  y2 = _AVX_FMADDSUB(h1_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  y2 = _AVX_ADDSUB( _AVX_MUL(h1_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
  tmp3 = _AVX_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
  y3 = _AVX_FMADDSUB(h1_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  y3 = _AVX_ADDSUB( _AVX_MUL(h1_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

  tmp1 = _AVX_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_ADD(y1, _AVX_FMADDSUB(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
  y2 = _AVX_ADD(y2, _AVX_FMADDSUB(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
  tmp3 = _AVX_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
  y3 = _AVX_ADD(y3, _AVX_FMADDSUB(h2_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

  q1 = _AVX_LOAD(&q_dbl[0]);
  q2 = _AVX_LOAD(&q_dbl[offset]);
  q3 = _AVX_LOAD(&q_dbl[2*offset]);

  q1 = _AVX_ADD(q1, y1);
  q2 = _AVX_ADD(q2, y2);
  q3 = _AVX_ADD(q3, y3);

  _AVX_STORE(&q_dbl[0], q1);
  _AVX_STORE(&q_dbl[offset], q2);
  _AVX_STORE(&q_dbl[2*offset], q3);

  h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
  h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);

  q1 = _AVX_LOAD(&q_dbl[(ldq*2)+0]);
  q2 = _AVX_LOAD(&q_dbl[(ldq*2)+offset]);
  q3 = _AVX_LOAD(&q_dbl[(ldq*2)+2*offset]);

  q1 = _AVX_ADD(q1, x1);
  q2 = _AVX_ADD(q2, x2);
  q3 = _AVX_ADD(q3, x3);

  tmp1 = _AVX_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
  q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h2_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h2_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
  tmp3 = _AVX_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
  q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h2_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h2_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

  _AVX_STORE(&q_dbl[(ldq*2)+0], q1);
  _AVX_STORE(&q_dbl[(ldq*2)+offset], q2);
  _AVX_STORE(&q_dbl[(ldq*2)+2*offset], q3);

  for (i = 2; i < nb; i++)
  {
    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);

    h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);

    tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
    q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

    h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
    h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);

    tmp1 = _AVX_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
    q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h2_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h2_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
    q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h2_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h2_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

    _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
    _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
  }
  h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);

  q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
  q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
  q3 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

  _AVX_STORE(&q_dbl[(2*nb*ldq)+0], q1);
  _AVX_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
  _AVX_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
}
1117 :
1118 : #ifdef DOUBLE_PRECISION_COMPLEX
1119 : static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
1120 : #endif
1121 : #ifdef SINGLE_PRECISION_COMPLEX
1122 : static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
1123 : #endif
1124 :
1125 : {
1126 : #ifdef DOUBLE_PRECISION_COMPLEX
1127 0 : double* q_dbl = (double*)q;
1128 0 : double* hh_dbl = (double*)hh;
1129 0 : double* s_dbl = (double*)(&s);
1130 : #endif
1131 : #ifdef SINGLE_PRECISION_COMPLEX
1132 654336 : float* q_dbl = (float*)q;
1133 654336 : float* hh_dbl = (float*)hh;
1134 654336 : float* s_dbl = (float*)(&s);
1135 : #endif
1136 : __AVX_DATATYPE x1, x2;
1137 : __AVX_DATATYPE y1, y2;
1138 : __AVX_DATATYPE q1, q2;
1139 : __AVX_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
1140 : __AVX_DATATYPE tmp1, tmp2;
1141 654336 : int i=0;
1142 :
1143 : #ifdef DOUBLE_PRECISION_COMPLEX
1144 0 : __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
1145 : #endif
1146 : #ifdef SINGLE_PRECISION_COMPLEX
1147 654336 : __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
1148 : #endif
1149 :
1150 1308672 : x1 = _AVX_LOAD(&q_dbl[(2*ldq)+0]);
1151 1308672 : x2 = _AVX_LOAD(&q_dbl[(2*ldq)+offset]);
1152 :
1153 1308672 : h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
1154 1308672 : h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
1155 : #ifndef __ELPA_USE_FMA__
1156 : // conjugate
1157 : h2_imag = _AVX_XOR(h2_imag, sign);
1158 : #endif
1159 :
1160 654336 : y1 = _AVX_LOAD(&q_dbl[0]);
1161 1308672 : y2 = _AVX_LOAD(&q_dbl[offset]);
1162 :
1163 654336 : tmp1 = _AVX_MUL(h2_imag, x1);
1164 : #ifdef __ELPA_USE_FMA__
1165 1963008 : y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1166 : #else
1167 : y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1168 : #endif
1169 :
1170 654336 : tmp2 = _AVX_MUL(h2_imag, x2);
1171 : #ifdef __ELPA_USE_FMA__
1172 1963008 : y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1173 : #else
1174 : y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1175 : #endif
1176 :
1177 20284416 : for (i = 2; i < nb; i++)
1178 : {
1179 39260160 : q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
1180 39260160 : q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
1181 :
1182 39260160 : h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
1183 39260160 : h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
1184 : #ifndef __ELPA_USE_FMA__
1185 : // conjugate
1186 : h1_imag = _AVX_XOR(h1_imag, sign);
1187 : #endif
1188 :
1189 19630080 : tmp1 = _AVX_MUL(h1_imag, q1);
1190 : #ifdef __ELPA_USE_FMA__
1191 58890240 : x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1192 : #else
1193 : x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1194 : #endif
1195 :
1196 19630080 : tmp2 = _AVX_MUL(h1_imag, q2);
1197 : #ifdef __ELPA_USE_FMA__
1198 58890240 : x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1199 : #else
1200 : x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1201 : #endif
1202 :
1203 39260160 : h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
1204 39260160 : h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
1205 : #ifndef __ELPA_USE_FMA__
1206 : // conjugate
1207 : h2_imag = _AVX_XOR(h2_imag, sign);
1208 : #endif
1209 :
1210 19630080 : tmp1 = _AVX_MUL(h2_imag, q1);
1211 : #ifdef __ELPA_USE_FMA__
1212 58890240 : y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1213 : #else
1214 : y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1215 : #endif
1216 :
1217 19630080 : tmp2 = _AVX_MUL(h2_imag, q2);
1218 : #ifdef __ELPA_USE_FMA__
1219 58890240 : y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1220 : #else
1221 : y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1222 : #endif
1223 :
1224 : }
1225 :
1226 1308672 : h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
1227 1308672 : h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
1228 : #ifndef __ELPA_USE_FMA__
1229 : // conjugate
1230 : h1_imag = _AVX_XOR(h1_imag, sign);
1231 : #endif
1232 :
1233 1308672 : q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
1234 1308672 : q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
1235 :
1236 654336 : tmp1 = _AVX_MUL(h1_imag, q1);
1237 : #ifdef __ELPA_USE_FMA__
1238 1963008 : x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1239 : #else
1240 : x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1241 : #endif
1242 :
1243 654336 : tmp2 = _AVX_MUL(h1_imag, q2);
1244 : #ifdef __ELPA_USE_FMA__
1245 1963008 : x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1246 : #else
1247 : x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1248 : #endif
1249 :
1250 654336 : h1_real = _AVX_BROADCAST(&hh_dbl[0]);
1251 1308672 : h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
1252 654336 : h1_real = _AVX_XOR(h1_real, sign);
1253 654336 : h1_imag = _AVX_XOR(h1_imag, sign);
1254 :
1255 654336 : tmp1 = _AVX_MUL(h1_imag, x1);
1256 : #ifdef __ELPA_USE_FMA__
1257 1308672 : x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1258 : #else
1259 : x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1260 : #endif
1261 :
1262 654336 : tmp2 = _AVX_MUL(h1_imag, x2);
1263 : #ifdef __ELPA_USE_FMA__
1264 1308672 : x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1265 : #else
1266 : x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1267 : #endif
1268 :
1269 1308672 : h1_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
1270 1308672 : h1_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
1271 1308672 : h2_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
1272 1308672 : h2_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
1273 :
1274 654336 : h1_real = _AVX_XOR(h1_real, sign);
1275 654336 : h1_imag = _AVX_XOR(h1_imag, sign);
1276 654336 : h2_real = _AVX_XOR(h2_real, sign);
1277 654336 : h2_imag = _AVX_XOR(h2_imag, sign);
1278 :
1279 : #ifdef DOUBLE_PRECISION_COMPLEX
1280 0 : tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
1281 : #endif
1282 : #ifdef SINGLE_PRECISION_COMPLEX
1283 2617344 : tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
1284 1308672 : s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
1285 : #endif
1286 :
1287 1308672 : tmp1 = _AVX_MUL(h2_imag, tmp2);
1288 : #ifdef __ELPA_USE_FMA__
1289 1308672 : tmp2 = _AVX_FMADDSUB(h2_real, tmp2, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1290 : #else
1291 : tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1292 : #endif
1293 1308672 : h2_real = _AVX_SET1(tmp2[0]);
1294 1308672 : h2_imag = _AVX_SET1(tmp2[1]);
1295 :
1296 654336 : tmp1 = _AVX_MUL(h1_imag, y1);
1297 : #ifdef __ELPA_USE_FMA__
1298 1308672 : y1 = _AVX_FMADDSUB(h1_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1299 : #else
1300 : y1 = _AVX_ADDSUB( _AVX_MUL(h1_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1301 : #endif
1302 :
1303 654336 : tmp2 = _AVX_MUL(h1_imag, y2);
1304 : #ifdef __ELPA_USE_FMA__
1305 1308672 : y2 = _AVX_FMADDSUB(h1_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1306 : #else
1307 : y2 = _AVX_ADDSUB( _AVX_MUL(h1_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1308 : #endif
1309 :
1310 654336 : tmp1 = _AVX_MUL(h2_imag, x1);
1311 : #ifdef __ELPA_USE_FMA__
1312 1963008 : y1 = _AVX_ADD(y1, _AVX_FMADDSUB(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1313 : #else
1314 : y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1315 : #endif
1316 654336 : tmp2 = _AVX_MUL(h2_imag, x2);
1317 : #ifdef __ELPA_USE_FMA__
1318 1963008 : y2 = _AVX_ADD(y2, _AVX_FMADDSUB(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1319 : #else
1320 : y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1321 : #endif
1322 :
1323 654336 : q1 = _AVX_LOAD(&q_dbl[0]);
1324 1308672 : q2 = _AVX_LOAD(&q_dbl[offset]);
1325 :
1326 654336 : q1 = _AVX_ADD(q1, y1);
1327 654336 : q2 = _AVX_ADD(q2, y2);
1328 :
1329 : _AVX_STORE(&q_dbl[0], q1);
1330 654336 : _AVX_STORE(&q_dbl[offset], q2);
1331 :
1332 1308672 : h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
1333 1308672 : h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
1334 :
1335 1308672 : q1 = _AVX_LOAD(&q_dbl[(ldq*2)+0]);
1336 1308672 : q2 = _AVX_LOAD(&q_dbl[(ldq*2)+offset]);
1337 :
1338 654336 : q1 = _AVX_ADD(q1, x1);
1339 654336 : q2 = _AVX_ADD(q2, x2);
1340 :
1341 654336 : tmp1 = _AVX_MUL(h2_imag, y1);
1342 : #ifdef __ELPA_USE_FMA__
1343 1963008 : q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1344 : #else
1345 : q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1346 : #endif
1347 :
1348 654336 : tmp2 = _AVX_MUL(h2_imag, y2);
1349 : #ifdef __FMA4_
1350 : q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h2_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1351 : #else
1352 2617344 : q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h2_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1353 : #endif
1354 :
1355 654336 : _AVX_STORE(&q_dbl[(ldq*2)+0], q1);
1356 654336 : _AVX_STORE(&q_dbl[(ldq*2)+offset], q2);
1357 :
1358 20284416 : for (i = 2; i < nb; i++)
1359 : {
1360 39260160 : q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
1361 39260160 : q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
1362 :
1363 39260160 : h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
1364 39260160 : h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
1365 :
1366 19630080 : tmp1 = _AVX_MUL(h1_imag, x1);
1367 : #ifdef __ELPA_USE_FMA__
1368 58890240 : q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1369 : #else
1370 : q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1371 : #endif
1372 :
1373 19630080 : tmp2 = _AVX_MUL(h1_imag, x2);
1374 : #ifdef __ELPA_USE_FMA__
1375 58890240 : q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1376 : #else
1377 : q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1378 : #endif
1379 39260160 : h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
1380 39260160 : h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
1381 :
1382 19630080 : tmp1 = _AVX_MUL(h2_imag, y1);
1383 : #ifdef __ELPA_USE_FMA__
1384 58890240 : q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1385 : #else
1386 : q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1387 : #endif
1388 :
1389 19630080 : tmp2 = _AVX_MUL(h2_imag, y2);
1390 : #ifdef __ELPA_USE_FMA__
1391 58890240 : q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h2_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1392 : #else
1393 : q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h2_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1394 : #endif
1395 :
1396 19630080 : _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
1397 19630080 : _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
1398 : }
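     : // Last row: only the first Householder vector reaches row nb.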
1399 1308672 : h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
1400 1308672 : h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
1401 :
1402 1308672 : q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
1403 1308672 : q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
1404 :
1405 654336 : tmp1 = _AVX_MUL(h1_imag, x1);
1406 : #ifdef __ELPA_USE_FMA__
1407 1963008 : q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1408 : #else
1409 : q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1410 : #endif
1411 :
1412 654336 : tmp2 = _AVX_MUL(h1_imag, x2);
1413 : #ifdef __ELPA_USE_FMA__
1414 1963008 : q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1415 : #else
1416 : q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1417 : #endif
1418 :
1419 654336 : _AVX_STORE(&q_dbl[(2*nb*ldq)+0], q1);
1420 654336 : _AVX_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
1421 : }
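     :
     : // For reference, a scalar sketch of what these 2hv kernels compute
     : // (illustrative only, using the argument names above; v1/v2 denote
     : // the two Householder vectors stored in hh, each with an implicit
     : // leading 1; s is the scalar product of the two vectors, supplied
     : // by the caller):
     : //
     : //   x = v1^H q(1:nb) ;  y = v2^H q(0:nb-1)   // dot-product phase
     : //   x := -tau1 * x                           // tau1 = hh(0)
     : //   y := -tau2 * y - (tau2 * s) * x          // tau2 = hh(ldh)
     : //   q := q + v1 * x + v2 * y                 // rank-2 update phase
     : //
     : // i.e. the compact application of (I - tau2 v2 v2^H)(I - tau1 v1 v1^H).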
1422 :
1423 : #ifdef DOUBLE_PRECISION_COMPLEX
1424 : static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
1425 : #endif
1426 : #ifdef SINGLE_PRECISION_COMPLEX
1427 : static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
1428 : #endif
1429 :
1430 : {
1431 : #ifdef DOUBLE_PRECISION_COMPLEX
1432 0 : double* q_dbl = (double*)q;
1433 0 : double* hh_dbl = (double*)hh;
1434 0 : double* s_dbl = (double*)(&s);
1435 : #endif
1436 : #ifdef SINGLE_PRECISION_COMPLEX
1437 0 : float* q_dbl = (float*)q;
1438 0 : float* hh_dbl = (float*)hh;
1439 0 : float* s_dbl = (float*)(&s);
1440 : #endif
1441 : __AVX_DATATYPE x1;
1442 : __AVX_DATATYPE y1;
1443 : __AVX_DATATYPE q1;
1444 : __AVX_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
1445 : __AVX_DATATYPE tmp1;
1446 0 : int i=0;
1447 : #ifdef DOUBLE_PRECISION_COMPLEX
1448 0 : __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
1449 : #endif
1450 : #ifdef SINGLE_PRECISION_COMPLEX
1451 0 : __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
1452 : #endif
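     : // 'sign' flips the IEEE sign bit via XOR: used both to negate the
     : // tau values and, on non-FMA builds, to conjugate the hh coefficients.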
1453 0 : x1 = _AVX_LOAD(&q_dbl[(2*ldq)+0]);
1454 :
1455 0 : h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
1456 0 : h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
1457 : #ifndef __ELPA_USE_FMA__
1458 : // conjugate
1459 : h2_imag = _AVX_XOR(h2_imag, sign);
1460 : #endif
1461 :
1462 0 : y1 = _AVX_LOAD(&q_dbl[0]);
1463 :
1464 0 : tmp1 = _AVX_MUL(h2_imag, x1);
1465 : #ifdef __ELPA_USE_FMA__
1466 0 : y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1467 : #else
1468 : y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1469 : #endif
1470 :
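     : // Dot-product phase: accumulate x1 += conj(hh(i-1))*q(i) and
     : // y1 += conj(hh(ldh+i))*q(i) over the inner rows.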
1471 0 : for (i = 2; i < nb; i++)
1472 : {
1473 0 : q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
1474 :
1475 0 : h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
1476 0 : h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
1477 : #ifndef __ELPA_USE_FMA__
1478 : // conjugate
1479 : h1_imag = _AVX_XOR(h1_imag, sign);
1480 : #endif
1481 :
1482 0 : tmp1 = _AVX_MUL(h1_imag, q1);
1483 : #ifdef __ELPA_USE_FMA__
1484 0 : x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1485 : #else
1486 : x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1487 : #endif
1488 :
1489 0 : h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
1490 0 : h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
1491 : #ifndef __ELPA_USE_FMA__
1492 : // conjugate
1493 : h2_imag = _AVX_XOR(h2_imag, sign);
1494 : #endif
1495 :
1496 0 : tmp1 = _AVX_MUL(h2_imag, q1);
1497 : #ifdef __ELPA_USE_FMA__
1498 0 : y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1499 : #else
1500 : y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1501 : #endif
1502 : }
1503 :
1504 0 : h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
1505 0 : h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
1506 : #ifndef __ELPA_USE_FMA__
1507 : // conjugate
1508 : h1_imag = _AVX_XOR(h1_imag, sign);
1509 : #endif
1510 :
1511 0 : q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
1512 :
1513 0 : tmp1 = _AVX_MUL(h1_imag, q1);
1514 : #ifdef __ELPA_USE_FMA__
1515 0 : x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1516 : #else
1517 : x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1518 : #endif
1519 :
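     : // x1 now holds the full first dot product; scale it by -tau1 = -hh(0).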
1520 0 : h1_real = _AVX_BROADCAST(&hh_dbl[0]);
1521 0 : h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
1522 0 : h1_real = _AVX_XOR(h1_real, sign);
1523 0 : h1_imag = _AVX_XOR(h1_imag, sign);
1524 :
1525 0 : tmp1 = _AVX_MUL(h1_imag, x1);
1526 : #ifdef __ELPA_USE_FMA__
1527 0 : x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1528 : #else
1529 : x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1530 : #endif
1531 :
1532 0 : h1_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
1533 0 : h1_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
1534 0 : h2_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
1535 0 : h2_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
1536 :
1537 0 : h1_real = _AVX_XOR(h1_real, sign);
1538 0 : h1_imag = _AVX_XOR(h1_imag, sign);
1539 0 : h2_real = _AVX_XOR(h2_real, sign);
1540 0 : h2_imag = _AVX_XOR(h2_imag, sign);
1541 :
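     : // tau2 = hh(ldh) is loaded twice and negated: h1 will scale y
     : // directly, h2 is first multiplied by the scalar s below.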
1542 : __AVX_DATATYPE tmp2;
1543 : #ifdef DOUBLE_PRECISION_COMPLEX
1544 0 : tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
1545 : #endif
1546 : #ifdef SINGLE_PRECISION_COMPLEX
1547 0 : tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
1548 0 : s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
1549 : #endif
1550 0 : tmp1 = _AVX_MUL(h2_imag, tmp2);
1551 : #ifdef __ELPA_USE_FMA__
1552 0 : tmp2 = _AVX_FMADDSUB(h2_real, tmp2, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1553 : #else
1554 : tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1555 : #endif
1556 0 : h2_real = _AVX_SET1(tmp2[0]);
1557 0 : h2_imag = _AVX_SET1(tmp2[1]);
1558 :
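     : // h2 now broadcasts -tau2*s; the next two blocks form
     : // y := -tau2*y - (tau2*s)*x, coupling the two reflectors.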
1559 0 : tmp1 = _AVX_MUL(h1_imag, y1);
1560 : #ifdef __ELPA_USE_FMA__
1561 0 : y1 = _AVX_FMADDSUB(h1_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1562 : #else
1563 : y1 = _AVX_ADDSUB( _AVX_MUL(h1_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1564 : #endif
1565 :
1566 0 : tmp1 = _AVX_MUL(h2_imag, x1);
1567 : #ifdef __ELPA_USE_FMA__
1568 0 : y1 = _AVX_ADD(y1, _AVX_FMADDSUB(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1569 : #else
1570 : y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1571 : #endif
1572 :
1573 0 : q1 = _AVX_LOAD(&q_dbl[0]);
1574 :
1575 0 : q1 = _AVX_ADD(q1, y1);
1576 :
1577 : _AVX_STORE(&q_dbl[0], q1);
1578 :
1579 0 : h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
1580 0 : h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
1581 :
1582 0 : q1 = _AVX_LOAD(&q_dbl[(ldq*2)+0]);
1583 :
1584 0 : q1 = _AVX_ADD(q1, x1);
1585 :
1586 0 : tmp1 = _AVX_MUL(h2_imag, y1);
1587 : #ifdef __ELPA_USE_FMA__
1588 0 : q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1589 : #else
1590 : q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1591 : #endif
1592 :
1593 0 : _AVX_STORE(&q_dbl[(ldq*2)+0], q1);
1594 :
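     : // Update phase mirrors the wider kernel: q(0) and q(ldq) were handled
     : // above; inner rows get q(i) += hh(i-1)*x1 + hh(ldh+i)*y1.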
1595 0 : for (i = 2; i < nb; i++)
1596 : {
1597 0 : q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
1598 :
1599 0 : h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
1600 0 : h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
1601 :
1602 0 : tmp1 = _AVX_MUL(h1_imag, x1);
1603 : #ifdef __ELPA_USE_FMA__
1604 0 : q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1605 : #else
1606 : q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1607 : #endif
1608 :
1609 0 : h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
1610 0 : h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
1611 :
1612 0 : tmp1 = _AVX_MUL(h2_imag, y1);
1613 : #ifdef __ELPA_USE_FMA__
1614 0 : q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1615 : #else
1616 : q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1617 : #endif
1618 :
1619 0 : _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
1620 : }
1621 0 : h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
1622 0 : h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
1623 :
1624 0 : q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
1625 :
1626 0 : tmp1 = _AVX_MUL(h1_imag, x1);
1627 : #ifdef __ELPA_USE_FMA__
1628 0 : q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1629 : #else
1630 : q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1631 : #endif
1632 :
1633 0 : _AVX_STORE(&q_dbl[(2*nb*ldq)+0], q1);
1634 : }