Line data Source code
1 : // This file is part of ELPA.
2 : //
3 : // The ELPA library was originally created by the ELPA consortium,
4 : // consisting of the following organizations:
5 : //
6 : // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : // Informatik,
10 : // - Technische Universität München, Lehrstuhl für Informatik mit
11 : // Schwerpunkt Wissenschaftliches Rechnen ,
12 : // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : // - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 : // Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : // and
16 : // - IBM Deutschland GmbH
17 : //
18 : // This particular source code file contains additions, changes and
19 : // enhancements authored by Intel Corporation which is not part of
20 : // the ELPA consortium.
21 : //
22 : // More information can be found here:
23 : // http://elpa.mpcdf.mpg.de/
24 : //
25 : // ELPA is free software: you can redistribute it and/or modify
26 : // it under the terms of the version 3 of the license of the
27 : // GNU Lesser General Public License as published by the Free
28 : // Software Foundation.
29 : //
30 : // ELPA is distributed in the hope that it will be useful,
31 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : // GNU Lesser General Public License for more details.
34 : //
35 : // You should have received a copy of the GNU Lesser General Public License
36 : // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : //
38 : // ELPA reflects a substantial effort on the part of the original
39 : // ELPA consortium, and we ask you to respect the spirit of the
40 : // license that we chose: i.e., please contribute any changes you
41 : // may have back to the original ELPA library distribution, and keep
42 : // any derivatives of ELPA under the same license that we chose for
43 : // the original distribution, the GNU Lesser General Public License.
44 : //
45 : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
46 : // --------------------------------------------------------------------------------------------------
47 : #include "config-f90.h"
48 :
49 : #include <complex.h>
50 : #include <x86intrin.h>
51 : #include <stdio.h>
52 : #include <stdlib.h>
53 :
54 : #define __forceinline __attribute__((always_inline))
55 :
56 : #ifdef DOUBLE_PRECISION_COMPLEX
57 : #define offset 8
58 :
59 : #define __AVX512_DATATYPE __m512d
60 : #define _AVX512_LOAD _mm512_load_pd
61 : #define _AVX512_STORE _mm512_store_pd
62 : #define _AVX512_SET1 _mm512_set1_pd
63 : #define _AVX512_SET _mm512_set_pd
64 : #define _AVX512_MUL _mm512_mul_pd
65 : #define _AVX512_ADD _mm512_add_pd
66 : #define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
67 : #define _AVX512_SHUFFLE _mm512_shuffle_pd
68 : #define _SHUFFLE 0x55
69 :
70 : #ifdef HAVE_AVX512
71 :
72 : #define __ELPA_USE_FMA__
73 : #define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
74 : #define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
75 : #endif
76 :
77 : #define _AVX512_FMADDSUB _mm512_FMADDSUB_pd
78 : #define _AVX512_FMSUBADD _mm512_FMSUBADD_pd
79 : #endif /* DOUBLE_PRECISION_COMPLEX */
80 :
81 : #ifdef SINGLE_PRECISION_COMPLEX
82 : #define offset 16
83 :
84 : #define __AVX512_DATATYPE __m512
85 : #define _AVX512_LOAD _mm512_load_ps
86 : #define _AVX512_STORE _mm512_store_ps
87 : #define _AVX512_SET1 _mm512_set1_ps
88 : #define _AVX512_SET _mm512_set_ps
89 : #define _AVX512_MUL _mm512_mul_ps
90 : #define _AVX512_ADD _mm512_add_ps
91 : #define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
92 : #define _AVX512_SHUFFLE _mm512_shuffle_ps
93 : #define _SHUFFLE 0xb1
94 :
95 : #ifdef HAVE_AVX512
96 :
97 : #define __ELPA_USE_FMA__
98 : #define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
99 : #define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
100 : #endif
101 :
102 : #define _AVX512_FMADDSUB _mm512_FMADDSUB_ps
103 : #define _AVX512_FMSUBADD _mm512_FMSUBADD_ps
104 : #endif /* SINGLE_PRECISION_COMPLEX */
105 :
106 : //Forward declaration
107 : #ifdef DOUBLE_PRECISION_COMPLEX
108 : static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
109 : static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
110 : static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
111 : static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
112 : #endif
113 :
114 : #ifdef SINGLE_PRECISION_COMPLEX
115 : static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
116 : static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
117 : static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
118 : static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
119 : #endif
120 :
121 : /*
122 : !f>#if defined(HAVE_AVX512)
123 : !f> interface
124 : !f> subroutine double_hh_trafo_complex_avx512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
125 : !f> bind(C, name="double_hh_trafo_complex_avx512_2hv_double")
126 : !f> use, intrinsic :: iso_c_binding
127 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
128 : !f> ! complex(kind=c_double_complex) :: q(*)
129 : !f> type(c_ptr), value :: q
130 : !f> complex(kind=c_double_complex) :: hh(pnb,2)
131 : !f> end subroutine
132 : !f> end interface
133 : !f>#endif
134 : */
135 :
136 : /*
137 : !f>#if defined(HAVE_AVX512)
138 : !f> interface
139 : !f> subroutine double_hh_trafo_complex_avx512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
140 : !f> bind(C, name="double_hh_trafo_complex_avx512_2hv_single")
141 : !f> use, intrinsic :: iso_c_binding
142 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
143 : !f> ! complex(kind=c_float_complex) :: q(*)
144 : !f> type(c_ptr), value :: q
145 : !f> complex(kind=c_float_complex) :: hh(pnb,2)
146 : !f> end subroutine
147 : !f> end interface
148 : !f>#endif
149 : */
150 :
151 : #ifdef DOUBLE_PRECISION_COMPLEX
152 327168 : void double_hh_trafo_complex_avx512_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
153 : #endif
154 : #ifdef SINGLE_PRECISION_COMPLEX
155 163584 : void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
156 : #endif
157 : {
158 : int i;
159 490752 : int nb = *pnb;
160 490752 : int nq = *pldq;
161 490752 : int ldq = *pldq;
162 490752 : int ldh = *pldh;
163 : int worked_on;
164 :
165 490752 : worked_on = 0;
166 :
167 : #ifdef DOUBLE_PRECISION_COMPLEX
168 327168 : double complex s = conj(hh[(ldh)+1])*1.0;
169 : #endif
170 : #ifdef SINGLE_PRECISION_COMPLEX
171 163584 : float complex s = conj(hh[(ldh)+1])*1.0f;
172 : #endif
173 15213312 : for (i = 2; i < nb; i++)
174 : {
175 14722560 : s += hh[i-1] * conj(hh[(i+ldh)]);
176 : }
177 :
178 : #ifdef DOUBLE_PRECISION_COMPLEX
179 981504 : for (i = 0; i < nq-12; i+=16)
180 : {
181 654336 : hh_trafo_complex_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
182 654336 : worked_on += 16;
183 : }
184 : #endif
185 : #ifdef SINGLE_PRECISION_COMPLEX
186 327168 : for (i = 0; i < nq-24; i+=32)
187 : {
188 163584 : hh_trafo_complex_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
189 163584 : worked_on += 32;
190 :
191 : }
192 : #endif
193 490752 : if (nq-i == 0) {
194 0 : return;
195 : }
196 : #ifdef DOUBLE_PRECISION_COMPLEX
197 327168 : if (nq-i == 12 )
198 : {
199 0 : hh_trafo_complex_kernel_12_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
200 0 : worked_on += 12;
201 : }
202 : #endif
203 : #ifdef SINGLE_PRECISION_COMPLEX
204 163584 : if (nq-i == 24 )
205 : {
206 0 : hh_trafo_complex_kernel_24_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
207 0 : worked_on += 24;
208 : }
209 : #endif
210 : #ifdef DOUBLE_PRECISION_COMPLEX
211 327168 : if (nq-i == 8 )
212 : {
213 327168 : hh_trafo_complex_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
214 327168 : worked_on += 8;
215 : }
216 : #endif
217 : #ifdef SINGLE_PRECISION_COMPLEX
218 163584 : if (nq-i == 16 )
219 : {
220 81792 : hh_trafo_complex_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
221 81792 : worked_on += 16;
222 : }
223 : #endif
224 :
225 : #ifdef DOUBLE_PRECISION_COMPLEX
226 327168 : if (nq-i == 4 ) {
227 :
228 0 : hh_trafo_complex_kernel_4_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
229 0 : worked_on += 4;
230 : }
231 : #endif
232 :
233 : #ifdef SINGLE_PRECISION_COMPLEX
234 163584 : if (nq-i == 8 ) {
235 :
236 81792 : hh_trafo_complex_kernel_8_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
237 81792 : worked_on += 8;
238 : }
239 : #endif
240 : #ifdef WITH_DEBUG
241 : if (worked_on != nq)
242 : {
243 : printf("Error in complex AVX512 BLOCK 2 kernel \n");
244 : abort();
245 : }
246 : #endif
247 : }
248 :
249 : #ifdef DOUBLE_PRECISION_COMPLEX
250 : static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
251 : #endif
252 : #ifdef SINGLE_PRECISION_COMPLEX
253 : static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
254 : #endif
255 : {
256 :
257 : #ifdef DOUBLE_PRECISION_COMPLEX
258 654336 : double* q_dbl = (double*)q;
259 654336 : double* hh_dbl = (double*)hh;
260 654336 : double* s_dbl = (double*)(&s);
261 : #endif
262 : #ifdef SINGLE_PRECISION_COMPLEX
263 163584 : float* q_dbl = (float*)q;
264 163584 : float* hh_dbl = (float*)hh;
265 163584 : float* s_dbl = (float*)(&s);
266 : #endif
267 : __AVX512_DATATYPE x1, x2, x3, x4;
268 : __AVX512_DATATYPE y1, y2, y3, y4;
269 : __AVX512_DATATYPE q1, q2, q3, q4;
270 : __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
271 : __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
272 817920 : int i=0;
273 :
274 : #ifdef DOUBLE_PRECISION_COMPLEX
275 654336 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
276 : #endif
277 : #ifdef SINGLE_PRECISION_COMPLEX
278 163584 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
279 : #endif
280 :
281 1635840 : x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]); // q1, q2, q3, q4
282 1635840 : x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]); // q5, q6, q7, q8
283 1635840 : x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
284 1635840 : x4 = _AVX512_LOAD(&q_dbl[(2*ldq)+3*offset]); // q13, q14, q15, q16
285 :
286 1635840 : h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
287 1635840 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
288 :
289 817920 : y1 = _AVX512_LOAD(&q_dbl[0]);
290 1635840 : y2 = _AVX512_LOAD(&q_dbl[offset]);
291 1635840 : y3 = _AVX512_LOAD(&q_dbl[2*offset]);
292 1635840 : y4 = _AVX512_LOAD(&q_dbl[3*offset]);
293 :
294 817920 : tmp1 = _AVX512_MUL(h2_imag, x1);
295 :
296 2453760 : y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
297 :
298 817920 : tmp2 = _AVX512_MUL(h2_imag, x2);
299 :
300 2453760 : y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
301 :
302 817920 : tmp3 = _AVX512_MUL(h2_imag, x3);
303 :
304 2453760 : y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
305 :
306 817920 : tmp4 = _AVX512_MUL(h2_imag, x4);
307 :
308 2453760 : y4 = _AVX512_ADD(y4, _AVX512_FMSUBADD(h2_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
309 :
310 25355520 : for (i = 2; i < nb; i++)
311 : {
312 49075200 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
313 49075200 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
314 49075200 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
315 49075200 : q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
316 :
317 49075200 : h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
318 49075200 : h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
319 :
320 24537600 : tmp1 = _AVX512_MUL(h1_imag, q1);
321 :
322 73612800 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
323 :
324 24537600 : tmp2 = _AVX512_MUL(h1_imag, q2);
325 :
326 73612800 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
327 :
328 24537600 : tmp3 = _AVX512_MUL(h1_imag, q3);
329 :
330 73612800 : x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
331 :
332 24537600 : tmp4 = _AVX512_MUL(h1_imag, q4);
333 :
334 73612800 : x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
335 :
336 49075200 : h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
337 49075200 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
338 :
339 24537600 : tmp1 = _AVX512_MUL(h2_imag, q1);
340 :
341 73612800 : y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
342 :
343 24537600 : tmp2 = _AVX512_MUL(h2_imag, q2);
344 :
345 73612800 : y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
346 :
347 24537600 : tmp3 = _AVX512_MUL(h2_imag, q3);
348 :
349 73612800 : y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
350 :
351 24537600 : tmp4 = _AVX512_MUL(h2_imag, q4);
352 :
353 73612800 : y4 = _AVX512_ADD(y4, _AVX512_FMSUBADD(h2_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
354 : }
355 :
356 1635840 : h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
357 1635840 : h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
358 :
359 1635840 : q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
360 1635840 : q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
361 1635840 : q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
362 1635840 : q4 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
363 :
364 817920 : tmp1 = _AVX512_MUL(h1_imag, q1);
365 :
366 2453760 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
367 :
368 817920 : tmp2 = _AVX512_MUL(h1_imag, q2);
369 :
370 2453760 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
371 :
372 817920 : tmp3 = _AVX512_MUL(h1_imag, q3);
373 :
374 2453760 : x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
375 :
376 817920 : tmp4 = _AVX512_MUL(h1_imag, q4);
377 :
378 2453760 : x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
379 :
380 1635840 : h1_real = _AVX512_SET1(hh_dbl[0]);
381 1635840 : h1_imag = _AVX512_SET1(hh_dbl[1]);
382 :
383 : #ifdef DOUBLE_PRECISION_COMPLEX
384 1308672 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
385 1308672 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
386 : #endif
387 : #ifdef SINGLE_PRECISION_COMPLEX
388 327168 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
389 327168 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
390 : #endif
391 817920 : tmp1 = _AVX512_MUL(h1_imag, x1);
392 :
393 1635840 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
394 :
395 817920 : tmp2 = _AVX512_MUL(h1_imag, x2);
396 :
397 1635840 : x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
398 :
399 817920 : tmp3 = _AVX512_MUL(h1_imag, x3);
400 :
401 1635840 : x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
402 :
403 817920 : tmp4 = _AVX512_MUL(h1_imag, x4);
404 :
405 1635840 : x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
406 :
407 1635840 : h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
408 1635840 : h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
409 1635840 : h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
410 1635840 : h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
411 :
412 : #ifdef DOUBLE_PRECISION_COMPLEX
413 1308672 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
414 1308672 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
415 : #endif
416 : #ifdef SINGLE_PRECISION_COMPLEX
417 327168 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
418 327168 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
419 : #endif
420 :
421 : #ifdef DOUBLE_PRECISION_COMPLEX
422 1308672 : h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
423 1308672 : h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
424 : #endif
425 : #ifdef SINGLE_PRECISION_COMPLEX
426 327168 : h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
427 327168 : h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
428 : #endif
429 :
430 : #ifdef DOUBLE_PRECISION_COMPLEX
431 3271680 : tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
432 654336 : s_dbl[1], s_dbl[0],
433 654336 : s_dbl[1], s_dbl[0],
434 654336 : s_dbl[1], s_dbl[0]);
435 : #endif
436 : #ifdef SINGLE_PRECISION_COMPLEX
437 327168 : tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
438 : #endif
439 817920 : tmp1 = _AVX512_MUL(h2_imag, tmp2);
440 :
441 1635840 : tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
442 :
443 : _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
444 :
445 1635840 : h2_real = _AVX512_SET1(s_dbl[0]);
446 1635840 : h2_imag = _AVX512_SET1(s_dbl[1]);
447 :
448 817920 : tmp1 = _AVX512_MUL(h1_imag, y1);
449 :
450 1635840 : y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
451 :
452 817920 : tmp2 = _AVX512_MUL(h1_imag, y2);
453 :
454 1635840 : y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
455 :
456 817920 : tmp3 = _AVX512_MUL(h1_imag, y3);
457 :
458 1635840 : y3 = _AVX512_FMADDSUB(h1_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
459 :
460 817920 : tmp4 = _AVX512_MUL(h1_imag, y4);
461 :
462 1635840 : y4 = _AVX512_FMADDSUB(h1_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
463 :
464 817920 : tmp1 = _AVX512_MUL(h2_imag, x1);
465 :
466 2453760 : y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
467 :
468 817920 : tmp2 = _AVX512_MUL(h2_imag, x2);
469 :
470 2453760 : y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
471 :
472 817920 : tmp3 = _AVX512_MUL(h2_imag, x3);
473 :
474 2453760 : y3 = _AVX512_ADD(y3, _AVX512_FMADDSUB(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
475 :
476 817920 : tmp4 = _AVX512_MUL(h2_imag, x4);
477 :
478 2453760 : y4 = _AVX512_ADD(y4, _AVX512_FMADDSUB(h2_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
479 :
480 817920 : q1 = _AVX512_LOAD(&q_dbl[0]);
481 1635840 : q2 = _AVX512_LOAD(&q_dbl[offset]);
482 1635840 : q3 = _AVX512_LOAD(&q_dbl[2*offset]);
483 1635840 : q4 = _AVX512_LOAD(&q_dbl[3*offset]);
484 :
485 817920 : q1 = _AVX512_ADD(q1, y1);
486 817920 : q2 = _AVX512_ADD(q2, y2);
487 817920 : q3 = _AVX512_ADD(q3, y3);
488 817920 : q4 = _AVX512_ADD(q4, y4);
489 :
490 : _AVX512_STORE(&q_dbl[0], q1);
491 817920 : _AVX512_STORE(&q_dbl[offset], q2);
492 817920 : _AVX512_STORE(&q_dbl[2*offset], q3);
493 817920 : _AVX512_STORE(&q_dbl[3*offset], q4);
494 :
495 1635840 : h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
496 1635840 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
497 :
498 1635840 : q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
499 1635840 : q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
500 1635840 : q3 = _AVX512_LOAD(&q_dbl[(ldq*2)+2*offset]);
501 1635840 : q4 = _AVX512_LOAD(&q_dbl[(ldq*2)+3*offset]);
502 :
503 817920 : q1 = _AVX512_ADD(q1, x1);
504 817920 : q2 = _AVX512_ADD(q2, x2);
505 817920 : q3 = _AVX512_ADD(q3, x3);
506 817920 : q4 = _AVX512_ADD(q4, x4);
507 :
508 817920 : tmp1 = _AVX512_MUL(h2_imag, y1);
509 :
510 2453760 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
511 :
512 817920 : tmp2 = _AVX512_MUL(h2_imag, y2);
513 :
514 2453760 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
515 :
516 817920 : tmp3 = _AVX512_MUL(h2_imag, y3);
517 :
518 2453760 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
519 :
520 817920 : tmp4 = _AVX512_MUL(h2_imag, y4);
521 :
522 2453760 : q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h2_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
523 :
524 817920 : _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
525 817920 : _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
526 817920 : _AVX512_STORE(&q_dbl[(ldq*2)+2*offset], q3);
527 817920 : _AVX512_STORE(&q_dbl[(ldq*2)+3*offset], q4);
528 :
529 25355520 : for (i = 2; i < nb; i++)
530 : {
531 49075200 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
532 49075200 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
533 49075200 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
534 49075200 : q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
535 :
536 49075200 : h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
537 49075200 : h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
538 :
539 24537600 : tmp1 = _AVX512_MUL(h1_imag, x1);
540 :
541 73612800 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
542 :
543 24537600 : tmp2 = _AVX512_MUL(h1_imag, x2);
544 :
545 73612800 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
546 :
547 24537600 : tmp3 = _AVX512_MUL(h1_imag, x3);
548 :
549 73612800 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
550 :
551 24537600 : tmp4 = _AVX512_MUL(h1_imag, x4);
552 :
553 73612800 : q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
554 :
555 49075200 : h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
556 49075200 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
557 :
558 24537600 : tmp1 = _AVX512_MUL(h2_imag, y1);
559 :
560 73612800 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
561 :
562 24537600 : tmp2 = _AVX512_MUL(h2_imag, y2);
563 :
564 73612800 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
565 :
566 24537600 : tmp3 = _AVX512_MUL(h2_imag, y3);
567 :
568 73612800 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
569 :
570 24537600 : tmp4 = _AVX512_MUL(h2_imag, y4);
571 :
572 73612800 : q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h2_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
573 :
574 24537600 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
575 24537600 : _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
576 24537600 : _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
577 24537600 : _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
578 : }
579 :
580 1635840 : h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
581 1635840 : h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
582 :
583 1635840 : q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
584 1635840 : q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
585 1635840 : q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
586 1635840 : q4 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
587 :
588 817920 : tmp1 = _AVX512_MUL(h1_imag, x1);
589 :
590 2453760 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
591 :
592 817920 : tmp2 = _AVX512_MUL(h1_imag, x2);
593 :
594 2453760 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
595 :
596 817920 : tmp3 = _AVX512_MUL(h1_imag, x3);
597 :
598 2453760 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
599 :
600 817920 : tmp4 = _AVX512_MUL(h1_imag, x4);
601 :
602 2453760 : q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
603 :
604 817920 : _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
605 817920 : _AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
606 817920 : _AVX512_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
607 817920 : _AVX512_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
608 : }
609 :
610 :
611 : #ifdef DOUBLE_PRECISION_COMPLEX
612 : static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
613 : #endif
614 : #ifdef SINGLE_PRECISION_COMPLEX
615 : static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
616 : #endif
617 : {
618 :
619 : #ifdef DOUBLE_PRECISION_COMPLEX
620 0 : double* q_dbl = (double*)q;
621 0 : double* hh_dbl = (double*)hh;
622 0 : double* s_dbl = (double*)(&s);
623 : #endif
624 : #ifdef SINGLE_PRECISION_COMPLEX
625 0 : float* q_dbl = (float*)q;
626 0 : float* hh_dbl = (float*)hh;
627 0 : float* s_dbl = (float*)(&s);
628 : #endif
629 : __AVX512_DATATYPE x1, x2, x3, x4;
630 : __AVX512_DATATYPE y1, y2, y3, y4;
631 : __AVX512_DATATYPE q1, q2, q3, q4;
632 : __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
633 : __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
634 0 : int i=0;
635 :
636 : #ifdef DOUBLE_PRECISION_COMPLEX
637 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
638 : #endif
639 : #ifdef SINGLE_PRECISION_COMPLEX
640 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
641 : #endif
642 :
643 0 : x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]); // q1, q2, q3, q4
644 0 : x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]); // q5, q6, q7, q8
645 0 : x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
646 :
647 0 : h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
648 0 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
649 :
650 0 : y1 = _AVX512_LOAD(&q_dbl[0]);
651 0 : y2 = _AVX512_LOAD(&q_dbl[offset]);
652 0 : y3 = _AVX512_LOAD(&q_dbl[2*offset]);
653 :
654 0 : tmp1 = _AVX512_MUL(h2_imag, x1);
655 :
656 0 : y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
657 :
658 0 : tmp2 = _AVX512_MUL(h2_imag, x2);
659 :
660 0 : y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
661 :
662 0 : tmp3 = _AVX512_MUL(h2_imag, x3);
663 :
664 0 : y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
665 :
666 0 : for (i = 2; i < nb; i++)
667 : {
668 0 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
669 0 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
670 0 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
671 :
672 0 : h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
673 0 : h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
674 :
675 0 : tmp1 = _AVX512_MUL(h1_imag, q1);
676 :
677 0 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
678 :
679 0 : tmp2 = _AVX512_MUL(h1_imag, q2);
680 :
681 0 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
682 :
683 0 : tmp3 = _AVX512_MUL(h1_imag, q3);
684 :
685 0 : x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
686 :
687 0 : h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
688 0 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
689 :
690 0 : tmp1 = _AVX512_MUL(h2_imag, q1);
691 :
692 0 : y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
693 :
694 0 : tmp2 = _AVX512_MUL(h2_imag, q2);
695 :
696 0 : y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
697 :
698 0 : tmp3 = _AVX512_MUL(h2_imag, q3);
699 :
700 0 : y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
701 :
702 : }
703 :
704 0 : h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
705 0 : h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
706 :
707 0 : q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
708 0 : q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
709 0 : q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
710 :
711 0 : tmp1 = _AVX512_MUL(h1_imag, q1);
712 :
713 0 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
714 :
715 0 : tmp2 = _AVX512_MUL(h1_imag, q2);
716 :
717 0 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
718 :
719 0 : tmp3 = _AVX512_MUL(h1_imag, q3);
720 :
721 0 : x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
722 :
723 0 : h1_real = _AVX512_SET1(hh_dbl[0]);
724 0 : h1_imag = _AVX512_SET1(hh_dbl[1]);
725 :
726 : #ifdef DOUBLE_PRECISION_COMPLEX
727 0 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
728 0 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
729 : #endif
730 : #ifdef SINGLE_PRECISION_COMPLEX
731 0 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
732 0 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
733 : #endif
734 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
735 :
736 0 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
737 :
738 0 : tmp2 = _AVX512_MUL(h1_imag, x2);
739 :
740 0 : x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
741 :
742 0 : tmp3 = _AVX512_MUL(h1_imag, x3);
743 :
744 0 : x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
745 :
746 0 : h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
747 0 : h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
748 0 : h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
749 0 : h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
750 :
751 : #ifdef DOUBLE_PRECISION_COMPLEX
752 0 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
753 0 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
754 : #endif
755 : #ifdef SINGLE_PRECISION_COMPLEX
756 0 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
757 0 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
758 : #endif
759 :
760 : #ifdef DOUBLE_PRECISION_COMPLEX
761 0 : h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
762 0 : h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
763 : #endif
764 : #ifdef SINGLE_PRECISION_COMPLEX
765 0 : h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
766 0 : h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
767 : #endif
768 :
769 : #ifdef DOUBLE_PRECISION_COMPLEX
770 0 : tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
771 0 : s_dbl[1], s_dbl[0],
772 0 : s_dbl[1], s_dbl[0],
773 0 : s_dbl[1], s_dbl[0]);
774 : #endif
775 : #ifdef SINGLE_PRECISION_COMPLEX
776 0 : tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
777 : #endif
778 0 : tmp1 = _AVX512_MUL(h2_imag, tmp2);
779 :
780 0 : tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
781 :
782 : _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
783 :
784 0 : h2_real = _AVX512_SET1(s_dbl[0]);
785 0 : h2_imag = _AVX512_SET1(s_dbl[1]);
786 :
787 0 : tmp1 = _AVX512_MUL(h1_imag, y1);
788 :
789 0 : y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
790 :
791 0 : tmp2 = _AVX512_MUL(h1_imag, y2);
792 :
793 0 : y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
794 :
795 0 : tmp3 = _AVX512_MUL(h1_imag, y3);
796 :
797 0 : y3 = _AVX512_FMADDSUB(h1_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
798 :
799 0 : tmp1 = _AVX512_MUL(h2_imag, x1);
800 :
801 0 : y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
802 :
803 0 : tmp2 = _AVX512_MUL(h2_imag, x2);
804 :
805 0 : y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
806 :
807 0 : tmp3 = _AVX512_MUL(h2_imag, x3);
808 :
809 0 : y3 = _AVX512_ADD(y3, _AVX512_FMADDSUB(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
810 :
811 0 : q1 = _AVX512_LOAD(&q_dbl[0]);
812 0 : q2 = _AVX512_LOAD(&q_dbl[offset]);
813 0 : q3 = _AVX512_LOAD(&q_dbl[2*offset]);
814 :
815 0 : q1 = _AVX512_ADD(q1, y1);
816 0 : q2 = _AVX512_ADD(q2, y2);
817 0 : q3 = _AVX512_ADD(q3, y3);
818 :
819 : _AVX512_STORE(&q_dbl[0], q1);
820 0 : _AVX512_STORE(&q_dbl[offset], q2);
821 0 : _AVX512_STORE(&q_dbl[2*offset], q3);
822 :
823 0 : h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
824 0 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
825 :
826 0 : q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
827 0 : q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
828 0 : q3 = _AVX512_LOAD(&q_dbl[(ldq*2)+2*offset]);
829 :
830 0 : q1 = _AVX512_ADD(q1, x1);
831 0 : q2 = _AVX512_ADD(q2, x2);
832 0 : q3 = _AVX512_ADD(q3, x3);
833 :
834 0 : tmp1 = _AVX512_MUL(h2_imag, y1);
835 :
836 0 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
837 :
838 0 : tmp2 = _AVX512_MUL(h2_imag, y2);
839 :
840 0 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
841 :
842 0 : tmp3 = _AVX512_MUL(h2_imag, y3);
843 :
844 0 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
845 :
846 0 : _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
847 0 : _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
848 0 : _AVX512_STORE(&q_dbl[(ldq*2)+2*offset], q3);
849 :
850 0 : for (i = 2; i < nb; i++)
851 : {
852 0 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
853 0 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
854 0 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
855 :
856 0 : h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
857 0 : h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
858 :
859 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
860 :
861 0 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
862 :
863 0 : tmp2 = _AVX512_MUL(h1_imag, x2);
864 :
865 0 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
866 :
867 0 : tmp3 = _AVX512_MUL(h1_imag, x3);
868 :
869 0 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
870 :
871 0 : h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
872 0 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
873 :
874 0 : tmp1 = _AVX512_MUL(h2_imag, y1);
875 :
876 0 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
877 :
878 0 : tmp2 = _AVX512_MUL(h2_imag, y2);
879 :
880 0 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
881 :
882 0 : tmp3 = _AVX512_MUL(h2_imag, y3);
883 :
884 0 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
885 :
886 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
887 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
888 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
889 : }
890 :
891 0 : h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
892 0 : h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
893 :
894 0 : q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
895 0 : q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
896 0 : q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
897 :
898 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
899 :
900 0 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
901 :
902 0 : tmp2 = _AVX512_MUL(h1_imag, x2);
903 :
904 0 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
905 :
906 0 : tmp3 = _AVX512_MUL(h1_imag, x3);
907 :
908 0 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
909 :
910 0 : _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
911 0 : _AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
912 0 : _AVX512_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
913 : }
914 :
915 :
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
#endif
{
    // Apply a pair of complex Householder reflectors ("2hv") to a strip of q
    // that is two AVX-512 registers wide (8 double-complex / 16 float-complex
    // lanes total -- hence the kernel names).
    //
    //   q   : matrix strip, column stride ldq (counted in complex elements)
    //   hh  : the two Householder vectors; the second starts at offset ldh
    //   nb  : number of rows the reflectors act on
    //   s   : scalar coupling product of the two reflectors; updated in place
    //
    // The complex arrays are re-viewed as interleaved real scalar arrays
    // (re,im,re,im,...), so every scalar index below carries a factor of 2.
    // NOTE(review): `offset` and the _AVX512_* macros are defined earlier in
    // this file per precision; `offset` is presumably the number of scalars
    // per zmm register (8 doubles / 16 floats) -- confirm against the header.
#ifdef DOUBLE_PRECISION_COMPLEX
    double* q_dbl = (double*)q;
    double* hh_dbl = (double*)hh;
    double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    float* q_dbl = (float*)q;
    float* hh_dbl = (float*)hh;
    float* s_dbl = (float*)(&s);
#endif

    __AVX512_DATATYPE x1, x2;                                  // accumulator for reflector 1 (two vectors wide)
    __AVX512_DATATYPE y1, y2;                                  // accumulator for reflector 2
    __AVX512_DATATYPE q1, q2;                                  // current row of q
    __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;      // broadcast Householder coefficients
    __AVX512_DATATYPE tmp1, tmp2;
    int i=0;

    // All-lanes sign-bit mask: XOR with it negates a whole vector cheaply.
#ifdef DOUBLE_PRECISION_COMPLEX
    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

    // Phase 1: accumulate x = <v1, q-rows> and y = <v2, q-rows>.
    // The FMSUBADD form (even lanes a*b+c, odd lanes a*b-c) together with the
    // re/im lane swap of _AVX512_SHUFFLE realizes multiplication by the
    // CONJUGATE of the broadcast coefficient, as needed for the dot products.
    x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);
    x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);

    h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
    h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);

    y1 = _AVX512_LOAD(&q_dbl[0]);
    y2 = _AVX512_LOAD(&q_dbl[offset]);

    tmp1 = _AVX512_MUL(h2_imag, x1);

    y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

    tmp2 = _AVX512_MUL(h2_imag, x2);

    y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

    // Rows 2..nb-1 contribute to both accumulators (v1 is implicitly 1 at its
    // first row, v2 at its first two rows -- handled by the code above/below).
    for (i = 2; i < nb; i++)
    {
        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);

        h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
        h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);

        tmp1 = _AVX512_MUL(h1_imag, q1);

        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h1_imag, q2);

        x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);

        tmp1 = _AVX512_MUL(h2_imag, q1);

        y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h2_imag, q2);

        y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
    }

    // Last row (nb) only overlaps reflector 1.
    h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
    h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);

    q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
    q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);

    tmp1 = _AVX512_MUL(h1_imag, q1);

    x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

    tmp2 = _AVX512_MUL(h1_imag, q2);

    x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

    // Phase 2: scale x by -tau1 (hh[0] negated via the sign mask).
    // FMADDSUB (even lanes a*b-c, odd lanes a*b+c) plus the lane swap is the
    // plain (non-conjugated) complex multiply.
    h1_real = _AVX512_SET1(hh_dbl[0]);
    h1_imag = _AVX512_SET1(hh_dbl[1]);

#ifdef DOUBLE_PRECISION_COMPLEX
    h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
    h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
    h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif


    tmp1 = _AVX512_MUL(h1_imag, x1);

    x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

    tmp2 = _AVX512_MUL(h1_imag, x2);

    x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

    // Negate tau2 (hh[ldh]) in both h1 and h2 copies; h1 scales y below,
    // h2 is consumed first by the update of the coupling scalar s.
    h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
    h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
    h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
    h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);

#ifdef DOUBLE_PRECISION_COMPLEX
    h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
    h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
    h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
    h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
    h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
    h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif

    // Broadcast s = (re,im) into every complex lane; for single precision the
    // two consecutive floats are replicated as one 64-bit unit.
#ifdef DOUBLE_PRECISION_COMPLEX
    tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
                       s_dbl[1], s_dbl[0],
                       s_dbl[1], s_dbl[0],
                       s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
#endif

    // s := (-tau2) * s; store only the two lowest scalars (one complex value)
    // back into s via the 0x03 write mask.
    tmp1 = _AVX512_MUL(h2_imag, tmp2);

    tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

    _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);

    h2_real = _AVX512_SET1(s_dbl[0]);
    h2_imag = _AVX512_SET1(s_dbl[1]);

    // y := (-tau2)*y + s*x
    tmp1 = _AVX512_MUL(h1_imag, y1);

    y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

    tmp2 = _AVX512_MUL(h1_imag, y2);

    y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

    tmp1 = _AVX512_MUL(h2_imag, x1);

    y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

    tmp2 = _AVX512_MUL(h2_imag, x2);

    y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

    // Phase 3: rank-2 update of q.  Row 0 receives only y.
    q1 = _AVX512_LOAD(&q_dbl[0]);
    q2 = _AVX512_LOAD(&q_dbl[offset]);

    q1 = _AVX512_ADD(q1, y1);
    q2 = _AVX512_ADD(q2, y2);

    _AVX512_STORE(&q_dbl[0], q1);
    _AVX512_STORE(&q_dbl[offset], q2);

    // Row 1 receives x plus hh[ldh+1]*y.
    h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
    h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);

    q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
    q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);

    q1 = _AVX512_ADD(q1, x1);
    q2 = _AVX512_ADD(q2, x2);

    tmp1 = _AVX512_MUL(h2_imag, y1);

    q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

    tmp2 = _AVX512_MUL(h2_imag, y2);

    q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

    _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
    _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);

    // Interior rows receive hh[i-1]*x + hh[ldh+i]*y.
    for (i = 2; i < nb; i++)
    {
        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);

        h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
        h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);

        tmp1 = _AVX512_MUL(h1_imag, x1);

        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h1_imag, x2);

        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);

        tmp1 = _AVX512_MUL(h2_imag, y1);

        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h2_imag, y2);

        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tm2 := tmp2, _SHUFFLE)));

        _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
        _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
    }
    // Final row (nb) receives only hh[nb-1]*x.
    h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
    h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);

    q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
    q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);

    tmp1 = _AVX512_MUL(h1_imag, x1);

    q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

    tmp2 = _AVX512_MUL(h1_imag, x2);

    q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

    _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
    _AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
}
1163 :
1164 : #ifdef DOUBLE_PRECISION_COMPLEX
1165 : static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
1166 : #endif
1167 : #ifdef SINGLE_PRECISION_COMPLEX
1168 : static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
1169 : #endif
1170 : {
1171 :
1172 : #ifdef DOUBLE_PRECISION_COMPLEX
1173 0 : double* q_dbl = (double*)q;
1174 0 : double* hh_dbl = (double*)hh;
1175 0 : double* s_dbl = (double*)(&s);
1176 : #endif
1177 : #ifdef SINGLE_PRECISION_COMPLEX
1178 81792 : float* q_dbl = (float*)q;
1179 81792 : float* hh_dbl = (float*)hh;
1180 81792 : float* s_dbl = (float*)(&s);
1181 : #endif
1182 :
1183 : __AVX512_DATATYPE x1, x2;
1184 : __AVX512_DATATYPE y1, y2;
1185 : __AVX512_DATATYPE q1, q2;
1186 : __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
1187 : __AVX512_DATATYPE tmp1, tmp2;
1188 81792 : int i=0;
1189 :
1190 : #ifdef DOUBLE_PRECISION_COMPLEX
1191 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
1192 : #endif
1193 : #ifdef SINGLE_PRECISION_COMPLEX
1194 81792 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
1195 : #endif
1196 :
1197 163584 : x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);
1198 :
1199 163584 : h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
1200 163584 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
1201 :
1202 81792 : y1 = _AVX512_LOAD(&q_dbl[0]);
1203 :
1204 81792 : tmp1 = _AVX512_MUL(h2_imag, x1);
1205 :
1206 245376 : y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1207 :
1208 2535552 : for (i = 2; i < nb; i++)
1209 : {
1210 4907520 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
1211 4907520 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
1212 :
1213 4907520 : h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
1214 4907520 : h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
1215 :
1216 2453760 : tmp1 = _AVX512_MUL(h1_imag, q1);
1217 :
1218 7361280 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1219 :
1220 4907520 : h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
1221 4907520 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
1222 :
1223 2453760 : tmp1 = _AVX512_MUL(h2_imag, q1);
1224 :
1225 7361280 : y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1226 :
1227 : }
1228 :
1229 163584 : h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
1230 163584 : h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
1231 :
1232 163584 : q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
1233 :
1234 81792 : tmp1 = _AVX512_MUL(h1_imag, q1);
1235 :
1236 245376 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1237 :
1238 163584 : h1_real = _AVX512_SET1(hh_dbl[0]);
1239 163584 : h1_imag = _AVX512_SET1(hh_dbl[1]);
1240 :
1241 : #ifdef DOUBLE_PRECISION_COMPLEX
1242 0 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
1243 0 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
1244 : #endif
1245 : #ifdef SINGLE_PRECISION_COMPLEX
1246 163584 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
1247 163584 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
1248 : #endif
1249 :
1250 :
1251 81792 : tmp1 = _AVX512_MUL(h1_imag, x1);
1252 :
1253 163584 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1254 :
1255 163584 : h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
1256 163584 : h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
1257 163584 : h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
1258 163584 : h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
1259 :
1260 : #ifdef DOUBLE_PRECISION_COMPLEX
1261 0 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
1262 0 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
1263 : #endif
1264 : #ifdef SINGLE_PRECISION_COMPLEX
1265 163584 : h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
1266 163584 : h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
1267 : #endif
1268 :
1269 : #ifdef DOUBLE_PRECISION_COMPLEX
1270 0 : h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
1271 0 : h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
1272 : #endif
1273 : #ifdef SINGLE_PRECISION_COMPLEX
1274 163584 : h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
1275 163584 : h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
1276 : #endif
1277 :
1278 : #ifdef DOUBLE_PRECISION_COMPLEX
1279 0 : tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
1280 0 : s_dbl[1], s_dbl[0],
1281 0 : s_dbl[1], s_dbl[0],
1282 0 : s_dbl[1], s_dbl[0]);
1283 : #endif
1284 : #ifdef SINGLE_PRECISION_COMPLEX
1285 163584 : tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
1286 : #endif
1287 :
1288 81792 : tmp1 = _AVX512_MUL(h2_imag, tmp2);
1289 :
1290 163584 : tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1291 :
1292 : _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
1293 :
1294 163584 : h2_real = _AVX512_SET1(s_dbl[0]);
1295 163584 : h2_imag = _AVX512_SET1(s_dbl[1]);
1296 :
1297 81792 : tmp1 = _AVX512_MUL(h1_imag, y1);
1298 :
1299 163584 : y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1300 :
1301 81792 : tmp1 = _AVX512_MUL(h2_imag, x1);
1302 :
1303 245376 : y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1304 :
1305 81792 : q1 = _AVX512_LOAD(&q_dbl[0]);
1306 :
1307 81792 : q1 = _AVX512_ADD(q1, y1);
1308 :
1309 : _AVX512_STORE(&q_dbl[0], q1);
1310 :
1311 163584 : h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
1312 163584 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
1313 :
1314 163584 : q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
1315 :
1316 81792 : q1 = _AVX512_ADD(q1, x1);
1317 :
1318 81792 : tmp1 = _AVX512_MUL(h2_imag, y1);
1319 :
1320 245376 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1321 :
1322 81792 : _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
1323 :
1324 2535552 : for (i = 2; i < nb; i++)
1325 : {
1326 4907520 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
1327 :
1328 4907520 : h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
1329 4907520 : h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
1330 :
1331 2453760 : tmp1 = _AVX512_MUL(h1_imag, x1);
1332 :
1333 7361280 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1334 :
1335 4907520 : h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
1336 4907520 : h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
1337 :
1338 2453760 : tmp1 = _AVX512_MUL(h2_imag, y1);
1339 :
1340 7361280 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1341 :
1342 2453760 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
1343 : }
1344 163584 : h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
1345 163584 : h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
1346 :
1347 163584 : q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
1348 :
1349 81792 : tmp1 = _AVX512_MUL(h1_imag, x1);
1350 :
1351 245376 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1352 :
1353 81792 : _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
1354 : }
1355 :
|