Line data Source code
1 : // This file is part of ELPA.
2 : //
3 : // The ELPA library was originally created by the ELPA consortium,
4 : // consisting of the following organizations:
5 : //
6 : // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : // Informatik,
10 : // - Technische Universität München, Lehrstuhl für Informatik mit
11 : // Schwerpunkt Wissenschaftliches Rechnen ,
12 : // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : // and
16 : // - IBM Deutschland GmbH
17 : //
18 : // This particular source code file contains additions, changes and
19 : // enhancements authored by Intel Corporation which is not part of
20 : // the ELPA consortium.
21 : //
22 : // More information can be found here:
23 : // http://elpa.mpcdf.mpg.de/
24 : //
25 : // ELPA is free software: you can redistribute it and/or modify
26 : // it under the terms of the version 3 of the license of the
27 : // GNU Lesser General Public License as published by the Free
28 : // Software Foundation.
29 : //
30 : // ELPA is distributed in the hope that it will be useful,
31 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : // GNU Lesser General Public License for more details.
34 : //
35 : // You should have received a copy of the GNU Lesser General Public License
36 : // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : //
38 : // ELPA reflects a substantial effort on the part of the original
39 : // ELPA consortium, and we ask you to respect the spirit of the
40 : // license that we chose: i.e., please contribute any changes you
41 : // may have back to the original ELPA library distribution, and keep
42 : // any derivatives of ELPA under the same license that we chose for
43 : // the original distribution, the GNU Lesser General Public License.
44 : //
45 : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
46 : // --------------------------------------------------------------------------------------------------
47 :
48 :
49 : #include "config-f90.h"
50 :
51 : #include <complex.h>
52 : #include <x86intrin.h>
53 : #include <stdio.h>
54 : #include <stdlib.h>
55 :
56 : #define __forceinline __attribute__((always_inline))
57 :
58 : #ifdef DOUBLE_PRECISION_COMPLEX
59 : #define __AVX512_DATATYPE __m512d
60 : #define _AVX512_LOAD _mm512_load_pd
61 : #define _AVX512_STORE _mm512_store_pd
62 : #define _AVX512_SET1 _mm512_set1_pd
63 : #define _AVX512_MUL _mm512_mul_pd
64 : #define _AVX512_ADD _mm512_add_pd
65 : #define _AVX512_SHUFFLE _mm512_shuffle_pd
66 : #define _AVX512_XOR _mm512_xor_pd
67 : #define _AVX512_XOR_EPI _mm512_xor_epi64
68 : #define _SHUFFLE 0x55
69 :
70 : #ifdef HAVE_AVX512
71 :
72 : #define __ELPA_USE_FMA__
73 : #define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
74 : #define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
75 :
76 : #endif
77 :
78 : #define _AVX512_FMADDSUB _mm512_FMADDSUB_pd
79 : #define _AVX512_FMSUBADD _mm512_FMSUBADD_pd
80 : #endif /* DOUBLE_PRECISION_COMPLEX */
81 :
82 : #ifdef SINGLE_PRECISION_COMPLEX
83 : #define __AVX512_DATATYPE __m512
84 : #define _AVX512_LOAD _mm512_load_ps
85 : #define _AVX512_STORE _mm512_store_ps
86 : #define _AVX512_SET1 _mm512_set1_ps
87 : #define _AVX512_MUL _mm512_mul_ps
88 : #define _AVX512_ADD _mm512_add_ps
89 : #define _AVX512_SHUFFLE _mm512_shuffle_ps
90 : #define _AVX512_XOR _mm512_xor_ps
91 : #define _AVX512_XOR_EPI _mm512_xor_epi32
92 : #define _SHUFFLE 0xb1
93 :
94 : #ifdef HAVE_AVX512
95 :
96 : #define __ELPA_USE_FMA__
97 : #define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
98 : #define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
99 :
100 : #endif
101 :
102 : #define _AVX512_FMADDSUB _mm512_FMADDSUB_ps
103 : #define _AVX512_FMSUBADD _mm512_FMSUBADD_ps
104 : #endif /* SINGLE_PRECISION_COMPLEX */
105 :
106 :
107 : //Forward declaration
108 : #ifdef DOUBLE_PRECISION_COMPLEX
109 : static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
110 : static __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
111 : static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
112 : static __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
113 : static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
114 : static __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
115 : #endif
116 :
117 : #ifdef SINGLE_PRECISION_COMPLEX
118 : static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
119 : static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
120 : static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
121 : static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
122 : static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
123 : static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
124 : #endif
125 :
126 :
127 : /*
128 : !f>#if defined(HAVE_AVX512)
129 : !f> interface
130 : !f> subroutine single_hh_trafo_complex_avx512_1hv_double(q, hh, pnb, pnq, pldq) &
131 : !f> bind(C, name="single_hh_trafo_complex_avx512_1hv_double")
132 : !f> use, intrinsic :: iso_c_binding
133 : !f> integer(kind=c_int) :: pnb, pnq, pldq
134 : !f> ! complex(kind=c_double_complex) :: q(*)
135 : !f> type(c_ptr), value :: q
136 : !f> complex(kind=c_double_complex) :: hh(pnb,2)
137 : !f> end subroutine
138 : !f> end interface
139 : !f>#endif
140 : */
141 : /*
142 : !f>#if defined(HAVE_AVX512)
143 : !f> interface
144 : !f> subroutine single_hh_trafo_complex_avx512_1hv_single(q, hh, pnb, pnq, pldq) &
145 : !f> bind(C, name="single_hh_trafo_complex_avx512_1hv_single")
146 : !f> use, intrinsic :: iso_c_binding
147 : !f> integer(kind=c_int) :: pnb, pnq, pldq
148 : !f> ! complex(kind=c_float_complex) :: q(*)
149 : !f> type(c_ptr), value :: q
150 : !f> complex(kind=c_float_complex) :: hh(pnb,2)
151 : !f> end subroutine
152 : !f> end interface
153 : !f>#endif
154 : */
155 :
156 : #ifdef DOUBLE_PRECISION_COMPLEX
157 70951936 : void single_hh_trafo_complex_avx512_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
158 : #endif
159 : #ifdef SINGLE_PRECISION_COMPLEX
160 24121856 : void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
161 : #endif
162 : {
163 : int i;
164 95073792 : int nb = *pnb;
165 95073792 : int nq = *pldq;
166 95073792 : int ldq = *pldq;
167 : int worked_on;
168 : //int ldh = *pldh;
169 :
170 95073792 : worked_on = 0;
171 :
172 : #ifdef DOUBLE_PRECISION_COMPLEX
173 210028544 : for (i = 0; i < nq-20; i+=24)
174 : {
175 139076608 : hh_trafo_complex_kernel_24_AVX512_1hv_double(&q[i], hh, nb, ldq);
176 139076608 : worked_on += 24;
177 : }
178 : #endif
179 :
180 : #ifdef SINGLE_PRECISION_COMPLEX
181 47536896 : for (i = 0; i < nq-40; i+=48)
182 : {
183 23415040 : hh_trafo_complex_kernel_48_AVX512_1hv_single(&q[i], hh, nb, ldq);
184 23415040 : worked_on += 48;
185 : }
186 : #endif
187 95073792 : if (nq == i)
188 : {
189 91539712 : return;
190 : }
191 :
192 : #ifdef DOUBLE_PRECISION_COMPLEX
193 2827264 : if (nq-i == 20)
194 : {
195 0 : hh_trafo_complex_kernel_20_AVX512_1hv_double(&q[i], hh, nb, ldq);
196 0 : worked_on += 20;
197 : }
198 : #endif
199 :
200 : #ifdef SINGLE_PRECISION_COMPLEX
201 706816 : if (nq-i == 40)
202 : {
203 706816 : hh_trafo_complex_kernel_40_AVX512_1hv_single(&q[i], hh, nb, ldq);
204 706816 : worked_on += 40;
205 : }
206 : #endif
207 :
208 : #ifdef DOUBLE_PRECISION_COMPLEX
209 2827264 : if (nq-i == 16)
210 : {
211 2827264 : hh_trafo_complex_kernel_16_AVX512_1hv_double(&q[i], hh, nb, ldq);
212 2827264 : worked_on += 16;
213 : }
214 : #endif
215 :
216 : #ifdef SINGLE_PRECISION_COMPLEX
217 706816 : if (nq-i == 32)
218 : {
219 0 : hh_trafo_complex_kernel_32_AVX512_1hv_single(&q[i], hh, nb, ldq);
220 0 : worked_on += 32;
221 : }
222 : #endif
223 :
224 : #ifdef DOUBLE_PRECISION_COMPLEX
225 2827264 : if (nq-i == 12)
226 : {
227 0 : hh_trafo_complex_kernel_12_AVX512_1hv_double(&q[i], hh, nb, ldq);
228 0 : worked_on += 12;
229 : }
230 : #endif
231 :
232 : #ifdef SINGLE_PRECISION_COMPLEX
233 706816 : if (nq-i == 24)
234 : {
235 0 : hh_trafo_complex_kernel_24_AVX512_1hv_single(&q[i], hh, nb, ldq);
236 0 : worked_on += 24;
237 : }
238 : #endif
239 :
240 : #ifdef DOUBLE_PRECISION_COMPLEX
241 2827264 : if (nq-i == 8)
242 : {
243 0 : hh_trafo_complex_kernel_8_AVX512_1hv_double(&q[i], hh, nb, ldq);
244 0 : worked_on += 8;
245 : }
246 : #endif
247 :
248 : #ifdef SINGLE_PRECISION_COMPLEX
249 706816 : if (nq-i == 16)
250 : {
251 0 : hh_trafo_complex_kernel_16_AVX512_1hv_single(&q[i], hh, nb, ldq);
252 0 : worked_on += 16;
253 : }
254 : #endif
255 :
256 : #ifdef DOUBLE_PRECISION_COMPLEX
257 2827264 : if (nq-i == 4)
258 : {
259 0 : hh_trafo_complex_kernel_4_AVX512_1hv_double(&q[i], hh, nb, ldq);
260 0 : worked_on += 4;
261 : }
262 : #endif
263 :
264 : #ifdef SINGLE_PRECISION_COMPLEX
265 706816 : if (nq-i == 8)
266 : {
267 0 : hh_trafo_complex_kernel_8_AVX512_1hv_single(&q[i], hh, nb, ldq);
268 0 : worked_on += 8;
269 : }
270 : #endif
271 : #ifdef WITH_DEBUG
272 : if (worked_on != nq)
273 : {
274 : printf("Error in complex AVX512 BLOCK 1 kernel \n");
275 : abort();
276 : }
277 : #endif
278 : }
279 :
280 : #ifdef DOUBLE_PRECISION_COMPLEX
281 : static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
282 : #endif
283 : #ifdef SINGLE_PRECISION_COMPLEX
284 : static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
285 : #endif
286 : {
287 :
288 : #ifdef DOUBLE_PRECISION_COMPLEX
289 139076608 : double* q_dbl = (double*)q;
290 139076608 : double* hh_dbl = (double*)hh;
291 : #endif
292 : #ifdef SINGLE_PRECISION_COMPLEX
293 23415040 : float* q_dbl = (float*)q;
294 23415040 : float* hh_dbl = (float*)hh;
295 : #endif
296 : __AVX512_DATATYPE x1, x2, x3, x4, x5, x6;
297 : __AVX512_DATATYPE q1, q2, q3, q4, q5, q6;
298 : __AVX512_DATATYPE h1_real, h1_imag;
299 : __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
300 162491648 : int i=0;
301 :
302 : #ifdef DOUBLE_PRECISION_COMPLEX
303 139076608 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
304 : #endif
305 : #ifdef SINGLE_PRECISION_COMPLEX
306 23415040 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
307 : #endif
308 :
309 : #ifdef DOUBLE_PRECISION_COMPLEX
310 : #define offset 8
311 : #endif
312 : #ifdef SINGLE_PRECISION_COMPLEX
313 : #define offset 16
314 : #endif
315 :
316 :
317 162491648 : x1 = _AVX512_LOAD(&q_dbl[0]); // complex 1, 2, 3, 4
318 324983296 : x2 = _AVX512_LOAD(&q_dbl[offset]); // complex 5, 6, 7, 8
319 324983296 : x3 = _AVX512_LOAD(&q_dbl[2*offset]); // complex 9, 10, 11, 12
320 324983296 : x4 = _AVX512_LOAD(&q_dbl[3*offset]); // complex 13, 14, 15, 16
321 324983296 : x5 = _AVX512_LOAD(&q_dbl[4*offset]); // complex 17, 18, 19, 20
322 324983296 : x6 = _AVX512_LOAD(&q_dbl[5*offset]); // complex 21, 22, 23, 24
323 :
324 5191745536 : for (i = 1; i < nb; i++)
325 : {
326 10058507776 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
327 10058507776 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
328 :
329 10058507776 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
330 10058507776 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
331 10058507776 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
332 10058507776 : q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
333 10058507776 : q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
334 10058507776 : q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
335 :
336 5029253888 : tmp1 = _AVX512_MUL(h1_imag, q1);
337 :
338 15087761664 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
339 :
340 5029253888 : tmp2 = _AVX512_MUL(h1_imag, q2);
341 :
342 15087761664 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
343 :
344 5029253888 : tmp3 = _AVX512_MUL(h1_imag, q3);
345 :
346 15087761664 : x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
347 :
348 5029253888 : tmp4 = _AVX512_MUL(h1_imag, q4);
349 :
350 15087761664 : x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
351 :
352 5029253888 : tmp5 = _AVX512_MUL(h1_imag, q5);
353 :
354 15087761664 : x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
355 :
356 5029253888 : tmp6 = _AVX512_MUL(h1_imag, q6);
357 :
358 15087761664 : x6 = _AVX512_ADD(x6, _AVX512_FMSUBADD(h1_real, q6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
359 : }
360 :
361 324983296 : h1_real = _AVX512_SET1(hh_dbl[0]);
362 324983296 : h1_imag = _AVX512_SET1(hh_dbl[1]);
363 :
364 : #ifdef DOUBLE_PRECISION_COMPLEX
365 278153216 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
366 278153216 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
367 : #endif
368 : #ifdef SINGLE_PRECISION_COMPLEX
369 46830080 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
370 46830080 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
371 : #endif
372 :
373 162491648 : tmp1 = _AVX512_MUL(h1_imag, x1);
374 :
375 324983296 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
376 :
377 162491648 : tmp2 = _AVX512_MUL(h1_imag, x2);
378 :
379 324983296 : x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
380 :
381 162491648 : tmp3 = _AVX512_MUL(h1_imag, x3);
382 :
383 324983296 : x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
384 :
385 162491648 : tmp4 = _AVX512_MUL(h1_imag, x4);
386 :
387 324983296 : x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
388 :
389 162491648 : tmp5 = _AVX512_MUL(h1_imag, x5);
390 :
391 324983296 : x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE));
392 :
393 162491648 : tmp6 = _AVX512_MUL(h1_imag, x6);
394 :
395 324983296 : x6 = _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE));
396 :
397 162491648 : q1 = _AVX512_LOAD(&q_dbl[0]);
398 324983296 : q2 = _AVX512_LOAD(&q_dbl[offset]);
399 324983296 : q3 = _AVX512_LOAD(&q_dbl[2*offset]);
400 324983296 : q4 = _AVX512_LOAD(&q_dbl[3*offset]);
401 324983296 : q5 = _AVX512_LOAD(&q_dbl[4*offset]);
402 324983296 : q6 = _AVX512_LOAD(&q_dbl[5*offset]);
403 :
404 162491648 : q1 = _AVX512_ADD(q1, x1);
405 162491648 : q2 = _AVX512_ADD(q2, x2);
406 162491648 : q3 = _AVX512_ADD(q3, x3);
407 162491648 : q4 = _AVX512_ADD(q4, x4);
408 162491648 : q5 = _AVX512_ADD(q5, x5);
409 162491648 : q6 = _AVX512_ADD(q6, x6);
410 :
411 : _AVX512_STORE(&q_dbl[0], q1);
412 162491648 : _AVX512_STORE(&q_dbl[offset], q2);
413 162491648 : _AVX512_STORE(&q_dbl[2*offset], q3);
414 162491648 : _AVX512_STORE(&q_dbl[3*offset], q4);
415 162491648 : _AVX512_STORE(&q_dbl[4*offset], q5);
416 162491648 : _AVX512_STORE(&q_dbl[5*offset], q6);
417 :
418 5191745536 : for (i = 1; i < nb; i++)
419 : {
420 10058507776 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
421 10058507776 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
422 :
423 10058507776 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
424 10058507776 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
425 10058507776 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
426 10058507776 : q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
427 10058507776 : q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
428 10058507776 : q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
429 :
430 5029253888 : tmp1 = _AVX512_MUL(h1_imag, x1);
431 :
432 15087761664 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
433 :
434 5029253888 : tmp2 = _AVX512_MUL(h1_imag, x2);
435 :
436 15087761664 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
437 :
438 5029253888 : tmp3 = _AVX512_MUL(h1_imag, x3);
439 :
440 15087761664 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
441 :
442 5029253888 : tmp4 = _AVX512_MUL(h1_imag, x4);
443 :
444 15087761664 : q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
445 :
446 5029253888 : tmp5 = _AVX512_MUL(h1_imag, x5);
447 :
448 15087761664 : q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
449 :
450 5029253888 : tmp6 = _AVX512_MUL(h1_imag, x6);
451 :
452 15087761664 : q6 = _AVX512_ADD(q6, _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
453 :
454 5029253888 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
455 5029253888 : _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
456 5029253888 : _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
457 5029253888 : _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
458 5029253888 : _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
459 5029253888 : _AVX512_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
460 : }
461 : }
462 :
463 : #ifdef DOUBLE_PRECISION_COMPLEX
464 : static __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
465 : #endif
466 : #ifdef SINGLE_PRECISION_COMPLEX
467 : static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
468 : #endif
469 : {
470 :
471 : #ifdef DOUBLE_PRECISION_COMPLEX
472 0 : double* q_dbl = (double*)q;
473 0 : double* hh_dbl = (double*)hh;
474 : #endif
475 : #ifdef SINGLE_PRECISION_COMPLEX
476 706816 : float* q_dbl = (float*)q;
477 706816 : float* hh_dbl = (float*)hh;
478 : #endif
479 : __AVX512_DATATYPE x1, x2, x3, x4, x5, x6;
480 : __AVX512_DATATYPE q1, q2, q3, q4, q5, q6;
481 : __AVX512_DATATYPE h1_real, h1_imag;
482 : __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
483 706816 : int i=0;
484 :
485 : #ifdef DOUBLE_PRECISION_COMPLEX
486 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
487 : #endif
488 : #ifdef SINGLE_PRECISION_COMPLEX
489 706816 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
490 : #endif
491 :
492 : #ifdef DOUBLE_PRECISION_COMPLEX
493 : #define offset 8
494 : #endif
495 : #ifdef SINGLE_PRECISION_COMPLEX
496 : #define offset 16
497 : #endif
498 :
499 :
500 706816 : x1 = _AVX512_LOAD(&q_dbl[0]); // complex 1, 2, 3, 4
501 1413632 : x2 = _AVX512_LOAD(&q_dbl[offset]); // complex 5, 6, 7, 8
502 1413632 : x3 = _AVX512_LOAD(&q_dbl[2*offset]); // complex 9, 10, 11, 12
503 1413632 : x4 = _AVX512_LOAD(&q_dbl[3*offset]); // complex 13, 14, 15, 16
504 1413632 : x5 = _AVX512_LOAD(&q_dbl[4*offset]); // complex 17, 18, 19, 20
505 :
506 21020672 : for (i = 1; i < nb; i++)
507 : {
508 40627712 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
509 40627712 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
510 :
511 40627712 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
512 40627712 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
513 40627712 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
514 40627712 : q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
515 40627712 : q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
516 :
517 20313856 : tmp1 = _AVX512_MUL(h1_imag, q1);
518 :
519 60941568 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
520 :
521 20313856 : tmp2 = _AVX512_MUL(h1_imag, q2);
522 :
523 60941568 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
524 :
525 20313856 : tmp3 = _AVX512_MUL(h1_imag, q3);
526 :
527 60941568 : x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
528 :
529 20313856 : tmp4 = _AVX512_MUL(h1_imag, q4);
530 :
531 60941568 : x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
532 :
533 20313856 : tmp5 = _AVX512_MUL(h1_imag, q5);
534 :
535 60941568 : x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
536 :
537 : }
538 :
539 1413632 : h1_real = _AVX512_SET1(hh_dbl[0]);
540 1413632 : h1_imag = _AVX512_SET1(hh_dbl[1]);
541 :
542 : #ifdef DOUBLE_PRECISION_COMPLEX
543 0 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
544 0 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
545 : #endif
546 : #ifdef SINGLE_PRECISION_COMPLEX
547 1413632 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
548 1413632 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
549 : #endif
550 :
551 706816 : tmp1 = _AVX512_MUL(h1_imag, x1);
552 :
553 1413632 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
554 :
555 706816 : tmp2 = _AVX512_MUL(h1_imag, x2);
556 :
557 1413632 : x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
558 :
559 706816 : tmp3 = _AVX512_MUL(h1_imag, x3);
560 :
561 1413632 : x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
562 :
563 706816 : tmp4 = _AVX512_MUL(h1_imag, x4);
564 :
565 1413632 : x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
566 :
567 706816 : tmp5 = _AVX512_MUL(h1_imag, x5);
568 :
569 1413632 : x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE));
570 :
571 706816 : q1 = _AVX512_LOAD(&q_dbl[0]);
572 1413632 : q2 = _AVX512_LOAD(&q_dbl[offset]);
573 1413632 : q3 = _AVX512_LOAD(&q_dbl[2*offset]);
574 1413632 : q4 = _AVX512_LOAD(&q_dbl[3*offset]);
575 1413632 : q5 = _AVX512_LOAD(&q_dbl[4*offset]);
576 :
577 706816 : q1 = _AVX512_ADD(q1, x1);
578 706816 : q2 = _AVX512_ADD(q2, x2);
579 706816 : q3 = _AVX512_ADD(q3, x3);
580 706816 : q4 = _AVX512_ADD(q4, x4);
581 706816 : q5 = _AVX512_ADD(q5, x5);
582 :
583 : _AVX512_STORE(&q_dbl[0], q1);
584 706816 : _AVX512_STORE(&q_dbl[offset], q2);
585 706816 : _AVX512_STORE(&q_dbl[2*offset], q3);
586 706816 : _AVX512_STORE(&q_dbl[3*offset], q4);
587 706816 : _AVX512_STORE(&q_dbl[4*offset], q5);
588 :
589 21020672 : for (i = 1; i < nb; i++)
590 : {
591 40627712 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
592 40627712 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
593 :
594 40627712 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
595 40627712 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
596 40627712 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
597 40627712 : q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
598 40627712 : q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
599 :
600 20313856 : tmp1 = _AVX512_MUL(h1_imag, x1);
601 :
602 60941568 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
603 :
604 20313856 : tmp2 = _AVX512_MUL(h1_imag, x2);
605 :
606 60941568 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
607 :
608 20313856 : tmp3 = _AVX512_MUL(h1_imag, x3);
609 :
610 60941568 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
611 :
612 20313856 : tmp4 = _AVX512_MUL(h1_imag, x4);
613 :
614 60941568 : q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
615 :
616 20313856 : tmp5 = _AVX512_MUL(h1_imag, x5);
617 :
618 60941568 : q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
619 :
620 20313856 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
621 20313856 : _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
622 20313856 : _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
623 20313856 : _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
624 20313856 : _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
625 : }
626 : }
627 :
628 :
629 : #ifdef DOUBLE_PRECISION_COMPLEX
630 : static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
631 : #endif
632 : #ifdef SINGLE_PRECISION_COMPLEX
633 : static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
634 : #endif
635 : {
636 :
637 : #ifdef DOUBLE_PRECISION_COMPLEX
638 2827264 : double* q_dbl = (double*)q;
639 2827264 : double* hh_dbl = (double*)hh;
640 : #endif
641 : #ifdef SINGLE_PRECISION_COMPLEX
642 0 : float* q_dbl = (float*)q;
643 0 : float* hh_dbl = (float*)hh;
644 : #endif
645 :
646 : __AVX512_DATATYPE x1, x2, x3, x4;
647 : __AVX512_DATATYPE q1, q2, q3, q4;
648 : __AVX512_DATATYPE h1_real, h1_imag;
649 : __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
650 2827264 : int i=0;
651 :
652 : #ifdef DOUBLE_PRECISION_COMPLEX
653 2827264 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
654 : #endif
655 : #ifdef SINGLE_PRECISION_COMPLEX
656 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
657 : #endif
658 :
659 2827264 : x1 = _AVX512_LOAD(&q_dbl[0]); // complex 1 2 3 4
660 5654528 : x2 = _AVX512_LOAD(&q_dbl[offset]);
661 5654528 : x3 = _AVX512_LOAD(&q_dbl[2*offset]);
662 5654528 : x4 = _AVX512_LOAD(&q_dbl[3*offset]); // comlex 13 14 15 16
663 :
664 84082688 : for (i = 1; i < nb; i++)
665 : {
666 162510848 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
667 162510848 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
668 :
669 162510848 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
670 162510848 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
671 162510848 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
672 162510848 : q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
673 :
674 81255424 : tmp1 = _AVX512_MUL(h1_imag, q1);
675 :
676 243766272 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
677 :
678 81255424 : tmp2 = _AVX512_MUL(h1_imag, q2);
679 :
680 243766272 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
681 :
682 81255424 : tmp3 = _AVX512_MUL(h1_imag, q3);
683 :
684 243766272 : x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
685 :
686 81255424 : tmp4 = _AVX512_MUL(h1_imag, q4);
687 :
688 243766272 : x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
689 : }
690 :
691 5654528 : h1_real = _AVX512_SET1(hh_dbl[0]);
692 5654528 : h1_imag = _AVX512_SET1(hh_dbl[1]);
693 :
694 : #ifdef DOUBLE_PRECISION_COMPLEX
695 5654528 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
696 5654528 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
697 : #endif
698 : #ifdef SINGLE_PRECISION_COMPLEX
699 0 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
700 0 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
701 : #endif
702 :
703 2827264 : tmp1 = _AVX512_MUL(h1_imag, x1);
704 :
705 5654528 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
706 :
707 2827264 : tmp2 = _AVX512_MUL(h1_imag, x2);
708 :
709 5654528 : x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
710 :
711 2827264 : tmp3 = _AVX512_MUL(h1_imag, x3);
712 :
713 5654528 : x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
714 :
715 2827264 : tmp4 = _AVX512_MUL(h1_imag, x4);
716 :
717 5654528 : x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
718 :
719 2827264 : q1 = _AVX512_LOAD(&q_dbl[0]);
720 5654528 : q2 = _AVX512_LOAD(&q_dbl[offset]);
721 5654528 : q3 = _AVX512_LOAD(&q_dbl[2*offset]);
722 5654528 : q4 = _AVX512_LOAD(&q_dbl[3*offset]);
723 :
724 2827264 : q1 = _AVX512_ADD(q1, x1);
725 2827264 : q2 = _AVX512_ADD(q2, x2);
726 2827264 : q3 = _AVX512_ADD(q3, x3);
727 2827264 : q4 = _AVX512_ADD(q4, x4);
728 :
729 : _AVX512_STORE(&q_dbl[0], q1);
730 2827264 : _AVX512_STORE(&q_dbl[offset], q2);
731 2827264 : _AVX512_STORE(&q_dbl[2*offset], q3);
732 2827264 : _AVX512_STORE(&q_dbl[3*offset], q4);
733 :
734 84082688 : for (i = 1; i < nb; i++)
735 : {
736 162510848 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
737 162510848 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
738 :
739 162510848 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
740 162510848 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
741 162510848 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
742 162510848 : q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
743 :
744 81255424 : tmp1 = _AVX512_MUL(h1_imag, x1);
745 :
746 243766272 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
747 :
748 81255424 : tmp2 = _AVX512_MUL(h1_imag, x2);
749 :
750 243766272 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
751 :
752 81255424 : tmp3 = _AVX512_MUL(h1_imag, x3);
753 :
754 243766272 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
755 :
756 81255424 : tmp4 = _AVX512_MUL(h1_imag, x4);
757 :
758 243766272 : q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
759 :
760 81255424 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
761 81255424 : _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
762 81255424 : _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
763 81255424 : _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
764 : }
765 : }
766 :
767 : #ifdef DOUBLE_PRECISION_COMPLEX
768 : static __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
769 : #endif
770 : #ifdef SINGLE_PRECISION_COMPLEX
771 : static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
772 : #endif
773 : {
774 :
775 : #ifdef DOUBLE_PRECISION_COMPLEX
776 0 : double* q_dbl = (double*)q;
777 0 : double* hh_dbl = (double*)hh;
778 : #endif
779 : #ifdef SINGLE_PRECISION_COMPLEX
780 0 : float* q_dbl = (float*)q;
781 0 : float* hh_dbl = (float*)hh;
782 : #endif
783 :
784 : __AVX512_DATATYPE x1, x2, x3, x4;
785 : __AVX512_DATATYPE q1, q2, q3, q4;
786 : __AVX512_DATATYPE h1_real, h1_imag;
787 : __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
788 0 : int i=0;
789 :
790 : #ifdef DOUBLE_PRECISION_COMPLEX
791 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
792 : #endif
793 : #ifdef SINGLE_PRECISION_COMPLEX
794 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
795 : #endif
796 :
797 0 : x1 = _AVX512_LOAD(&q_dbl[0]); // complex 1 2 3 4
798 0 : x2 = _AVX512_LOAD(&q_dbl[offset]);
799 0 : x3 = _AVX512_LOAD(&q_dbl[2*offset]);
800 :
801 0 : for (i = 1; i < nb; i++)
802 : {
803 0 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
804 0 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
805 :
806 0 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
807 0 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
808 0 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
809 :
810 0 : tmp1 = _AVX512_MUL(h1_imag, q1);
811 :
812 0 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
813 :
814 0 : tmp2 = _AVX512_MUL(h1_imag, q2);
815 :
816 0 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
817 :
818 0 : tmp3 = _AVX512_MUL(h1_imag, q3);
819 :
820 0 : x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
821 :
822 : }
823 :
824 0 : h1_real = _AVX512_SET1(hh_dbl[0]);
825 0 : h1_imag = _AVX512_SET1(hh_dbl[1]);
826 :
827 : #ifdef DOUBLE_PRECISION_COMPLEX
828 0 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
829 0 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
830 : #endif
831 : #ifdef SINGLE_PRECISION_COMPLEX
832 0 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
833 0 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
834 : #endif
835 :
836 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
837 :
838 0 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
839 :
840 0 : tmp2 = _AVX512_MUL(h1_imag, x2);
841 :
842 0 : x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
843 :
844 0 : tmp3 = _AVX512_MUL(h1_imag, x3);
845 :
846 0 : x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
847 :
848 0 : q1 = _AVX512_LOAD(&q_dbl[0]);
849 0 : q2 = _AVX512_LOAD(&q_dbl[offset]);
850 0 : q3 = _AVX512_LOAD(&q_dbl[2*offset]);
851 :
852 0 : q1 = _AVX512_ADD(q1, x1);
853 0 : q2 = _AVX512_ADD(q2, x2);
854 0 : q3 = _AVX512_ADD(q3, x3);
855 :
856 : _AVX512_STORE(&q_dbl[0], q1);
857 0 : _AVX512_STORE(&q_dbl[offset], q2);
858 0 : _AVX512_STORE(&q_dbl[2*offset], q3);
859 :
860 0 : for (i = 1; i < nb; i++)
861 : {
862 0 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
863 0 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
864 :
865 0 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
866 0 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
867 0 : q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
868 :
869 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
870 :
871 0 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
872 :
873 0 : tmp2 = _AVX512_MUL(h1_imag, x2);
874 :
875 0 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
876 :
877 0 : tmp3 = _AVX512_MUL(h1_imag, x3);
878 :
879 0 : q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
880 :
881 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
882 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
883 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
884 : }
885 : }
886 :
887 :
888 : #ifdef DOUBLE_PRECISION_COMPLEX
889 : static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
890 : #endif
891 : #ifdef SINGLE_PRECISION_COMPLEX
892 : static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
893 : #endif
894 : {
895 :
896 : #ifdef DOUBLE_PRECISION_COMPLEX
897 0 : double* q_dbl = (double*)q;
898 0 : double* hh_dbl = (double*)hh;
899 : #endif
900 : #ifdef SINGLE_PRECISION_COMPLEX
901 0 : float* q_dbl = (float*)q;
902 0 : float* hh_dbl = (float*)hh;
903 : #endif
904 : __AVX512_DATATYPE x1, x2;
905 : __AVX512_DATATYPE q1, q2;
906 : __AVX512_DATATYPE h1_real, h1_imag;
907 : __AVX512_DATATYPE tmp1, tmp2;
908 0 : int i=0;
909 :
910 : #ifdef DOUBLE_PRECISION_COMPLEX
911 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
912 : #endif
913 : #ifdef SINGLE_PRECISION_COMPLEX
914 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
915 : #endif
916 :
917 0 : x1 = _AVX512_LOAD(&q_dbl[0]);
918 0 : x2 = _AVX512_LOAD(&q_dbl[offset]);
919 :
920 0 : for (i = 1; i < nb; i++)
921 : {
922 0 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
923 0 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
924 :
925 0 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
926 0 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
927 :
928 0 : tmp1 = _AVX512_MUL(h1_imag, q1);
929 0 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
930 0 : tmp2 = _AVX512_MUL(h1_imag, q2);
931 0 : x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
932 : }
933 :
934 0 : h1_real = _AVX512_SET1(hh_dbl[0]);
935 0 : h1_imag = _AVX512_SET1(hh_dbl[1]);
936 :
937 : #ifdef DOUBLE_PRECISION_COMPLEX
938 0 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
939 0 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
940 : #endif
941 : #ifdef SINGLE_PRECISION_COMPLEX
942 0 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
943 0 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
944 : #endif
945 :
946 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
947 0 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
948 :
949 0 : tmp2 = _AVX512_MUL(h1_imag, x2);
950 :
951 0 : x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
952 :
953 0 : q1 = _AVX512_LOAD(&q_dbl[0]);
954 0 : q2 = _AVX512_LOAD(&q_dbl[offset]);
955 :
956 0 : q1 = _AVX512_ADD(q1, x1);
957 0 : q2 = _AVX512_ADD(q2, x2);
958 :
959 : _AVX512_STORE(&q_dbl[0], q1);
960 0 : _AVX512_STORE(&q_dbl[offset], q2);
961 :
962 0 : for (i = 1; i < nb; i++)
963 : {
964 0 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
965 0 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
966 :
967 0 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
968 0 : q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
969 :
970 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
971 0 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
972 :
973 0 : tmp2 = _AVX512_MUL(h1_imag, x2);
974 :
975 0 : q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
976 :
977 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
978 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
979 : }
980 : }
981 :
982 :
983 : #ifdef DOUBLE_PRECISION_COMPLEX
984 : static __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
985 : #endif
986 : #ifdef SINGLE_PRECISION_COMPLEX
987 : static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
988 : #endif
989 : {
990 :
991 : #ifdef DOUBLE_PRECISION_COMPLEX
992 0 : double* q_dbl = (double*)q;
993 0 : double* hh_dbl = (double*)hh;
994 : #endif
995 : #ifdef SINGLE_PRECISION_COMPLEX
996 0 : float* q_dbl = (float*)q;
997 0 : float* hh_dbl = (float*)hh;
998 : #endif
999 : __AVX512_DATATYPE x1, x2;
1000 : __AVX512_DATATYPE q1, q2;
1001 : __AVX512_DATATYPE h1_real, h1_imag;
1002 : __AVX512_DATATYPE tmp1, tmp2;
1003 0 : int i=0;
1004 :
1005 : #ifdef DOUBLE_PRECISION_COMPLEX
1006 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
1007 : #endif
1008 : #ifdef SINGLE_PRECISION_COMPLEX
1009 0 : __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
1010 : #endif
1011 :
1012 0 : x1 = _AVX512_LOAD(&q_dbl[0]);
1013 :
1014 0 : for (i = 1; i < nb; i++)
1015 : {
1016 0 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
1017 0 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
1018 :
1019 0 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
1020 :
1021 0 : tmp1 = _AVX512_MUL(h1_imag, q1);
1022 0 : x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1023 : }
1024 :
1025 0 : h1_real = _AVX512_SET1(hh_dbl[0]);
1026 0 : h1_imag = _AVX512_SET1(hh_dbl[1]);
1027 :
1028 : #ifdef DOUBLE_PRECISION_COMPLEX
1029 0 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
1030 0 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
1031 : #endif
1032 : #ifdef SINGLE_PRECISION_COMPLEX
1033 0 : h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
1034 0 : h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
1035 : #endif
1036 :
1037 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
1038 0 : x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1039 :
1040 0 : q1 = _AVX512_LOAD(&q_dbl[0]);
1041 :
1042 0 : q1 = _AVX512_ADD(q1, x1);
1043 :
1044 : _AVX512_STORE(&q_dbl[0], q1);
1045 :
1046 0 : for (i = 1; i < nb; i++)
1047 : {
1048 0 : h1_real = _AVX512_SET1(hh_dbl[i*2]);
1049 0 : h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
1050 :
1051 0 : q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
1052 :
1053 0 : tmp1 = _AVX512_MUL(h1_imag, x1);
1054 0 : q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1055 :
1056 0 : _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
1057 : }
1058 : }
1059 :
|