Line data Source code
1 : // This file is part of ELPA.
2 : //
3 : // The ELPA library was originally created by the ELPA consortium,
4 : // consisting of the following organizations:
5 : //
6 : // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : // Informatik,
10 : // - Technische Universität München, Lehrstuhl für Informatik mit
11 : // Schwerpunkt Wissenschaftliches Rechnen ,
12 : // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : // - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 : // Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : // and
16 : // - IBM Deutschland GmbH
17 : //
18 : // This particular source code file contains additions, changes and
19 : // enhancements authored by Intel Corporation which is not part of
20 : // the ELPA consortium.
21 : //
22 : // More information can be found here:
23 : // http://elpa.mpcdf.mpg.de/
24 : //
25 : // ELPA is free software: you can redistribute it and/or modify
26 : // it under the terms of the version 3 of the license of the
27 : // GNU Lesser General Public License as published by the Free
28 : // Software Foundation.
29 : //
30 : // ELPA is distributed in the hope that it will be useful,
31 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : // GNU Lesser General Public License for more details.
34 : //
35 : // You should have received a copy of the GNU Lesser General Public License
36 : // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : //
38 : // ELPA reflects a substantial effort on the part of the original
39 : // ELPA consortium, and we ask you to respect the spirit of the
40 : // license that we chose: i.e., please contribute any changes you
41 : // may have back to the original ELPA library distribution, and keep
42 : // any derivatives of ELPA under the same license that we chose for
43 : // the original distribution, the GNU Lesser General Public License.
44 : //
45 : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
46 : // --------------------------------------------------------------------------------------------------
47 :
48 : #include "config-f90.h"
49 :
50 : #include <x86intrin.h>
51 : #include <stdio.h>
52 : #include <stdlib.h>
53 :
54 : #define __forceinline __attribute__((always_inline)) static
55 :
56 : #ifdef DOUBLE_PRECISION_REAL
57 : #define offset 8
58 :
59 : #define __AVX512_DATATYPE __m512d
60 : #define __AVX512i __m512i
61 : #define _AVX512_LOAD _mm512_load_pd
62 : #define _AVX512_STORE _mm512_store_pd
63 : #define _AVX512_SET1 _mm512_set1_pd
64 : #define _AVX512_ADD _mm512_add_pd
65 : #define _AVX512_MUL _mm512_mul_pd
66 :
67 : #ifdef HAVE_AVX512
68 : #define __ELPA_USE_FMA__
69 : #define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
70 : #endif
71 :
72 : #define _AVX512_FMA _mm512_FMA_pd
73 : #endif /* DOUBLE_PRECISION_REAL */
74 :
75 : #ifdef SINGLE_PRECISION_REAL
76 : #define offset 16
77 :
78 : #define __AVX512_DATATYPE __m512
79 : #define __AVX512i __m512i
80 : #define _AVX512_LOAD _mm512_load_ps
81 : #define _AVX512_STORE _mm512_store_ps
82 : #define _AVX512_SET1 _mm512_set1_ps
83 : #define _AVX512_ADD _mm512_add_ps
84 : #define _AVX512_MUL _mm512_mul_ps
85 :
86 : #ifdef HAVE_AVX512
87 : #define __ELPA_USE_FMA__
88 : #define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
89 : #endif
90 :
91 : #define _AVX512_FMA _mm512_FMA_ps
92 : #endif /* SINGLE_PRECISION_REAL */
93 :
94 : #ifdef DOUBLE_PRECISION_REAL
95 : //Forward declaration
96 : __forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
97 : __forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
98 : __forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
99 : __forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
100 :
101 : void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
102 : #endif
103 : #ifdef SINGLE_PRECISION_REAL
104 : //Forward declaration
105 : __forceinline void hh_trafo_kernel_16_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
106 : __forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
107 : __forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
108 : __forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
109 :
110 : void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
111 : #endif
112 :
113 : #ifdef DOUBLE_PRECISION_REAL
114 : /*
115 : !f>#if defined(HAVE_AVX512)
116 : !f> interface
117 : !f> subroutine double_hh_trafo_real_avx512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
118 : !f> bind(C, name="double_hh_trafo_real_avx512_2hv_double")
119 : !f> use, intrinsic :: iso_c_binding
120 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
121 : !f> type(c_ptr), value :: q
122 : !f> real(kind=c_double) :: hh(pnb,6)
123 : !f> end subroutine
124 : !f> end interface
125 : !f>#endif
126 : */
127 : #endif
128 : #ifdef SINGLE_PRECISION_REAL
129 : /*
130 : !f>#if defined(HAVE_AVX512)
131 : !f> interface
132 : !f> subroutine double_hh_trafo_real_avx512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
133 : !f> bind(C, name="double_hh_trafo_real_avx512_2hv_single")
134 : !f> use, intrinsic :: iso_c_binding
135 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
136 : !f> type(c_ptr), value :: q
137 : !f> real(kind=c_float) :: hh(pnb,6)
138 : !f> end subroutine
139 : !f> end interface
140 : !f>#endif
141 : */
142 : #endif
143 :
#ifdef DOUBLE_PRECISION_REAL
/*
 * double_hh_trafo_real_avx512_2hv_double
 * --------------------------------------
 * Driver for the AVX-512 "block 2" real kernel (double precision).
 * Applies two Householder reflectors simultaneously to the rows of Q,
 * sweeping the bulk with the widest fixed-width kernel (32 rows) and
 * finishing the remainder with progressively narrower kernels (24/16/8).
 *
 * q    : matrix block, column-major with leading dimension *pldq
 * hh   : the two Householder vectors, stored with leading dimension *pldh
 *        (first vector at hh[0..nb-1], second at hh[ldh..ldh+nb-1])
 * pnb  : length of each Householder vector
 * pnq  : unused here -- see NOTE on nq below
 * pldq : leading dimension of q; also taken as the number of rows to update
 * pldh : leading dimension of hh
 */
void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
/*
 * Single-precision variant of the same driver: identical structure, but an
 * AVX-512 register holds 16 floats instead of 8 doubles, so every kernel
 * width is doubled (64/48/32/16 rows).
 */
void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
  int i;
  int nb = *pnb;
  // NOTE(review): nq is read from *pldq, not *pnq, and pnq is never used.
  // Presumably every caller guarantees nq == ldq; confirm against the
  // Fortran callers before "fixing" this.
  int nq = *pldq;
  int ldq = *pldq;
  int ldh = *pldh;
  int worked_on;   // rows actually processed; cross-checked in debug builds

  worked_on = 0;

  // calculating scalar product to compute
  // 2 householder vectors simultaneously
  // (s couples the two reflectors in the rank-2 update; the leading term
  // hh[ldh+1] enters with coefficient 1 -- presumably because the first
  // element of the first reflector is an implicit 1; TODO confirm against
  // the kernel derivation)
#ifdef DOUBLE_PRECISION_REAL
  double s = hh[(ldh)+1]*1.0;
#endif
#ifdef SINGLE_PRECISION_REAL
  float s = hh[(ldh)+1]*1.0f;
#endif
  #pragma ivdep
  for (i = 2; i < nb; i++)
  {
    s += hh[i-1] * hh[(i+ldh)];
  }

  // Production level kernel calls with padding:
  // bulk sweep with the widest kernel first ...
#ifdef DOUBLE_PRECISION_REAL
  for (i = 0; i < nq-24; i+=32)
  {
    hh_trafo_kernel_32_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 32;
  }
#endif
#ifdef SINGLE_PRECISION_REAL
  for (i = 0; i < nq-48; i+=64)
  {
    hh_trafo_kernel_64_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 64;
  }
#endif
  // ... then handle the remainder (at most one of the narrower kernels fires).
  if (nq == i)
  {
    return;
  }
#ifdef DOUBLE_PRECISION_REAL
  if (nq-i == 24)
  {
    hh_trafo_kernel_24_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 24;
  }
#endif
#ifdef SINGLE_PRECISION_REAL
  if (nq-i == 48)
  {
    hh_trafo_kernel_48_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 48;
  }
#endif
#ifdef DOUBLE_PRECISION_REAL
  if (nq-i == 16)
  {
    hh_trafo_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 16;
  }
#endif
#ifdef SINGLE_PRECISION_REAL
  if (nq-i == 32)
  {
    hh_trafo_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 32;
  }
#endif
#ifdef DOUBLE_PRECISION_REAL
  if (nq-i == 8)
  {
    hh_trafo_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 8;
  }
#endif
#ifdef SINGLE_PRECISION_REAL
  if (nq-i == 16)
  {
    hh_trafo_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 16;
  }
#endif

#ifdef WITH_DEBUG
  // Sanity check: every row must have been handled by exactly one kernel call.
  if (worked_on != nq)
  {
    printf("Error in AVX512 real BLOCK 2 kernel \n");
    abort();
  }
#endif
}
244 :
245 : /**
246 : * Unrolled kernel that computes
247 : #ifdef DOUBLE_PRECISION_REAL
248 : * 32 rows of Q simultaneously, a
249 : #endif
250 : #ifdef SINGLE_PRECISION_REAL
251 : * 64 rows of Q simultaneously, a
252 : #endif
253 :
254 : * matrix Vector product with two householder
255 : * vectors + a rank 2 update is performed
256 : */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
  /////////////////////////////////////////////////////
  // Matrix Vector Multiplication, Q [32 x nb+1] * hh
  // (64 x nb+1 rows in the single-precision variant)
  // hh contains two householder vectors, with offset 1
  /////////////////////////////////////////////////////
  // Four SIMD lanes (suffixes 1..4), each `offset` elements apart, cover
  // 4*8 doubles or 4*16 floats per column.
  int i;
  // Needed bit mask for floating point sign flip (xor with this negates tau)
#ifdef DOUBLE_PRECISION_REAL
  __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
  __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

  // x1..x4 accumulate the dot products of the Q rows with the first
  // Householder vector; the plain load of column 1 corresponds to its
  // implicit leading 1.
  __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
  __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
  __AVX512_DATATYPE x3 = _AVX512_LOAD(&q[ldq+2*offset]);
  __AVX512_DATATYPE x4 = _AVX512_LOAD(&q[ldq+3*offset]);

  // y1..y4 accumulate the corresponding dot products with the second
  // Householder vector, seeded from column 0.
  __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
  __AVX512_DATATYPE h2;

  __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
  __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
  __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
  __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
  __AVX512_DATATYPE q3 = _AVX512_LOAD(&q[2*offset]);
  __AVX512_DATATYPE y3 = _AVX512_FMA(x3, h1, q3);
  __AVX512_DATATYPE q4 = _AVX512_LOAD(&q[3*offset]);
  __AVX512_DATATYPE y4 = _AVX512_FMA(x4, h1, q4);

  // Interior columns contribute to both accumulators.
  for(i = 2; i < nb; i++)
  {
    h1 = _AVX512_SET1(hh[i-1]);
    h2 = _AVX512_SET1(hh[ldh+i]);

    q1 = _AVX512_LOAD(&q[i*ldq]);
    x1 = _AVX512_FMA(q1, h1, x1);
    y1 = _AVX512_FMA(q1, h2, y1);
    q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX512_FMA(q2, h1, x2);
    y2 = _AVX512_FMA(q2, h2, y2);
    q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX512_FMA(q3, h1, x3);
    y3 = _AVX512_FMA(q3, h2, y3);
    q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
    x4 = _AVX512_FMA(q4, h1, x4);
    y4 = _AVX512_FMA(q4, h2, y4);

  }

  // Last column (index nb) only touches the first reflector.
  h1 = _AVX512_SET1(hh[nb-1]);

  q1 = _AVX512_LOAD(&q[nb*ldq]);
  x1 = _AVX512_FMA(q1, h1, x1);
  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX512_FMA(q2, h1, x2);
  q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX512_FMA(q3, h1, x3);
  q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
  x4 = _AVX512_FMA(q4, h1, x4);


  /////////////////////////////////////////////////////
  // Rank-2 update of Q [32 x nb+1]
  /////////////////////////////////////////////////////

  __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
  __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
  __AVX512_DATATYPE vs = _AVX512_SET1(s);

  // x = -tau1 * x  (sign flipped by xor-ing the IEEE sign bit of tau1)
#ifdef DOUBLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
  x1 = _AVX512_MUL(x1, h1);
  x2 = _AVX512_MUL(x2, h1);
  x3 = _AVX512_MUL(x3, h1);
  x4 = _AVX512_MUL(x4, h1);

  // check for xor_pd on skylake
  // y = -tau2 * y + (-tau2 * s) * x
#ifdef DOUBLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
  h2 = _AVX512_MUL(h1, vs);
  y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
  y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
  y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
  y4 = _AVX512_FMA(y4, h1, _AVX512_MUL(x4,h2));

  // Write back: column 0 gets q += y ...
  q1 = _AVX512_LOAD(q);
  q1 = _AVX512_ADD(q1, y1);
  _AVX512_STORE(q,q1);
  q2 = _AVX512_LOAD(&q[offset]);
  q2 = _AVX512_ADD(q2, y2);
  _AVX512_STORE(&q[offset],q2);
  q3 = _AVX512_LOAD(&q[2*offset]);
  q3 = _AVX512_ADD(q3, y3);
  _AVX512_STORE(&q[2*offset],q3);
  q4 = _AVX512_LOAD(&q[3*offset]);
  q4 = _AVX512_ADD(q4, y4);
  _AVX512_STORE(&q[3*offset],q4);

  // ... column 1 gets q += x + y*hh2[1] ...
  h2 = _AVX512_SET1(hh[ldh+1]);

  q1 = _AVX512_LOAD(&q[ldq]);
  q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
  _AVX512_STORE(&q[ldq],q1);
  q2 = _AVX512_LOAD(&q[ldq+offset]);
  q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
  _AVX512_STORE(&q[ldq+offset],q2);
  q3 = _AVX512_LOAD(&q[ldq+2*offset]);
  q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
  _AVX512_STORE(&q[ldq+2*offset],q3);
  q4 = _AVX512_LOAD(&q[ldq+3*offset]);
  q4 = _AVX512_ADD(q4, _AVX512_FMA(y4, h2, x4));
  _AVX512_STORE(&q[ldq+3*offset],q4);

  // ... interior columns get q += x*hh1[i-1] + y*hh2[i] ...
  for (i = 2; i < nb; i++)
  {
    h1 = _AVX512_SET1(hh[i-1]);
    h2 = _AVX512_SET1(hh[ldh+i]);

    q1 = _AVX512_LOAD(&q[i*ldq]);
    q1 = _AVX512_FMA(x1, h1, q1);
    q1 = _AVX512_FMA(y1, h2, q1);
    _AVX512_STORE(&q[i*ldq],q1);
    q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX512_FMA(x2, h1, q2);
    q2 = _AVX512_FMA(y2, h2, q2);
    _AVX512_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX512_FMA(x3, h1, q3);
    q3 = _AVX512_FMA(y3, h2, q3);
    _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
    q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
    q4 = _AVX512_FMA(x4, h1, q4);
    q4 = _AVX512_FMA(y4, h2, q4);
    _AVX512_STORE(&q[(i*ldq)+3*offset],q4);

  }

  // ... and the last column gets q += x*hh1[nb-1].
  h1 = _AVX512_SET1(hh[nb-1]);

  q1 = _AVX512_LOAD(&q[nb*ldq]);
  q1 = _AVX512_FMA(x1, h1, q1);
  _AVX512_STORE(&q[nb*ldq],q1);
  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX512_FMA(x2, h1, q2);
  _AVX512_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX512_FMA(x3, h1, q3);
  _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
  q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
  q4 = _AVX512_FMA(x4, h1, q4);
  _AVX512_STORE(&q[(nb*ldq)+3*offset],q4);

}
427 :
428 : /**
429 : * Unrolled kernel that computes
430 : #ifdef DOUBLE_PRECISION_REAL
431 : * 24 rows of Q simultaneously, a
432 : #endif
433 : #ifdef SINGLE_PRECISION_REAL
434 : * 48 rows of Q simultaneously, a
435 : #endif
436 :
437 : * matrix Vector product with two householder
438 : * vectors + a rank 2 update is performed
439 : */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
  /////////////////////////////////////////////////////
  // Matrix Vector Multiplication, Q [24 x nb+1] * hh
  // (48 x nb+1 rows in the single-precision variant)
  // hh contains two householder vectors, with offset 1
  /////////////////////////////////////////////////////
  // Same algorithm as the 32-row kernel, with three SIMD lanes instead
  // of four.
  int i;
  // Needed bit mask for floating point sign flip (xor with this negates tau)
#ifdef DOUBLE_PRECISION_REAL
  __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
  __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
  // x1..x3: dot products with the first reflector (implicit leading 1).
  __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
  __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
  __AVX512_DATATYPE x3 = _AVX512_LOAD(&q[ldq+2*offset]);

  __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
  __AVX512_DATATYPE h2;

  // y1..y3: dot products with the second reflector, seeded from column 0.
  __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
  __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
  __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
  __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
  __AVX512_DATATYPE q3 = _AVX512_LOAD(&q[2*offset]);
  __AVX512_DATATYPE y3 = _AVX512_FMA(x3, h1, q3);

  // Interior columns contribute to both accumulators.
  for(i = 2; i < nb; i++)
  {
    h1 = _AVX512_SET1(hh[i-1]);
    h2 = _AVX512_SET1(hh[ldh+i]);

    q1 = _AVX512_LOAD(&q[i*ldq]);
    x1 = _AVX512_FMA(q1, h1, x1);
    y1 = _AVX512_FMA(q1, h2, y1);
    q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX512_FMA(q2, h1, x2);
    y2 = _AVX512_FMA(q2, h2, y2);
    q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX512_FMA(q3, h1, x3);
    y3 = _AVX512_FMA(q3, h2, y3);
  }

  // Last column (index nb) only touches the first reflector.
  h1 = _AVX512_SET1(hh[nb-1]);

  q1 = _AVX512_LOAD(&q[nb*ldq]);
  x1 = _AVX512_FMA(q1, h1, x1);
  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX512_FMA(q2, h1, x2);
  q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX512_FMA(q3, h1, x3);

  /////////////////////////////////////////////////////
  // Rank-2 update of Q [24 x nb+1]
  /////////////////////////////////////////////////////

  __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
  __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
  __AVX512_DATATYPE vs = _AVX512_SET1(s);

  // check for xor_pd on skylake
  // x = -tau1 * x  (sign flipped via the IEEE sign bit)
#ifdef DOUBLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
  x1 = _AVX512_MUL(x1, h1);
  x2 = _AVX512_MUL(x2, h1);
  x3 = _AVX512_MUL(x3, h1);

  // y = -tau2 * y + (-tau2 * s) * x
#ifdef DOUBLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
  h2 = _AVX512_MUL(h1, vs);
  y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
  y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
  y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));

  // Write back: column 0 gets q += y ...
  q1 = _AVX512_LOAD(q);
  q1 = _AVX512_ADD(q1, y1);
  _AVX512_STORE(q,q1);
  q2 = _AVX512_LOAD(&q[offset]);
  q2 = _AVX512_ADD(q2, y2);
  _AVX512_STORE(&q[offset],q2);
  q3 = _AVX512_LOAD(&q[2*offset]);
  q3 = _AVX512_ADD(q3, y3);
  _AVX512_STORE(&q[2*offset],q3);

  // ... column 1 gets q += x + y*hh2[1] ...
  h2 = _AVX512_SET1(hh[ldh+1]);

  q1 = _AVX512_LOAD(&q[ldq]);
  q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
  _AVX512_STORE(&q[ldq],q1);
  q2 = _AVX512_LOAD(&q[ldq+offset]);
  q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
  _AVX512_STORE(&q[ldq+offset],q2);
  q3 = _AVX512_LOAD(&q[ldq+2*offset]);
  q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
  _AVX512_STORE(&q[ldq+2*offset],q3);

  // ... interior columns get q += x*hh1[i-1] + y*hh2[i] ...
  for (i = 2; i < nb; i++)
  {
    h1 = _AVX512_SET1(hh[i-1]);
    h2 = _AVX512_SET1(hh[ldh+i]);

    q1 = _AVX512_LOAD(&q[i*ldq]);
    q1 = _AVX512_FMA(x1, h1, q1);
    q1 = _AVX512_FMA(y1, h2, q1);
    _AVX512_STORE(&q[i*ldq],q1);
    q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX512_FMA(x2, h1, q2);
    q2 = _AVX512_FMA(y2, h2, q2);
    _AVX512_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX512_FMA(x3, h1, q3);
    q3 = _AVX512_FMA(y3, h2, q3);
    _AVX512_STORE(&q[(i*ldq)+2*offset],q3);

  }

  // ... and the last column gets q += x*hh1[nb-1].
  h1 = _AVX512_SET1(hh[nb-1]);

  q1 = _AVX512_LOAD(&q[nb*ldq]);
  q1 = _AVX512_FMA(x1, h1, q1);
  _AVX512_STORE(&q[nb*ldq],q1);
  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX512_FMA(x2, h1, q2);
  _AVX512_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX512_FMA(x3, h1, q3);
  _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);

}
583 :
584 : /**
585 : * Unrolled kernel that computes
586 : #ifdef DOUBLE_PRECISION_REAL
587 : * 16 rows of Q simultaneously, a
588 : #endif
589 : #ifdef SINGLE_PRECISION_REAL
590 : * 32 rows of Q simultaneously, a
591 : #endif
592 : * matrix Vector product with two householder
593 : * vectors + a rank 2 update is performed
594 : */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
  /////////////////////////////////////////////////////
  // Matrix Vector Multiplication, Q [16 x nb+1] * hh
  // (32 x nb+1 rows in the single-precision variant)
  // hh contains two householder vectors, with offset 1
  /////////////////////////////////////////////////////
  // Same algorithm as the 32-row kernel, with two SIMD lanes.
  int i;
  // Needed bit mask for floating point sign flip (xor with this negates tau)
#ifdef DOUBLE_PRECISION_REAL
  __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
  __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
  // x1..x2: dot products with the first reflector (implicit leading 1).
  __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
  __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);

  __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
  __AVX512_DATATYPE h2;

  // y1..y2: dot products with the second reflector, seeded from column 0.
  __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
  __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
  __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
  __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);

  // Interior columns contribute to both accumulators.
  for(i = 2; i < nb; i++)
  {
    h1 = _AVX512_SET1(hh[i-1]);
    h2 = _AVX512_SET1(hh[ldh+i]);

    q1 = _AVX512_LOAD(&q[i*ldq]);
    x1 = _AVX512_FMA(q1, h1, x1);
    y1 = _AVX512_FMA(q1, h2, y1);
    q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX512_FMA(q2, h1, x2);
    y2 = _AVX512_FMA(q2, h2, y2);
  }

  // Last column (index nb) only touches the first reflector.
  h1 = _AVX512_SET1(hh[nb-1]);

  q1 = _AVX512_LOAD(&q[nb*ldq]);
  x1 = _AVX512_FMA(q1, h1, x1);
  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX512_FMA(q2, h1, x2);

  /////////////////////////////////////////////////////
  // Rank-2 update of Q [16 x nb+1]
  /////////////////////////////////////////////////////

  __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
  __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
  __AVX512_DATATYPE vs = _AVX512_SET1(s);
  // check for xor_pd on skylake
  // x = -tau1 * x  (sign flipped via the IEEE sign bit)
#ifdef DOUBLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
  x1 = _AVX512_MUL(x1, h1);
  x2 = _AVX512_MUL(x2, h1);
  // y = -tau2 * y + (-tau2 * s) * x
#ifdef DOUBLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
  h2 = _AVX512_MUL(h1, vs);

  y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
  y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));

  // Write back: column 0 gets q += y ...
  q1 = _AVX512_LOAD(q);
  q1 = _AVX512_ADD(q1, y1);
  _AVX512_STORE(q,q1);
  q2 = _AVX512_LOAD(&q[offset]);
  q2 = _AVX512_ADD(q2, y2);
  _AVX512_STORE(&q[offset],q2);

  // ... column 1 gets q += x + y*hh2[1] ...
  h2 = _AVX512_SET1(hh[ldh+1]);

  q1 = _AVX512_LOAD(&q[ldq]);
  q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
  _AVX512_STORE(&q[ldq],q1);
  q2 = _AVX512_LOAD(&q[ldq+offset]);
  q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
  _AVX512_STORE(&q[ldq+offset],q2);

  // ... interior columns get q += x*hh1[i-1] + y*hh2[i] ...
  for (i = 2; i < nb; i++)
  {
    h1 = _AVX512_SET1(hh[i-1]);
    h2 = _AVX512_SET1(hh[ldh+i]);

    q1 = _AVX512_LOAD(&q[i*ldq]);
    q1 = _AVX512_FMA(x1, h1, q1);
    q1 = _AVX512_FMA(y1, h2, q1);
    _AVX512_STORE(&q[i*ldq],q1);
    q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX512_FMA(x2, h1, q2);
    q2 = _AVX512_FMA(y2, h2, q2);
    _AVX512_STORE(&q[(i*ldq)+offset],q2);
  }

  // ... and the last column gets q += x*hh1[nb-1].
  h1 = _AVX512_SET1(hh[nb-1]);

  q1 = _AVX512_LOAD(&q[nb*ldq]);
  q1 = _AVX512_FMA(x1, h1, q1);
  _AVX512_STORE(&q[nb*ldq],q1);
  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX512_FMA(x2, h1, q2);
  _AVX512_STORE(&q[(nb*ldq)+offset],q2);

}
713 :
714 : /**
715 : * Unrolled kernel that computes
716 : #ifdef DOUBLE_PRECISION_REAL
717 : * 8 rows of Q simultaneously, a
718 : #endif
719 : #ifdef SINGLE_PRECISION_REAL
720 : * 16 rows of Q simultaneously, a
721 : #endif
722 : * matrix Vector product with two householder
723 : * vectors + a rank 2 update is performed
724 : */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
  /////////////////////////////////////////////////////
  // Matrix Vector Multiplication, Q [8 x nb+1] * hh
  // (16 x nb+1 rows in the single-precision variant)
  // hh contains two householder vectors, with offset 1
  /////////////////////////////////////////////////////
  // Same algorithm as the 32-row kernel, with a single SIMD lane.
  int i;
  // Needed bit mask for floating point sign flip (xor with this negates tau)
#ifdef DOUBLE_PRECISION_REAL
  __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
  __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
  // x1: dot product with the first reflector (implicit leading 1).
  __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);

  __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
  __AVX512_DATATYPE h2;

  // y1: dot product with the second reflector, seeded from column 0.
  __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
  __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);

  // Interior columns contribute to both accumulators.
  for(i = 2; i < nb; i++)
  {
    h1 = _AVX512_SET1(hh[i-1]);
    h2 = _AVX512_SET1(hh[ldh+i]);

    q1 = _AVX512_LOAD(&q[i*ldq]);
    x1 = _AVX512_FMA(q1, h1, x1);
    y1 = _AVX512_FMA(q1, h2, y1);
  }

  // Last column (index nb) only touches the first reflector.
  h1 = _AVX512_SET1(hh[nb-1]);

  q1 = _AVX512_LOAD(&q[nb*ldq]);
  x1 = _AVX512_FMA(q1, h1, x1);

  /////////////////////////////////////////////////////
  // Rank-2 update of Q [8 x nb+1]
  /////////////////////////////////////////////////////

  __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
  __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
  __AVX512_DATATYPE vs = _AVX512_SET1(s);

  // x = -tau1 * x  (sign flipped via the IEEE sign bit)
#ifdef DOUBLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif

  x1 = _AVX512_MUL(x1, h1);

  // y = -tau2 * y + (-tau2 * s) * x
#ifdef DOUBLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif

  h2 = _AVX512_MUL(h1, vs);

  y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));

  // Write back: column 0 gets q += y ...
  q1 = _AVX512_LOAD(q);
  q1 = _AVX512_ADD(q1, y1);
  _AVX512_STORE(q,q1);

  // ... column 1 gets q += x + y*hh2[1] ...
  h2 = _AVX512_SET1(hh[ldh+1]);

  q1 = _AVX512_LOAD(&q[ldq]);
  q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
  _AVX512_STORE(&q[ldq],q1);

  // ... interior columns get q += x*hh1[i-1] + y*hh2[i] ...
  for (i = 2; i < nb; i++)
  {
    h1 = _AVX512_SET1(hh[i-1]);
    h2 = _AVX512_SET1(hh[ldh+i]);

    q1 = _AVX512_LOAD(&q[i*ldq]);
    q1 = _AVX512_FMA(x1, h1, q1);
    q1 = _AVX512_FMA(y1, h2, q1);
    _AVX512_STORE(&q[i*ldq],q1);
  }

  // ... and the last column gets q += x*hh1[nb-1].
  h1 = _AVX512_SET1(hh[nb-1]);

  q1 = _AVX512_LOAD(&q[nb*ldq]);
  q1 = _AVX512_FMA(x1, h1, q1);
  _AVX512_STORE(&q[nb*ldq],q1);

}
823 :
|