Line data Source code
1 : // This file is part of ELPA.
2 : //
3 : // The ELPA library was originally created by the ELPA consortium,
4 : // consisting of the following organizations:
5 : //
6 : // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : // Informatik,
10 : // - Technische Universität München, Lehrstuhl für Informatik mit
11 : // Schwerpunkt Wissenschaftliches Rechnen ,
12 : // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : // - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14 : // Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
15 : // and
16 : // - IBM Deutschland GmbH
17 : //
18 : // This particular source code file contains additions, changes and
19 : // enhancements authored by Intel Corporation which is not part of
20 : // the ELPA consortium.
21 : //
22 : // More information can be found here:
23 : // http://elpa.mpcdf.mpg.de/
24 : //
25 : // ELPA is free software: you can redistribute it and/or modify
26 : // it under the terms of the version 3 of the license of the
27 : // GNU Lesser General Public License as published by the Free
28 : // Software Foundation.
29 : //
30 : // ELPA is distributed in the hope that it will be useful,
31 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : // GNU Lesser General Public License for more details.
34 : //
35 : // You should have received a copy of the GNU Lesser General Public License
36 : // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : //
38 : // ELPA reflects a substantial effort on the part of the original
39 : // ELPA consortium, and we ask you to respect the spirit of the
40 : // license that we chose: i.e., please contribute any changes you
41 : // may have back to the original ELPA library distribution, and keep
42 : // any derivatives of ELPA under the same license that we chose for
43 : // the original distribution, the GNU Lesser General Public License.
44 : //
45 : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
46 : // --------------------------------------------------------------------------------------------------
47 : #include "config-f90.h"
48 : #include <x86intrin.h>
49 : #include <stdio.h>
50 : #include <stdlib.h>
51 :
52 : #define __forceinline __attribute__((always_inline)) static
53 :
54 : #ifdef DOUBLE_PRECISION_REAL
55 : #define offset 8
56 : #define __AVX512_DATATYPE __m512d
57 : #define _AVX512_LOAD _mm512_load_pd
58 : #define _AVX512_STORE _mm512_store_pd
59 : #define _AVX512_SET1 _mm512_set1_pd
60 : #define _AVX512_MUL _mm512_mul_pd
61 : #define _AVX512_SUB _mm512_sub_pd
62 :
63 : #ifdef HAVE_AVX512
64 : #define __ELPA_USE_FMA__
65 : #define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
66 : #define _mm512_NFMA_pd(a,b,c) _mm512_fnmadd_pd(a,b,c)
67 : #define _mm512_FMSUB_pd(a,b,c) _mm512_fmsub_pd(a,b,c)
68 : #endif
69 :
70 : #define _AVX512_FMA _mm512_FMA_pd
71 : #define _AVX512_NFMA _mm512_NFMA_pd
72 : #define _AVX512_FMSUB _mm512_FMSUB_pd
73 : #endif /* DOUBLE_PRECISION_REAL */
74 :
75 : #ifdef SINGLE_PRECISION_REAL
76 : #define offset 16
77 : #define __AVX512_DATATYPE __m512
78 : #define _AVX512_LOAD _mm512_load_ps
79 : #define _AVX512_STORE _mm512_store_ps
80 : #define _AVX512_SET1 _mm512_set1_ps
81 : #define _AVX512_MUL _mm512_mul_ps
82 : #define _AVX512_SUB _mm512_sub_ps
83 :
84 : #ifdef HAVE_AVX512
85 : #define __ELPA_USE_FMA__
86 : #define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
87 : #define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)
88 : #define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c)
89 : #endif
90 :
91 : #define _AVX512_FMA _mm512_FMA_ps
92 : #define _AVX512_NFMA _mm512_NFMA_ps
93 : #define _AVX512_FMSUB _mm512_FMSUB_ps
94 : #endif /* SINGLE_PRECISION_REAL */
95 :
96 : #ifdef DOUBLE_PRECISION_REAL
97 : //Forward declaration
98 : __forceinline void hh_trafo_kernel_8_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
99 : __forceinline void hh_trafo_kernel_16_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
100 : __forceinline void hh_trafo_kernel_24_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
101 : __forceinline void hh_trafo_kernel_32_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
102 :
103 :
104 : void quad_hh_trafo_real_avx512_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
105 : #endif
106 : #ifdef SINGLE_PRECISION_REAL
107 : //Forward declaration
108 : __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
109 : __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
110 : __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
111 : __forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
112 :
113 : void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
114 : #endif
115 :
116 :
117 : #ifdef DOUBLE_PRECISION_REAL
118 : /*
119 : !f>#if defined(HAVE_AVX512)
120 : !f> interface
121 : !f> subroutine quad_hh_trafo_real_avx512_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
122 : !f> bind(C, name="quad_hh_trafo_real_avx512_4hv_double")
123 : !f> use, intrinsic :: iso_c_binding
124 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
125 : !f> type(c_ptr), value :: q
126 : !f> real(kind=c_double) :: hh(pnb,6)
127 : !f> end subroutine
128 : !f> end interface
129 : !f>#endif
130 : */
131 : #endif
132 : /*
133 : !f>#if defined(HAVE_AVX512)
134 : !f> interface
135 : !f> subroutine quad_hh_trafo_real_avx512_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
136 : !f> bind(C, name="quad_hh_trafo_real_avx512_4hv_single")
137 : !f> use, intrinsic :: iso_c_binding
138 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
139 : !f> type(c_ptr), value :: q
140 : !f> real(kind=c_float) :: hh(pnb,6)
141 : !f> end subroutine
142 : !f> end interface
143 : !f>#endif
144 : */
145 : #ifdef SINGLE_PRECISION_REAL
146 :
147 : #endif
148 :
149 : #ifdef DOUBLE_PRECISION_REAL
150 134144 : void quad_hh_trafo_real_avx512_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
151 : #endif
152 : #ifdef SINGLE_PRECISION_REAL
153 25152 : void quad_hh_trafo_real_avx512_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
154 : #endif
155 : {
156 : int i;
157 159296 : int nb = *pnb;
158 159296 : int nq = *pldq;
159 159296 : int ldq = *pldq;
160 159296 : int ldh = *pldh;
161 : int worked_on;
162 :
163 159296 : worked_on = 0;
164 :
165 : // calculating scalar products to compute
166 : // 4 householder vectors simultaneously
167 : #ifdef DOUBLE_PRECISION_REAL
168 134144 : double s_1_2 = hh[(ldh)+1];
169 134144 : double s_1_3 = hh[(ldh*2)+2];
170 134144 : double s_2_3 = hh[(ldh*2)+1];
171 134144 : double s_1_4 = hh[(ldh*3)+3];
172 134144 : double s_2_4 = hh[(ldh*3)+2];
173 134144 : double s_3_4 = hh[(ldh*3)+1];
174 : #endif
175 : #ifdef SINGLE_PRECISION_REAL
176 25152 : float s_1_2 = hh[(ldh)+1];
177 25152 : float s_1_3 = hh[(ldh*2)+2];
178 25152 : float s_2_3 = hh[(ldh*2)+1];
179 25152 : float s_1_4 = hh[(ldh*3)+3];
180 25152 : float s_2_4 = hh[(ldh*3)+2];
181 25152 : float s_3_4 = hh[(ldh*3)+1];
182 : #endif
183 :
184 :
185 : // calculate scalar product of first and fourth householder Vector
186 : // loop counter = 2
187 159296 : s_1_2 += hh[2-1] * hh[(2+ldh)];
188 159296 : s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)];
189 159296 : s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)];
190 :
191 : // loop counter = 3
192 159296 : s_1_2 += hh[3-1] * hh[(3+ldh)];
193 159296 : s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)];
194 159296 : s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)];
195 :
196 159296 : s_1_3 += hh[3-2] * hh[3+(ldh*2)];
197 159296 : s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)];
198 :
199 : #pragma ivdep
200 9717056 : for (i = 4; i < nb; i++)
201 : {
202 9557760 : s_1_2 += hh[i-1] * hh[(i+ldh)];
203 9557760 : s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)];
204 9557760 : s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
205 :
206 9557760 : s_1_3 += hh[i-2] * hh[i+(ldh*2)];
207 9557760 : s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
208 :
209 9557760 : s_1_4 += hh[i-3] * hh[i+(ldh*3)];
210 : }
211 :
212 : // Production level kernel calls with padding
213 : #ifdef DOUBLE_PRECISION_REAL
214 268288 : for (i = 0; i < nq-24; i+=32)
215 : {
216 134144 : hh_trafo_kernel_32_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
217 134144 : worked_on += 32;
218 : }
219 : #endif
220 : #ifdef SINGLE_PRECISION_REAL
221 50304 : for (i = 0; i < nq-48; i+=64)
222 : {
223 25152 : hh_trafo_kernel_64_AVX512_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
224 25152 : worked_on += 64;
225 : }
226 : #endif
227 159296 : if (nq == i)
228 : {
229 0 : return;
230 : }
231 : #ifdef DOUBLE_PRECISION_REAL
232 134144 : if (nq-i == 24)
233 : {
234 0 : hh_trafo_kernel_24_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
235 0 : worked_on += 24;
236 : }
237 : #endif
238 :
239 : #ifdef SINGLE_PRECISION_REAL
240 25152 : if (nq-i == 48)
241 : {
242 0 : hh_trafo_kernel_48_AVX512_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
243 0 : worked_on += 48;
244 : }
245 : #endif
246 :
247 : #ifdef DOUBLE_PRECISION_REAL
248 134144 : if (nq-i == 16)
249 : {
250 0 : hh_trafo_kernel_16_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
251 0 : worked_on += 16;
252 : }
253 : #endif
254 :
255 : #ifdef SINGLE_PRECISION_REAL
256 25152 : if (nq-i == 32)
257 : {
258 0 : hh_trafo_kernel_32_AVX512_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
259 0 : worked_on += 32;
260 : }
261 : #endif
262 :
263 : #ifdef DOUBLE_PRECISION_REAL
264 134144 : if (nq-i == 8)
265 : {
266 134144 : hh_trafo_kernel_8_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
267 134144 : worked_on += 8;
268 : }
269 : #endif
270 :
271 : #ifdef SINGLE_PRECISION_REAL
272 25152 : if (nq-i == 16)
273 : {
274 25152 : hh_trafo_kernel_16_AVX512_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
275 25152 : worked_on += 16;
276 : }
277 : #endif
278 :
279 : #ifdef WITH_DEBUG
280 : if (worked_on != nq)
281 : {
282 : printf("Error in AVX512 real BLOCK 2 kernel \n");
283 : abort();
284 : }
285 : #endif
286 : }
287 :
288 : /**
289 : * Unrolled kernel that computes
290 :
291 : #ifdef DOUBLE_PRECISION_REAL
292 : * 32 rows of Q simultaneously, a
293 : #endif
294 : #ifdef SINGLE_PRECISION_REAL
295 : * 64 rows of Q simultaneously, a
296 : #endif
297 : * matrix Vector product with two householder
298 : * vectors + a rank 1 update is performed
299 : */
300 :
301 : #ifdef DOUBLE_PRECISION_REAL
302 : __forceinline void hh_trafo_kernel_32_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
303 : #endif
304 : #ifdef SINGLE_PRECISION_REAL
305 : __forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
306 : #endif
307 :
308 : {
309 : /////////////////////////////////////////////////////
310 : // Matrix Vector Multiplication, Q [4 x nb+3] * hh
311 : // hh contains four householder vectors
312 : /////////////////////////////////////////////////////
313 : int i;
314 :
315 318592 : __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*3]);
316 318592 : __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*2]);
317 318592 : __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq]);
318 159296 : __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[0]);
319 :
320 318592 : __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
321 318592 : __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
322 318592 : __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[ldq+offset]);
323 318592 : __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[0+offset]);
324 :
325 318592 : __AVX512_DATATYPE a1_3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
326 318592 : __AVX512_DATATYPE a2_3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
327 318592 : __AVX512_DATATYPE a3_3 = _AVX512_LOAD(&q[ldq+2*offset]);
328 318592 : __AVX512_DATATYPE a4_3 = _AVX512_LOAD(&q[0+2*offset]);
329 :
330 318592 : __AVX512_DATATYPE a1_4 = _AVX512_LOAD(&q[(ldq*3)+3*offset]);
331 318592 : __AVX512_DATATYPE a2_4 = _AVX512_LOAD(&q[(ldq*2)+3*offset]);
332 318592 : __AVX512_DATATYPE a3_4 = _AVX512_LOAD(&q[ldq+3*offset]);
333 318592 : __AVX512_DATATYPE a4_4 = _AVX512_LOAD(&q[0+3*offset]);
334 :
335 :
336 318592 : __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
337 318592 : __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
338 318592 : __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
339 318592 : __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
340 318592 : __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
341 318592 : __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
342 :
343 159296 : __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
344 159296 : w1 = _AVX512_FMA(a2_1, h_4_2, w1);
345 159296 : w1 = _AVX512_FMA(a1_1, h_4_1, w1);
346 159296 : __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
347 159296 : z1 = _AVX512_FMA(a1_1, h_3_1, z1);
348 159296 : __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
349 159296 : __AVX512_DATATYPE x1 = a1_1;
350 :
351 159296 : __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
352 159296 : w2 = _AVX512_FMA(a2_2, h_4_2, w2);
353 159296 : w2 = _AVX512_FMA(a1_2, h_4_1, w2);
354 159296 : __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
355 159296 : z2 = _AVX512_FMA(a1_2, h_3_1, z2);
356 159296 : __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
357 159296 : __AVX512_DATATYPE x2 = a1_2;
358 :
359 159296 : __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
360 159296 : w3 = _AVX512_FMA(a2_3, h_4_2, w3);
361 159296 : w3 = _AVX512_FMA(a1_3, h_4_1, w3);
362 159296 : __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
363 159296 : z3 = _AVX512_FMA(a1_3, h_3_1, z3);
364 159296 : __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
365 159296 : __AVX512_DATATYPE x3 = a1_3;
366 :
367 159296 : __AVX512_DATATYPE w4 = _AVX512_FMA(a3_4, h_4_3, a4_4);
368 159296 : w4 = _AVX512_FMA(a2_4, h_4_2, w4);
369 159296 : w4 = _AVX512_FMA(a1_4, h_4_1, w4);
370 159296 : __AVX512_DATATYPE z4 = _AVX512_FMA(a2_4, h_3_2, a3_4);
371 159296 : z4 = _AVX512_FMA(a1_4, h_3_1, z4);
372 159296 : __AVX512_DATATYPE y4 = _AVX512_FMA(a1_4, h_2_1, a2_4);
373 159296 : __AVX512_DATATYPE x4 = a1_4;
374 :
375 :
376 : __AVX512_DATATYPE q1;
377 : __AVX512_DATATYPE q2;
378 : __AVX512_DATATYPE q3;
379 : __AVX512_DATATYPE q4;
380 :
381 : __AVX512_DATATYPE h1;
382 : __AVX512_DATATYPE h2;
383 : __AVX512_DATATYPE h3;
384 : __AVX512_DATATYPE h4;
385 :
386 9717056 : for(i = 4; i < nb; i++)
387 : {
388 19115520 : h1 = _AVX512_SET1(hh[i-3]);
389 19115520 : h2 = _AVX512_SET1(hh[ldh+i-2]);
390 19115520 : h3 = _AVX512_SET1(hh[(ldh*2)+i-1]);
391 19115520 : h4 = _AVX512_SET1(hh[(ldh*3)+i]);
392 :
393 19115520 : q1 = _AVX512_LOAD(&q[i*ldq]);
394 19115520 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
395 19115520 : q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
396 19115520 : q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
397 :
398 9557760 : x1 = _AVX512_FMA(q1, h1, x1);
399 9557760 : y1 = _AVX512_FMA(q1, h2, y1);
400 9557760 : z1 = _AVX512_FMA(q1, h3, z1);
401 9557760 : w1 = _AVX512_FMA(q1, h4, w1);
402 :
403 9557760 : x2 = _AVX512_FMA(q2, h1, x2);
404 9557760 : y2 = _AVX512_FMA(q2, h2, y2);
405 9557760 : z2 = _AVX512_FMA(q2, h3, z2);
406 9557760 : w2 = _AVX512_FMA(q2, h4, w2);
407 :
408 9557760 : x3 = _AVX512_FMA(q3, h1, x3);
409 9557760 : y3 = _AVX512_FMA(q3, h2, y3);
410 9557760 : z3 = _AVX512_FMA(q3, h3, z3);
411 9557760 : w3 = _AVX512_FMA(q3, h4, w3);
412 :
413 9557760 : x4 = _AVX512_FMA(q4, h1, x4);
414 9557760 : y4 = _AVX512_FMA(q4, h2, y4);
415 9557760 : z4 = _AVX512_FMA(q4, h3, z4);
416 9557760 : w4 = _AVX512_FMA(q4, h4, w4);
417 :
418 : }
419 :
420 318592 : h1 = _AVX512_SET1(hh[nb-3]);
421 318592 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
422 318592 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
423 :
424 318592 : q1 = _AVX512_LOAD(&q[nb*ldq]);
425 318592 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
426 318592 : q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
427 318592 : q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
428 :
429 159296 : x1 = _AVX512_FMA(q1, h1, x1);
430 159296 : y1 = _AVX512_FMA(q1, h2, y1);
431 159296 : z1 = _AVX512_FMA(q1, h3, z1);
432 :
433 159296 : x2 = _AVX512_FMA(q2, h1, x2);
434 159296 : y2 = _AVX512_FMA(q2, h2, y2);
435 159296 : z2 = _AVX512_FMA(q2, h3, z2);
436 :
437 159296 : x3 = _AVX512_FMA(q3, h1, x3);
438 159296 : y3 = _AVX512_FMA(q3, h2, y3);
439 159296 : z3 = _AVX512_FMA(q3, h3, z3);
440 :
441 159296 : x4 = _AVX512_FMA(q4, h1, x4);
442 159296 : y4 = _AVX512_FMA(q4, h2, y4);
443 159296 : z4 = _AVX512_FMA(q4, h3, z4);
444 :
445 :
446 318592 : h1 = _AVX512_SET1(hh[nb-2]);
447 318592 : h2 = _AVX512_SET1(hh[(ldh*1)+nb-1]);
448 :
449 318592 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
450 318592 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
451 318592 : q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
452 318592 : q4 = _AVX512_LOAD(&q[((nb+1)*ldq)+3*offset]);
453 :
454 159296 : x1 = _AVX512_FMA(q1, h1, x1);
455 159296 : y1 = _AVX512_FMA(q1, h2, y1);
456 159296 : x2 = _AVX512_FMA(q2, h1, x2);
457 159296 : y2 = _AVX512_FMA(q2, h2, y2);
458 159296 : x3 = _AVX512_FMA(q3, h1, x3);
459 159296 : y3 = _AVX512_FMA(q3, h2, y3);
460 159296 : x4 = _AVX512_FMA(q4, h1, x4);
461 159296 : y4 = _AVX512_FMA(q4, h2, y4);
462 :
463 318592 : h1 = _AVX512_SET1(hh[nb-1]);
464 :
465 318592 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
466 318592 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
467 318592 : q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
468 318592 : q4 = _AVX512_LOAD(&q[((nb+2)*ldq)+3*offset]);
469 :
470 159296 : x1 = _AVX512_FMA(q1, h1, x1);
471 159296 : x2 = _AVX512_FMA(q2, h1, x2);
472 159296 : x3 = _AVX512_FMA(q3, h1, x3);
473 159296 : x4 = _AVX512_FMA(q4, h1, x4);
474 :
475 : /////////////////////////////////////////////////////
476 : // Rank-1 update of Q [8 x nb+3]
477 : /////////////////////////////////////////////////////
478 :
479 318592 : __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
480 318592 : __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
481 318592 : __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
482 318592 : __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
483 :
484 159296 : __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(s_1_2);
485 159296 : __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(s_1_3);
486 159296 : __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(s_2_3);
487 159296 : __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(s_1_4);
488 159296 : __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(s_2_4);
489 159296 : __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(s_3_4);
490 :
491 159296 : h1 = tau1;
492 159296 : x1 = _AVX512_MUL(x1, h1);
493 159296 : x2 = _AVX512_MUL(x2, h1);
494 159296 : x3 = _AVX512_MUL(x3, h1);
495 159296 : x4 = _AVX512_MUL(x4, h1);
496 :
497 159296 : h1 = tau2;
498 159296 : h2 = _AVX512_MUL(h1, vs_1_2);
499 :
500 318592 : y1 = _AVX512_FMSUB(y1, h1, _AVX512_MUL(x1,h2));
501 318592 : y2 = _AVX512_FMSUB(y2, h1, _AVX512_MUL(x2,h2));
502 318592 : y3 = _AVX512_FMSUB(y3, h1, _AVX512_MUL(x3,h2));
503 318592 : y4 = _AVX512_FMSUB(y4, h1, _AVX512_MUL(x4,h2));
504 :
505 159296 : h1 = tau3;
506 159296 : h2 = _AVX512_MUL(h1, vs_1_3);
507 159296 : h3 = _AVX512_MUL(h1, vs_2_3);
508 :
509 477888 : z1 = _AVX512_FMSUB(z1, h1, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
510 477888 : z2 = _AVX512_FMSUB(z2, h1, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
511 477888 : z3 = _AVX512_FMSUB(z3, h1, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)));
512 477888 : z4 = _AVX512_FMSUB(z4, h1, _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2)));
513 :
514 159296 : h1 = tau4;
515 159296 : h2 = _AVX512_MUL(h1, vs_1_4);
516 159296 : h3 = _AVX512_MUL(h1, vs_2_4);
517 159296 : h4 = _AVX512_MUL(h1, vs_3_4);
518 :
519 637184 : w1 = _AVX512_FMSUB(w1, h1, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
520 637184 : w2 = _AVX512_FMSUB(w2, h1, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
521 637184 : w3 = _AVX512_FMSUB(w3, h1, _AVX512_FMA(z3, h4, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
522 637184 : w4 = _AVX512_FMSUB(w4, h1, _AVX512_FMA(z4, h4, _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2))));
523 :
524 159296 : q1 = _AVX512_LOAD(&q[0]);
525 159296 : q1 = _AVX512_SUB(q1, w1);
526 : _AVX512_STORE(&q[0],q1);
527 :
528 318592 : q2 = _AVX512_LOAD(&q[0+offset]);
529 159296 : q2 = _AVX512_SUB(q2, w2);
530 159296 : _AVX512_STORE(&q[0+offset],q2);
531 :
532 318592 : q3 = _AVX512_LOAD(&q[0+2*offset]);
533 159296 : q3 = _AVX512_SUB(q3, w3);
534 159296 : _AVX512_STORE(&q[0+2*offset],q3);
535 :
536 318592 : q4 = _AVX512_LOAD(&q[0+3*offset]);
537 159296 : q4 = _AVX512_SUB(q4, w4);
538 159296 : _AVX512_STORE(&q[0+3*offset],q4);
539 :
540 :
541 318592 : h4 = _AVX512_SET1(hh[(ldh*3)+1]);
542 318592 : q1 = _AVX512_LOAD(&q[ldq]);
543 318592 : q2 = _AVX512_LOAD(&q[ldq+offset]);
544 318592 : q3 = _AVX512_LOAD(&q[ldq+2*offset]);
545 318592 : q4 = _AVX512_LOAD(&q[ldq+3*offset]);
546 :
547 318592 : q1 = _AVX512_SUB(q1, _AVX512_FMA(w1, h4, z1));
548 159296 : _AVX512_STORE(&q[ldq],q1);
549 318592 : q2 = _AVX512_SUB(q2, _AVX512_FMA(w2, h4, z2));
550 159296 : _AVX512_STORE(&q[ldq+offset],q2);
551 318592 : q3 = _AVX512_SUB(q3, _AVX512_FMA(w3, h4, z3));
552 159296 : _AVX512_STORE(&q[ldq+2*offset],q3);
553 318592 : q4 = _AVX512_SUB(q4, _AVX512_FMA(w4, h4, z4));
554 159296 : _AVX512_STORE(&q[ldq+3*offset],q4);
555 :
556 :
557 318592 : h3 = _AVX512_SET1(hh[(ldh*2)+1]);
558 318592 : h4 = _AVX512_SET1(hh[(ldh*3)+2]);
559 318592 : q1 = _AVX512_LOAD(&q[ldq*2]);
560 318592 : q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
561 318592 : q3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
562 318592 : q4 = _AVX512_LOAD(&q[(ldq*2)+3*offset]);
563 :
564 159296 : q1 = _AVX512_SUB(q1, y1);
565 159296 : q1 = _AVX512_NFMA(z1, h3, q1);
566 159296 : q1 = _AVX512_NFMA(w1, h4, q1);
567 159296 : _AVX512_STORE(&q[ldq*2],q1);
568 :
569 159296 : q2 = _AVX512_SUB(q2, y2);
570 159296 : q2 = _AVX512_NFMA(z2, h3, q2);
571 159296 : q2 = _AVX512_NFMA(w2, h4, q2);
572 159296 : _AVX512_STORE(&q[(ldq*2)+offset],q2);
573 :
574 159296 : q3 = _AVX512_SUB(q3, y3);
575 159296 : q3 = _AVX512_NFMA(z3, h3, q3);
576 159296 : q3 = _AVX512_NFMA(w3, h4, q3);
577 159296 : _AVX512_STORE(&q[(ldq*2)+2*offset],q3);
578 :
579 159296 : q4 = _AVX512_SUB(q4, y4);
580 159296 : q4 = _AVX512_NFMA(z4, h3, q4);
581 159296 : q4 = _AVX512_NFMA(w4, h4, q4);
582 159296 : _AVX512_STORE(&q[(ldq*2)+3*offset],q4);
583 :
584 :
585 318592 : h2 = _AVX512_SET1(hh[ldh+1]);
586 318592 : h3 = _AVX512_SET1(hh[(ldh*2)+2]);
587 318592 : h4 = _AVX512_SET1(hh[(ldh*3)+3]);
588 :
589 318592 : q1 = _AVX512_LOAD(&q[ldq*3]);
590 :
591 159296 : q1 = _AVX512_SUB(q1, x1);
592 159296 : q1 = _AVX512_NFMA(y1, h2, q1);
593 159296 : q1 = _AVX512_NFMA(z1, h3, q1);
594 159296 : q1 = _AVX512_NFMA(w1, h4, q1);
595 159296 : _AVX512_STORE(&q[ldq*3], q1);
596 :
597 318592 : q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
598 :
599 159296 : q2 = _AVX512_SUB(q2, x2);
600 159296 : q2 = _AVX512_NFMA(y2, h2, q2);
601 159296 : q2 = _AVX512_NFMA(z2, h3, q2);
602 159296 : q2 = _AVX512_NFMA(w2, h4, q2);
603 159296 : _AVX512_STORE(&q[(ldq*3)+offset], q2);
604 :
605 318592 : q3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
606 :
607 159296 : q3 = _AVX512_SUB(q3, x3);
608 159296 : q3 = _AVX512_NFMA(y3, h2, q3);
609 159296 : q3 = _AVX512_NFMA(z3, h3, q3);
610 159296 : q3 = _AVX512_NFMA(w3, h4, q3);
611 159296 : _AVX512_STORE(&q[(ldq*3)+2*offset], q3);
612 :
613 318592 : q4 = _AVX512_LOAD(&q[(ldq*3)+3*offset]);
614 :
615 159296 : q4 = _AVX512_SUB(q4, x4);
616 159296 : q4 = _AVX512_NFMA(y4, h2, q4);
617 159296 : q4 = _AVX512_NFMA(z4, h3, q4);
618 159296 : q4 = _AVX512_NFMA(w4, h4, q4);
619 159296 : _AVX512_STORE(&q[(ldq*3)+3*offset], q4);
620 :
621 9717056 : for (i = 4; i < nb; i++)
622 : {
623 19115520 : h1 = _AVX512_SET1(hh[i-3]);
624 19115520 : h2 = _AVX512_SET1(hh[ldh+i-2]);
625 19115520 : h3 = _AVX512_SET1(hh[(ldh*2)+i-1]);
626 19115520 : h4 = _AVX512_SET1(hh[(ldh*3)+i]);
627 :
628 19115520 : q1 = _AVX512_LOAD(&q[i*ldq]);
629 9557760 : q1 = _AVX512_NFMA(x1, h1, q1);
630 9557760 : q1 = _AVX512_NFMA(y1, h2, q1);
631 9557760 : q1 = _AVX512_NFMA(z1, h3, q1);
632 9557760 : q1 = _AVX512_NFMA(w1, h4, q1);
633 9557760 : _AVX512_STORE(&q[i*ldq],q1);
634 :
635 19115520 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
636 9557760 : q2 = _AVX512_NFMA(x2, h1, q2);
637 9557760 : q2 = _AVX512_NFMA(y2, h2, q2);
638 9557760 : q2 = _AVX512_NFMA(z2, h3, q2);
639 9557760 : q2 = _AVX512_NFMA(w2, h4, q2);
640 9557760 : _AVX512_STORE(&q[(i*ldq)+offset],q2);
641 :
642 19115520 : q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
643 9557760 : q3 = _AVX512_NFMA(x3, h1, q3);
644 9557760 : q3 = _AVX512_NFMA(y3, h2, q3);
645 9557760 : q3 = _AVX512_NFMA(z3, h3, q3);
646 9557760 : q3 = _AVX512_NFMA(w3, h4, q3);
647 9557760 : _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
648 :
649 19115520 : q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
650 9557760 : q4 = _AVX512_NFMA(x4, h1, q4);
651 9557760 : q4 = _AVX512_NFMA(y4, h2, q4);
652 9557760 : q4 = _AVX512_NFMA(z4, h3, q4);
653 9557760 : q4 = _AVX512_NFMA(w4, h4, q4);
654 9557760 : _AVX512_STORE(&q[(i*ldq)+3*offset],q4);
655 :
656 : }
657 :
658 318592 : h1 = _AVX512_SET1(hh[nb-3]);
659 318592 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
660 318592 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
661 318592 : q1 = _AVX512_LOAD(&q[nb*ldq]);
662 :
663 159296 : q1 = _AVX512_NFMA(x1, h1, q1);
664 159296 : q1 = _AVX512_NFMA(y1, h2, q1);
665 159296 : q1 = _AVX512_NFMA(z1, h3, q1);
666 159296 : _AVX512_STORE(&q[nb*ldq],q1);
667 :
668 318592 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
669 :
670 159296 : q2 = _AVX512_NFMA(x2, h1, q2);
671 159296 : q2 = _AVX512_NFMA(y2, h2, q2);
672 159296 : q2 = _AVX512_NFMA(z2, h3, q2);
673 159296 : _AVX512_STORE(&q[(nb*ldq)+offset],q2);
674 :
675 318592 : q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
676 :
677 159296 : q3 = _AVX512_NFMA(x3, h1, q3);
678 159296 : q3 = _AVX512_NFMA(y3, h2, q3);
679 159296 : q3 = _AVX512_NFMA(z3, h3, q3);
680 159296 : _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
681 :
682 318592 : q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
683 :
684 159296 : q4 = _AVX512_NFMA(x4, h1, q4);
685 159296 : q4 = _AVX512_NFMA(y4, h2, q4);
686 159296 : q4 = _AVX512_NFMA(z4, h3, q4);
687 159296 : _AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
688 :
689 318592 : h1 = _AVX512_SET1(hh[nb-2]);
690 318592 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
691 318592 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
692 :
693 159296 : q1 = _AVX512_NFMA(x1, h1, q1);
694 159296 : q1 = _AVX512_NFMA(y1, h2, q1);
695 159296 : _AVX512_STORE(&q[(nb+1)*ldq],q1);
696 :
697 318592 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
698 :
699 159296 : q2 = _AVX512_NFMA(x2, h1, q2);
700 159296 : q2 = _AVX512_NFMA(y2, h2, q2);
701 159296 : _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
702 :
703 318592 : q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
704 :
705 159296 : q3 = _AVX512_NFMA(x3, h1, q3);
706 159296 : q3 = _AVX512_NFMA(y3, h2, q3);
707 159296 : _AVX512_STORE(&q[((nb+1)*ldq)+2*offset],q3);
708 :
709 318592 : q4 = _AVX512_LOAD(&q[((nb+1)*ldq)+3*offset]);
710 :
711 159296 : q4 = _AVX512_NFMA(x4, h1, q4);
712 159296 : q4 = _AVX512_NFMA(y4, h2, q4);
713 159296 : _AVX512_STORE(&q[((nb+1)*ldq)+3*offset],q4);
714 :
715 :
716 318592 : h1 = _AVX512_SET1(hh[nb-1]);
717 318592 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
718 :
719 159296 : q1 = _AVX512_NFMA(x1, h1, q1);
720 159296 : _AVX512_STORE(&q[(nb+2)*ldq],q1);
721 :
722 318592 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
723 :
724 159296 : q2 = _AVX512_NFMA(x2, h1, q2);
725 159296 : _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
726 :
727 318592 : q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
728 :
729 159296 : q3 = _AVX512_NFMA(x3, h1, q3);
730 159296 : _AVX512_STORE(&q[((nb+2)*ldq)+2*offset],q3);
731 :
732 318592 : q4 = _AVX512_LOAD(&q[((nb+2)*ldq)+3*offset]);
733 :
734 159296 : q4 = _AVX512_NFMA(x4, h1, q4);
735 159296 : _AVX512_STORE(&q[((nb+2)*ldq)+3*offset],q4);
736 :
737 : }
738 :
739 : /**
740 : * Unrolled kernel that computes
741 : #ifdef DOUBLE_PRECISION_REAL
742 : * 24 rows of Q simultaneously, a
743 : #endif
744 : #ifdef DOUBLE_PRECISION_REAL
745 : * 48 rows of Q simultaneously, a
746 : #endif
747 : * matrix Vector product with two householder
748 : * vectors + a rank 1 update is performed
749 : */
750 : #ifdef DOUBLE_PRECISION_REAL
751 : __forceinline void hh_trafo_kernel_24_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
752 : #endif
753 : #ifdef SINGLE_PRECISION_REAL
754 : __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
755 : #endif
756 : {
757 : /////////////////////////////////////////////////////
758 : // Matrix Vector Multiplication, Q [4 x nb+3] * hh
759 : // hh contains four householder vectors
760 : /////////////////////////////////////////////////////
761 : int i;
762 :
763 0 : __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*3]);
764 0 : __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*2]);
765 0 : __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq]);
766 0 : __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[0]);
767 :
768 :
769 0 : __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
770 0 : __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
771 0 : __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[ldq+offset]);
772 0 : __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[0+offset]);
773 :
774 0 : __AVX512_DATATYPE a1_3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
775 0 : __AVX512_DATATYPE a2_3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
776 0 : __AVX512_DATATYPE a3_3 = _AVX512_LOAD(&q[ldq+2*offset]);
777 0 : __AVX512_DATATYPE a4_3 = _AVX512_LOAD(&q[0+2*offset]);
778 :
779 0 : __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
780 0 : __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
781 0 : __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
782 0 : __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
783 0 : __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
784 0 : __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
785 :
786 0 : __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
787 0 : w1 = _AVX512_FMA(a2_1, h_4_2, w1);
788 0 : w1 = _AVX512_FMA(a1_1, h_4_1, w1);
789 0 : __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
790 0 : z1 = _AVX512_FMA(a1_1, h_3_1, z1);
791 0 : __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
792 0 : __AVX512_DATATYPE x1 = a1_1;
793 :
794 0 : __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
795 0 : w2 = _AVX512_FMA(a2_2, h_4_2, w2);
796 0 : w2 = _AVX512_FMA(a1_2, h_4_1, w2);
797 0 : __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
798 0 : z2 = _AVX512_FMA(a1_2, h_3_1, z2);
799 0 : __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
800 0 : __AVX512_DATATYPE x2 = a1_2;
801 :
802 0 : __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
803 0 : w3 = _AVX512_FMA(a2_3, h_4_2, w3);
804 0 : w3 = _AVX512_FMA(a1_3, h_4_1, w3);
805 0 : __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
806 0 : z3 = _AVX512_FMA(a1_3, h_3_1, z3);
807 0 : __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
808 0 : __AVX512_DATATYPE x3 = a1_3;
809 :
810 : __AVX512_DATATYPE q1;
811 : __AVX512_DATATYPE q2;
812 : __AVX512_DATATYPE q3;
813 :
814 : __AVX512_DATATYPE h1;
815 : __AVX512_DATATYPE h2;
816 : __AVX512_DATATYPE h3;
817 : __AVX512_DATATYPE h4;
818 :
819 0 : for(i = 4; i < nb; i++)
820 : {
821 0 : h1 = _AVX512_SET1(hh[i-3]);
822 0 : h2 = _AVX512_SET1(hh[ldh+i-2]);
823 0 : h3 = _AVX512_SET1(hh[(ldh*2)+i-1]);
824 0 : h4 = _AVX512_SET1(hh[(ldh*3)+i]);
825 :
826 0 : q1 = _AVX512_LOAD(&q[i*ldq]);
827 0 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
828 0 : q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
829 :
830 0 : x1 = _AVX512_FMA(q1, h1, x1);
831 0 : y1 = _AVX512_FMA(q1, h2, y1);
832 0 : z1 = _AVX512_FMA(q1, h3, z1);
833 0 : w1 = _AVX512_FMA(q1, h4, w1);
834 :
835 0 : x2 = _AVX512_FMA(q2, h1, x2);
836 0 : y2 = _AVX512_FMA(q2, h2, y2);
837 0 : z2 = _AVX512_FMA(q2, h3, z2);
838 0 : w2 = _AVX512_FMA(q2, h4, w2);
839 :
840 0 : x3 = _AVX512_FMA(q3, h1, x3);
841 0 : y3 = _AVX512_FMA(q3, h2, y3);
842 0 : z3 = _AVX512_FMA(q3, h3, z3);
843 0 : w3 = _AVX512_FMA(q3, h4, w3);
844 :
845 : }
846 :
847 0 : h1 = _AVX512_SET1(hh[nb-3]);
848 0 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
849 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
850 :
851 0 : q1 = _AVX512_LOAD(&q[nb*ldq]);
852 0 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
853 0 : q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
854 :
855 0 : x1 = _AVX512_FMA(q1, h1, x1);
856 0 : y1 = _AVX512_FMA(q1, h2, y1);
857 0 : z1 = _AVX512_FMA(q1, h3, z1);
858 :
859 0 : x2 = _AVX512_FMA(q2, h1, x2);
860 0 : y2 = _AVX512_FMA(q2, h2, y2);
861 0 : z2 = _AVX512_FMA(q2, h3, z2);
862 :
863 0 : x3 = _AVX512_FMA(q3, h1, x3);
864 0 : y3 = _AVX512_FMA(q3, h2, y3);
865 0 : z3 = _AVX512_FMA(q3, h3, z3);
866 :
867 0 : h1 = _AVX512_SET1(hh[nb-2]);
868 0 : h2 = _AVX512_SET1(hh[(ldh*1)+nb-1]);
869 :
870 0 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
871 0 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
872 0 : q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
873 :
874 0 : x1 = _AVX512_FMA(q1, h1, x1);
875 0 : y1 = _AVX512_FMA(q1, h2, y1);
876 0 : x2 = _AVX512_FMA(q2, h1, x2);
877 0 : y2 = _AVX512_FMA(q2, h2, y2);
878 0 : x3 = _AVX512_FMA(q3, h1, x3);
879 0 : y3 = _AVX512_FMA(q3, h2, y3);
880 :
881 0 : h1 = _AVX512_SET1(hh[nb-1]);
882 :
883 0 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
884 0 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
885 0 : q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
886 :
887 0 : x1 = _AVX512_FMA(q1, h1, x1);
888 0 : x2 = _AVX512_FMA(q2, h1, x2);
889 0 : x3 = _AVX512_FMA(q3, h1, x3);
890 :
891 : /////////////////////////////////////////////////////
892 : // Rank-1 update of Q [8 x nb+3]
893 : /////////////////////////////////////////////////////
894 :
895 0 : __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
896 0 : __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
897 0 : __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
898 0 : __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
899 :
900 0 : __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(s_1_2);
901 0 : __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(s_1_3);
902 0 : __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(s_2_3);
903 0 : __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(s_1_4);
904 0 : __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(s_2_4);
905 0 : __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(s_3_4);
906 :
907 0 : h1 = tau1;
908 0 : x1 = _AVX512_MUL(x1, h1);
909 0 : x2 = _AVX512_MUL(x2, h1);
910 0 : x3 = _AVX512_MUL(x3, h1);
911 :
912 0 : h1 = tau2;
913 0 : h2 = _AVX512_MUL(h1, vs_1_2);
914 :
915 0 : y1 = _AVX512_FMSUB(y1, h1, _AVX512_MUL(x1,h2));
916 0 : y2 = _AVX512_FMSUB(y2, h1, _AVX512_MUL(x2,h2));
917 0 : y3 = _AVX512_FMSUB(y3, h1, _AVX512_MUL(x3,h2));
918 :
919 0 : h1 = tau3;
920 0 : h2 = _AVX512_MUL(h1, vs_1_3);
921 0 : h3 = _AVX512_MUL(h1, vs_2_3);
922 :
923 0 : z1 = _AVX512_FMSUB(z1, h1, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
924 0 : z2 = _AVX512_FMSUB(z2, h1, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
925 0 : z3 = _AVX512_FMSUB(z3, h1, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)));
926 :
927 0 : h1 = tau4;
928 0 : h2 = _AVX512_MUL(h1, vs_1_4);
929 0 : h3 = _AVX512_MUL(h1, vs_2_4);
930 0 : h4 = _AVX512_MUL(h1, vs_3_4);
931 :
932 0 : w1 = _AVX512_FMSUB(w1, h1, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
933 0 : w2 = _AVX512_FMSUB(w2, h1, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
934 0 : w3 = _AVX512_FMSUB(w3, h1, _AVX512_FMA(z3, h4, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
935 :
936 0 : q1 = _AVX512_LOAD(&q[0]);
937 0 : q1 = _AVX512_SUB(q1, w1);
938 : _AVX512_STORE(&q[0],q1);
939 :
940 0 : q2 = _AVX512_LOAD(&q[0+offset]);
941 0 : q2 = _AVX512_SUB(q2, w2);
942 0 : _AVX512_STORE(&q[0+offset],q2);
943 :
944 0 : q3 = _AVX512_LOAD(&q[0+2*offset]);
945 0 : q3 = _AVX512_SUB(q3, w3);
946 0 : _AVX512_STORE(&q[0+2*offset],q3);
947 :
948 :
949 0 : h4 = _AVX512_SET1(hh[(ldh*3)+1]);
950 0 : q1 = _AVX512_LOAD(&q[ldq]);
951 0 : q2 = _AVX512_LOAD(&q[ldq+offset]);
952 0 : q3 = _AVX512_LOAD(&q[ldq+2*offset]);
953 :
954 0 : q1 = _AVX512_SUB(q1, _AVX512_FMA(w1, h4, z1));
955 0 : _AVX512_STORE(&q[ldq],q1);
956 0 : q2 = _AVX512_SUB(q2, _AVX512_FMA(w2, h4, z2));
957 0 : _AVX512_STORE(&q[ldq+offset],q2);
958 0 : q3 = _AVX512_SUB(q3, _AVX512_FMA(w3, h4, z3));
959 0 : _AVX512_STORE(&q[ldq+2*offset],q3);
960 :
961 :
962 0 : h3 = _AVX512_SET1(hh[(ldh*2)+1]);
963 0 : h4 = _AVX512_SET1(hh[(ldh*3)+2]);
964 0 : q1 = _AVX512_LOAD(&q[ldq*2]);
965 0 : q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
966 0 : q3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
967 :
968 0 : q1 = _AVX512_SUB(q1, y1);
969 0 : q1 = _AVX512_NFMA(z1, h3, q1);
970 0 : q1 = _AVX512_NFMA(w1, h4, q1);
971 :
972 0 : _AVX512_STORE(&q[ldq*2],q1);
973 :
974 0 : q2 = _AVX512_SUB(q2, y2);
975 0 : q2 = _AVX512_NFMA(z2, h3, q2);
976 0 : q2 = _AVX512_NFMA(w2, h4, q2);
977 :
978 0 : _AVX512_STORE(&q[(ldq*2)+offset],q2);
979 :
980 0 : q3 = _AVX512_SUB(q3, y3);
981 0 : q3 = _AVX512_NFMA(z3, h3, q3);
982 0 : q3 = _AVX512_NFMA(w3, h4, q3);
983 :
984 0 : _AVX512_STORE(&q[(ldq*2)+2*offset],q3);
985 :
986 0 : h2 = _AVX512_SET1(hh[ldh+1]);
987 0 : h3 = _AVX512_SET1(hh[(ldh*2)+2]);
988 0 : h4 = _AVX512_SET1(hh[(ldh*3)+3]);
989 :
990 0 : q1 = _AVX512_LOAD(&q[ldq*3]);
991 :
992 0 : q1 = _AVX512_SUB(q1, x1);
993 0 : q1 = _AVX512_NFMA(y1, h2, q1);
994 0 : q1 = _AVX512_NFMA(z1, h3, q1);
995 0 : q1 = _AVX512_NFMA(w1, h4, q1);
996 :
997 0 : _AVX512_STORE(&q[ldq*3], q1);
998 :
999 0 : q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
1000 :
1001 0 : q2 = _AVX512_SUB(q2, x2);
1002 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1003 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1004 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1005 :
1006 0 : _AVX512_STORE(&q[(ldq*3)+offset], q2);
1007 :
1008 0 : q3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
1009 :
1010 0 : q3 = _AVX512_SUB(q3, x3);
1011 0 : q3 = _AVX512_NFMA(y3, h2, q3);
1012 0 : q3 = _AVX512_NFMA(z3, h3, q3);
1013 0 : q3 = _AVX512_NFMA(w3, h4, q3);
1014 :
1015 0 : _AVX512_STORE(&q[(ldq*3)+2*offset], q3);
1016 :
1017 0 : for (i = 4; i < nb; i++)
1018 : {
1019 0 : h1 = _AVX512_SET1(hh[i-3]);
1020 0 : h2 = _AVX512_SET1(hh[ldh+i-2]);
1021 0 : h3 = _AVX512_SET1(hh[(ldh*2)+i-1]);
1022 0 : h4 = _AVX512_SET1(hh[(ldh*3)+i]);
1023 :
1024 0 : q1 = _AVX512_LOAD(&q[i*ldq]);
1025 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1026 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1027 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1028 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1029 0 : _AVX512_STORE(&q[i*ldq],q1);
1030 :
1031 0 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
1032 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1033 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1034 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1035 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1036 0 : _AVX512_STORE(&q[(i*ldq)+offset],q2);
1037 :
1038 0 : q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
1039 0 : q3 = _AVX512_NFMA(x3, h1, q3);
1040 0 : q3 = _AVX512_NFMA(y3, h2, q3);
1041 0 : q3 = _AVX512_NFMA(z3, h3, q3);
1042 0 : q3 = _AVX512_NFMA(w3, h4, q3);
1043 0 : _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
1044 :
1045 : }
1046 :
1047 0 : h1 = _AVX512_SET1(hh[nb-3]);
1048 0 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
1049 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1050 0 : q1 = _AVX512_LOAD(&q[nb*ldq]);
1051 :
1052 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1053 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1054 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1055 :
1056 0 : _AVX512_STORE(&q[nb*ldq],q1);
1057 :
1058 0 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
1059 :
1060 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1061 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1062 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1063 :
1064 0 : _AVX512_STORE(&q[(nb*ldq)+offset],q2);
1065 :
1066 0 : q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
1067 :
1068 0 : q3 = _AVX512_NFMA(x3, h1, q3);
1069 0 : q3 = _AVX512_NFMA(y3, h2, q3);
1070 0 : q3 = _AVX512_NFMA(z3, h3, q3);
1071 :
1072 0 : _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
1073 :
1074 :
1075 0 : h1 = _AVX512_SET1(hh[nb-2]);
1076 0 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
1077 0 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
1078 :
1079 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1080 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1081 :
1082 0 : _AVX512_STORE(&q[(nb+1)*ldq],q1);
1083 :
1084 0 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
1085 :
1086 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1087 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1088 :
1089 0 : _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
1090 :
1091 0 : q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
1092 :
1093 0 : q3 = _AVX512_NFMA(x3, h1, q3);
1094 0 : q3 = _AVX512_NFMA(y3, h2, q3);
1095 :
1096 0 : _AVX512_STORE(&q[((nb+1)*ldq)+2*offset],q3);
1097 :
1098 :
1099 0 : h1 = _AVX512_SET1(hh[nb-1]);
1100 0 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
1101 :
1102 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1103 :
1104 0 : _AVX512_STORE(&q[(nb+2)*ldq],q1);
1105 :
1106 0 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
1107 :
1108 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1109 :
1110 0 : _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
1111 :
1112 0 : q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
1113 :
1114 0 : q3 = _AVX512_NFMA(x3, h1, q3);
1115 :
1116 0 : _AVX512_STORE(&q[((nb+2)*ldq)+2*offset],q3);
1117 :
1118 : }
1119 :
1120 : /**
1121 : * Unrolled kernel that computes
1122 : #ifdef DOUBLE_PRECISION_REAL
1123 : * 16 rows of Q simultaneously, a
1124 : #endif
1125 : #ifdef SINGLE_PRECISION_REAL
1126 : * 32 rows of Q simultaneously, a
1127 : #endif
1128 : * matrix Vector product with two householder
1129 : * vectors + a rank 1 update is performed
1130 : */
1131 : #ifdef DOUBLE_PRECISION_REAL
1132 : __forceinline void hh_trafo_kernel_16_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
1133 : #endif
1134 : #ifdef SINGLE_PRECISION_REAL
1135 : __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
1136 : #endif
1137 : {
1138 : /////////////////////////////////////////////////////
1139 : // Matrix Vector Multiplication, Q [4 x nb+3] * hh
1140 : // hh contains four householder vectors
1141 : /////////////////////////////////////////////////////
1142 : int i;
1143 :
1144 0 : __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*3]);
1145 0 : __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*2]);
1146 0 : __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq]);
1147 0 : __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[0]);
1148 :
1149 0 : __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
1150 0 : __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
1151 0 : __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[ldq+offset]);
1152 0 : __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[0+offset]);
1153 :
1154 0 : __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
1155 0 : __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
1156 0 : __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
1157 0 : __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
1158 0 : __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
1159 0 : __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
1160 :
1161 0 : __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
1162 0 : w1 = _AVX512_FMA(a2_1, h_4_2, w1);
1163 0 : w1 = _AVX512_FMA(a1_1, h_4_1, w1);
1164 0 : __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
1165 0 : z1 = _AVX512_FMA(a1_1, h_3_1, z1);
1166 0 : __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
1167 0 : __AVX512_DATATYPE x1 = a1_1;
1168 :
1169 : __AVX512_DATATYPE q1;
1170 : __AVX512_DATATYPE q2;
1171 :
1172 0 : __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
1173 0 : w2 = _AVX512_FMA(a2_2, h_4_2, w2);
1174 0 : w2 = _AVX512_FMA(a1_2, h_4_1, w2);
1175 0 : __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
1176 0 : z2 = _AVX512_FMA(a1_2, h_3_1, z2);
1177 0 : __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
1178 0 : __AVX512_DATATYPE x2 = a1_2;
1179 :
1180 : __AVX512_DATATYPE h1;
1181 : __AVX512_DATATYPE h2;
1182 : __AVX512_DATATYPE h3;
1183 : __AVX512_DATATYPE h4;
1184 :
1185 0 : for(i = 4; i < nb; i++)
1186 : {
1187 0 : h1 = _AVX512_SET1(hh[i-3]);
1188 0 : h2 = _AVX512_SET1(hh[ldh+i-2]);
1189 0 : h3 = _AVX512_SET1(hh[(ldh*2)+i-1]);
1190 0 : h4 = _AVX512_SET1(hh[(ldh*3)+i]);
1191 :
1192 0 : q1 = _AVX512_LOAD(&q[i*ldq]);
1193 0 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
1194 :
1195 0 : x1 = _AVX512_FMA(q1, h1, x1);
1196 0 : y1 = _AVX512_FMA(q1, h2, y1);
1197 0 : z1 = _AVX512_FMA(q1, h3, z1);
1198 0 : w1 = _AVX512_FMA(q1, h4, w1);
1199 :
1200 0 : x2 = _AVX512_FMA(q2, h1, x2);
1201 0 : y2 = _AVX512_FMA(q2, h2, y2);
1202 0 : z2 = _AVX512_FMA(q2, h3, z2);
1203 0 : w2 = _AVX512_FMA(q2, h4, w2);
1204 : }
1205 :
1206 0 : h1 = _AVX512_SET1(hh[nb-3]);
1207 0 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
1208 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1209 :
1210 0 : q1 = _AVX512_LOAD(&q[nb*ldq]);
1211 0 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
1212 :
1213 0 : x1 = _AVX512_FMA(q1, h1, x1);
1214 0 : y1 = _AVX512_FMA(q1, h2, y1);
1215 0 : z1 = _AVX512_FMA(q1, h3, z1);
1216 :
1217 0 : x2 = _AVX512_FMA(q2, h1, x2);
1218 0 : y2 = _AVX512_FMA(q2, h2, y2);
1219 0 : z2 = _AVX512_FMA(q2, h3, z2);
1220 :
1221 0 : h1 = _AVX512_SET1(hh[nb-2]);
1222 0 : h2 = _AVX512_SET1(hh[(ldh*1)+nb-1]);
1223 :
1224 0 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
1225 0 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
1226 :
1227 0 : x1 = _AVX512_FMA(q1, h1, x1);
1228 0 : y1 = _AVX512_FMA(q1, h2, y1);
1229 0 : x2 = _AVX512_FMA(q2, h1, x2);
1230 0 : y2 = _AVX512_FMA(q2, h2, y2);
1231 :
1232 0 : h1 = _AVX512_SET1(hh[nb-1]);
1233 :
1234 0 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
1235 0 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
1236 :
1237 0 : x1 = _AVX512_FMA(q1, h1, x1);
1238 0 : x2 = _AVX512_FMA(q2, h1, x2);
1239 :
1240 : /////////////////////////////////////////////////////
1241 : // Rank-1 update of Q [8 x nb+3]
1242 : /////////////////////////////////////////////////////
1243 :
1244 0 : __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
1245 0 : __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
1246 0 : __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
1247 0 : __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
1248 :
1249 0 : __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(s_1_2);
1250 0 : __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(s_1_3);
1251 0 : __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(s_2_3);
1252 0 : __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(s_1_4);
1253 0 : __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(s_2_4);
1254 0 : __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(s_3_4);
1255 :
1256 0 : h1 = tau1;
1257 0 : x1 = _AVX512_MUL(x1, h1);
1258 0 : x2 = _AVX512_MUL(x2, h1);
1259 :
1260 0 : h1 = tau2;
1261 0 : h2 = _AVX512_MUL(h1, vs_1_2);
1262 :
1263 0 : y1 = _AVX512_FMSUB(y1, h1, _AVX512_MUL(x1,h2));
1264 0 : y2 = _AVX512_FMSUB(y2, h1, _AVX512_MUL(x2,h2));
1265 :
1266 0 : h1 = tau3;
1267 0 : h2 = _AVX512_MUL(h1, vs_1_3);
1268 0 : h3 = _AVX512_MUL(h1, vs_2_3);
1269 :
1270 0 : z1 = _AVX512_FMSUB(z1, h1, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
1271 0 : z2 = _AVX512_FMSUB(z2, h1, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
1272 :
1273 0 : h1 = tau4;
1274 0 : h2 = _AVX512_MUL(h1, vs_1_4);
1275 0 : h3 = _AVX512_MUL(h1, vs_2_4);
1276 0 : h4 = _AVX512_MUL(h1, vs_3_4);
1277 :
1278 0 : w1 = _AVX512_FMSUB(w1, h1, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
1279 0 : w2 = _AVX512_FMSUB(w2, h1, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
1280 :
1281 0 : q1 = _AVX512_LOAD(&q[0]);
1282 0 : q1 = _AVX512_SUB(q1, w1);
1283 : _AVX512_STORE(&q[0],q1);
1284 :
1285 0 : q2 = _AVX512_LOAD(&q[0+offset]);
1286 0 : q2 = _AVX512_SUB(q2, w2);
1287 0 : _AVX512_STORE(&q[0+offset],q2);
1288 :
1289 :
1290 0 : h4 = _AVX512_SET1(hh[(ldh*3)+1]);
1291 0 : q1 = _AVX512_LOAD(&q[ldq]);
1292 0 : q2 = _AVX512_LOAD(&q[ldq+offset]);
1293 :
1294 0 : q1 = _AVX512_SUB(q1, _AVX512_FMA(w1, h4, z1));
1295 0 : _AVX512_STORE(&q[ldq],q1);
1296 0 : q2 = _AVX512_SUB(q2, _AVX512_FMA(w2, h4, z2));
1297 0 : _AVX512_STORE(&q[ldq+offset],q2);
1298 :
1299 0 : h3 = _AVX512_SET1(hh[(ldh*2)+1]);
1300 0 : h4 = _AVX512_SET1(hh[(ldh*3)+2]);
1301 0 : q1 = _AVX512_LOAD(&q[ldq*2]);
1302 0 : q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
1303 :
1304 0 : q1 = _AVX512_SUB(q1, y1);
1305 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1306 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1307 :
1308 0 : _AVX512_STORE(&q[ldq*2],q1);
1309 :
1310 0 : q2 = _AVX512_SUB(q2, y2);
1311 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1312 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1313 :
1314 0 : _AVX512_STORE(&q[(ldq*2)+offset],q2);
1315 :
1316 0 : h2 = _AVX512_SET1(hh[ldh+1]);
1317 0 : h3 = _AVX512_SET1(hh[(ldh*2)+2]);
1318 0 : h4 = _AVX512_SET1(hh[(ldh*3)+3]);
1319 :
1320 0 : q1 = _AVX512_LOAD(&q[ldq*3]);
1321 :
1322 0 : q1 = _AVX512_SUB(q1, x1);
1323 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1324 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1325 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1326 :
1327 0 : _AVX512_STORE(&q[ldq*3], q1);
1328 :
1329 0 : q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
1330 :
1331 0 : q2 = _AVX512_SUB(q2, x2);
1332 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1333 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1334 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1335 :
1336 0 : _AVX512_STORE(&q[(ldq*3)+offset], q2);
1337 :
1338 0 : for (i = 4; i < nb; i++)
1339 : {
1340 0 : h1 = _AVX512_SET1(hh[i-3]);
1341 0 : h2 = _AVX512_SET1(hh[ldh+i-2]);
1342 0 : h3 = _AVX512_SET1(hh[(ldh*2)+i-1]);
1343 0 : h4 = _AVX512_SET1(hh[(ldh*3)+i]);
1344 :
1345 0 : q1 = _AVX512_LOAD(&q[i*ldq]);
1346 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1347 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1348 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1349 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1350 0 : _AVX512_STORE(&q[i*ldq],q1);
1351 :
1352 0 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
1353 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1354 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1355 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1356 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1357 0 : _AVX512_STORE(&q[(i*ldq)+offset],q2);
1358 :
1359 : }
1360 :
1361 0 : h1 = _AVX512_SET1(hh[nb-3]);
1362 0 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
1363 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1364 0 : q1 = _AVX512_LOAD(&q[nb*ldq]);
1365 :
1366 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1367 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1368 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1369 :
1370 0 : _AVX512_STORE(&q[nb*ldq],q1);
1371 :
1372 0 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
1373 :
1374 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1375 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1376 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1377 :
1378 0 : _AVX512_STORE(&q[(nb*ldq)+offset],q2);
1379 :
1380 0 : h1 = _AVX512_SET1(hh[nb-2]);
1381 0 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
1382 0 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
1383 :
1384 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1385 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1386 :
1387 0 : _AVX512_STORE(&q[(nb+1)*ldq],q1);
1388 :
1389 0 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
1390 :
1391 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1392 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1393 :
1394 0 : _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
1395 :
1396 0 : h1 = _AVX512_SET1(hh[nb-1]);
1397 0 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
1398 :
1399 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1400 :
1401 0 : _AVX512_STORE(&q[(nb+2)*ldq],q1);
1402 :
1403 0 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
1404 :
1405 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1406 :
1407 0 : _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
1408 : }
1409 :
1410 : /**
1411 : * Unrolled kernel that computes
1412 : #ifdef DOUBLE_PRECISION_REAL
1413 : * 8 rows of Q simultaneously, a
1414 : #endif
1415 : #ifdef SINGLE_PRECISION_REAL
1416 : * 16 rows of Q simultaneously, a
1417 : #endif
1418 :
1419 : * matrix Vector product with two householder
1420 : * vectors + a rank 1 update is performed
1421 : */
1422 : #ifdef DOUBLE_PRECISION_REAL
1423 : __forceinline void hh_trafo_kernel_8_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
1424 : #endif
1425 : #ifdef SINGLE_PRECISION_REAL
1426 : __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
1427 : #endif
1428 : {
1429 : /////////////////////////////////////////////////////
1430 : // Matrix Vector Multiplication, Q [4 x nb+3] * hh
1431 : // hh contains four householder vectors
1432 : /////////////////////////////////////////////////////
1433 : int i;
1434 :
1435 318592 : __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*3]);
1436 318592 : __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*2]);
1437 318592 : __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq]);
1438 159296 : __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[0]);
1439 :
1440 318592 : __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
1441 318592 : __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
1442 318592 : __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
1443 318592 : __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
1444 318592 : __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
1445 318592 : __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
1446 :
1447 159296 : __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
1448 159296 : w1 = _AVX512_FMA(a2_1, h_4_2, w1);
1449 159296 : w1 = _AVX512_FMA(a1_1, h_4_1, w1);
1450 159296 : __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
1451 159296 : z1 = _AVX512_FMA(a1_1, h_3_1, z1);
1452 159296 : __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
1453 159296 : __AVX512_DATATYPE x1 = a1_1;
1454 :
1455 : __AVX512_DATATYPE q1;
1456 :
1457 : __AVX512_DATATYPE h1;
1458 : __AVX512_DATATYPE h2;
1459 : __AVX512_DATATYPE h3;
1460 : __AVX512_DATATYPE h4;
1461 :
1462 9717056 : for(i = 4; i < nb; i++)
1463 : {
1464 19115520 : h1 = _AVX512_SET1(hh[i-3]);
1465 19115520 : h2 = _AVX512_SET1(hh[ldh+i-2]);
1466 19115520 : h3 = _AVX512_SET1(hh[(ldh*2)+i-1]);
1467 19115520 : h4 = _AVX512_SET1(hh[(ldh*3)+i]);
1468 :
1469 19115520 : q1 = _AVX512_LOAD(&q[i*ldq]);
1470 :
1471 9557760 : x1 = _AVX512_FMA(q1, h1, x1);
1472 9557760 : y1 = _AVX512_FMA(q1, h2, y1);
1473 9557760 : z1 = _AVX512_FMA(q1, h3, z1);
1474 9557760 : w1 = _AVX512_FMA(q1, h4, w1);
1475 :
1476 : }
1477 :
1478 318592 : h1 = _AVX512_SET1(hh[nb-3]);
1479 318592 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
1480 318592 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1481 :
1482 318592 : q1 = _AVX512_LOAD(&q[nb*ldq]);
1483 :
1484 159296 : x1 = _AVX512_FMA(q1, h1, x1);
1485 159296 : y1 = _AVX512_FMA(q1, h2, y1);
1486 159296 : z1 = _AVX512_FMA(q1, h3, z1);
1487 :
1488 318592 : h1 = _AVX512_SET1(hh[nb-2]);
1489 318592 : h2 = _AVX512_SET1(hh[(ldh*1)+nb-1]);
1490 :
1491 318592 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
1492 :
1493 159296 : x1 = _AVX512_FMA(q1, h1, x1);
1494 159296 : y1 = _AVX512_FMA(q1, h2, y1);
1495 :
1496 318592 : h1 = _AVX512_SET1(hh[nb-1]);
1497 :
1498 318592 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
1499 :
1500 159296 : x1 = _AVX512_FMA(q1, h1, x1);
1501 :
1502 : /////////////////////////////////////////////////////
1503 : // Rank-1 update of Q [8 x nb+3]
1504 : /////////////////////////////////////////////////////
1505 :
1506 318592 : __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
1507 318592 : __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
1508 318592 : __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
1509 318592 : __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
1510 :
1511 159296 : __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(s_1_2);
1512 159296 : __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(s_1_3);
1513 159296 : __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(s_2_3);
1514 159296 : __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(s_1_4);
1515 159296 : __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(s_2_4);
1516 159296 : __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(s_3_4);
1517 :
1518 159296 : h1 = tau1;
1519 159296 : x1 = _AVX512_MUL(x1, h1);
1520 :
1521 159296 : h1 = tau2;
1522 159296 : h2 = _AVX512_MUL(h1, vs_1_2);
1523 :
1524 318592 : y1 = _AVX512_FMSUB(y1, h1, _AVX512_MUL(x1,h2));
1525 :
1526 159296 : h1 = tau3;
1527 159296 : h2 = _AVX512_MUL(h1, vs_1_3);
1528 159296 : h3 = _AVX512_MUL(h1, vs_2_3);
1529 :
1530 477888 : z1 = _AVX512_FMSUB(z1, h1, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
1531 :
1532 159296 : h1 = tau4;
1533 159296 : h2 = _AVX512_MUL(h1, vs_1_4);
1534 159296 : h3 = _AVX512_MUL(h1, vs_2_4);
1535 159296 : h4 = _AVX512_MUL(h1, vs_3_4);
1536 :
1537 637184 : w1 = _AVX512_FMSUB(w1, h1, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
1538 :
1539 159296 : q1 = _AVX512_LOAD(&q[0]);
1540 159296 : q1 = _AVX512_SUB(q1, w1);
1541 : _AVX512_STORE(&q[0],q1);
1542 :
1543 318592 : h4 = _AVX512_SET1(hh[(ldh*3)+1]);
1544 318592 : q1 = _AVX512_LOAD(&q[ldq]);
1545 :
1546 318592 : q1 = _AVX512_SUB(q1, _AVX512_FMA(w1, h4, z1));
1547 :
1548 159296 : _AVX512_STORE(&q[ldq],q1);
1549 :
1550 318592 : h3 = _AVX512_SET1(hh[(ldh*2)+1]);
1551 318592 : h4 = _AVX512_SET1(hh[(ldh*3)+2]);
1552 318592 : q1 = _AVX512_LOAD(&q[ldq*2]);
1553 :
1554 159296 : q1 = _AVX512_SUB(q1, y1);
1555 159296 : q1 = _AVX512_NFMA(z1, h3, q1);
1556 159296 : q1 = _AVX512_NFMA(w1, h4, q1);
1557 :
1558 159296 : _AVX512_STORE(&q[ldq*2],q1);
1559 :
1560 318592 : h2 = _AVX512_SET1(hh[ldh+1]);
1561 318592 : h3 = _AVX512_SET1(hh[(ldh*2)+2]);
1562 318592 : h4 = _AVX512_SET1(hh[(ldh*3)+3]);
1563 :
1564 318592 : q1 = _AVX512_LOAD(&q[ldq*3]);
1565 :
1566 159296 : q1 = _AVX512_SUB(q1, x1);
1567 159296 : q1 = _AVX512_NFMA(y1, h2, q1);
1568 159296 : q1 = _AVX512_NFMA(z1, h3, q1);
1569 159296 : q1 = _AVX512_NFMA(w1, h4, q1);
1570 :
1571 159296 : _AVX512_STORE(&q[ldq*3], q1);
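            : // Main update sketch (illustrative only): each remaining row i of
            : // Q is reduced by the combined contribution of all four reflectors,
            : //   q[i*ldq+j] -= x1[j]*hh[i-3] + y1[j]*hh[ldh+i-2]
            : //               + z1[j]*hh[(ldh*2)+i-1] + w1[j]*hh[(ldh*3)+i];
            : // rows 0..3 were handled explicitly above and rows nb..nb+2 are
            : // handled explicitly below.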
1572 :
1573 9717056 : for (i = 4; i < nb; i++)
1574 : {
1575 19115520 : h1 = _AVX512_SET1(hh[i-3]);
1576 19115520 : h2 = _AVX512_SET1(hh[ldh+i-2]);
1577 19115520 : h3 = _AVX512_SET1(hh[(ldh*2)+i-1]);
1578 19115520 : h4 = _AVX512_SET1(hh[(ldh*3)+i]);
1579 :
1580 19115520 : q1 = _AVX512_LOAD(&q[i*ldq]);
1581 9557760 : q1 = _AVX512_NFMA(x1, h1, q1);
1582 9557760 : q1 = _AVX512_NFMA(y1, h2, q1);
1583 9557760 : q1 = _AVX512_NFMA(z1, h3, q1);
1584 9557760 : q1 = _AVX512_NFMA(w1, h4, q1);
1585 9557760 : _AVX512_STORE(&q[i*ldq],q1);
1586 : }
1587 :
1588 318592 : h1 = _AVX512_SET1(hh[nb-3]);
1589 318592 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
1590 318592 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1591 318592 : q1 = _AVX512_LOAD(&q[nb*ldq]);
1592 :
1593 159296 : q1 = _AVX512_NFMA(x1, h1, q1);
1594 159296 : q1 = _AVX512_NFMA(y1, h2, q1);
1595 159296 : q1 = _AVX512_NFMA(z1, h3, q1);
1596 :
1597 159296 : _AVX512_STORE(&q[nb*ldq],q1);
1598 :
1599 318592 : h1 = _AVX512_SET1(hh[nb-2]);
1600 318592 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
1601 318592 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
1602 :
1603 159296 : q1 = _AVX512_NFMA(x1, h1, q1);
1604 159296 : q1 = _AVX512_NFMA(y1, h2, q1);
1605 :
1606 159296 : _AVX512_STORE(&q[(nb+1)*ldq],q1);
1607 :
1608 318592 : h1 = _AVX512_SET1(hh[nb-1]);
1609 318592 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
1610 :
1611 159296 : q1 = _AVX512_NFMA(x1, h1, q1);
1612 :
1613 159296 : _AVX512_STORE(&q[(nb+2)*ldq],q1);
1614 :
1615 :
1616 : }
1617 :
1618 :
1619 : #if 0
1620 : /**
1621 : * Unrolled kernel that computes
1622 : * 4 rows of Q simultaneously, a
1623 :       * matrix-vector product with four householder
1624 :       * vectors + a rank-1 update is performed
1625 : */
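            : // Note: this 4-lane variant is compiled out by the surrounding
            : // #if 0 and kept for reference only; despite the AVX512 name it
            : // is written with 256-bit _mm256_* intrinsics.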
1626 : __forceinline void hh_trafo_kernel_4_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
1627 : {
1628 : /////////////////////////////////////////////////////
1629 : // Matrix Vector Multiplication, Q [4 x nb+3] * hh
1630 : // hh contains four householder vectors
1631 : /////////////////////////////////////////////////////
1632 : int i;
1633 :
1634 : __m256d a1_1 = _mm256_load_pd(&q[ldq*3]);
1635 : __m256d a2_1 = _mm256_load_pd(&q[ldq*2]);
1636 : __m256d a3_1 = _mm256_load_pd(&q[ldq]);
1637 : __m256d a4_1 = _mm256_load_pd(&q[0]);
1638 :
1639 : __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]);
1640 : __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]);
1641 : __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]);
1642 : __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]);
1643 : __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]);
1644 : __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]);
1645 :
1646 : #ifdef __ELPA_USE_FMA__
1647 : __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1);
1648 : w1 = _mm256_FMA_pd(a2_1, h_4_2, w1);
1649 : w1 = _mm256_FMA_pd(a1_1, h_4_1, w1);
1650 : __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1);
1651 : z1 = _mm256_FMA_pd(a1_1, h_3_1, z1);
1652 : __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1);
1653 : __m256d x1 = a1_1;
1654 : #else
1655 : __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3));
1656 : w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2));
1657 : w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1));
1658 : __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2));
1659 : z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1));
1660 : __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1));
1661 : __m256d x1 = a1_1;
1662 : #endif
1663 :
1664 : __m256d q1;
1665 :
1666 : __m256d h1;
1667 : __m256d h2;
1668 : __m256d h3;
1669 : __m256d h4;
1670 :
1671 : for(i = 4; i < nb; i++)
1672 : {
1673 : h1 = _mm256_broadcast_sd(&hh[i-3]);
1674 : h2 = _mm256_broadcast_sd(&hh[ldh+i-2]);
1675 : h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]);
1676 : h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]);
1677 :
1678 : q1 = _mm256_load_pd(&q[i*ldq]);
1679 : #ifdef __ELPA_USE_FMA__
1680 : x1 = _mm256_FMA_pd(q1, h1, x1);
1681 : y1 = _mm256_FMA_pd(q1, h2, y1);
1682 : z1 = _mm256_FMA_pd(q1, h3, z1);
1683 : w1 = _mm256_FMA_pd(q1, h4, w1);
1684 : #else
1685 : x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
1686 : y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
1687 : z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3));
1688 : w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4));
1689 : #endif
1690 : }
1691 :
1692 : h1 = _mm256_broadcast_sd(&hh[nb-3]);
1693 : h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]);
1694 : h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]);
1695 : q1 = _mm256_load_pd(&q[nb*ldq]);
1696 : #ifdef __ELPA_USE_FMA__
1697 : x1 = _mm256_FMA_pd(q1, h1, x1);
1698 : y1 = _mm256_FMA_pd(q1, h2, y1);
1699 : z1 = _mm256_FMA_pd(q1, h3, z1);
1700 : #else
1701 : x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
1702 : y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
1703 : z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3));
1704 : #endif
1705 :
1706 : h1 = _mm256_broadcast_sd(&hh[nb-2]);
1707 :  h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]);
1708 : q1 = _mm256_load_pd(&q[(nb+1)*ldq]);
1709 : #ifdef __ELPA_USE_FMA__
1710 : x1 = _mm256_FMA_pd(q1, h1, x1);
1711 : y1 = _mm256_FMA_pd(q1, h2, y1);
1712 : #else
1713 : x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
1714 : y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
1715 : #endif
1716 :
1717 : h1 = _mm256_broadcast_sd(&hh[nb-1]);
1718 : q1 = _mm256_load_pd(&q[(nb+2)*ldq]);
1719 : #ifdef __ELPA_USE_FMA__
1720 : x1 = _mm256_FMA_pd(q1, h1, x1);
1721 : #else
1722 : x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
1723 : #endif
1724 :
1725 : /////////////////////////////////////////////////////
1726 : // Rank-1 update of Q [4 x nb+3]
1727 : /////////////////////////////////////////////////////
1728 :
1729 : __m256d tau1 = _mm256_broadcast_sd(&hh[0]);
1730 : __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]);
1731 : __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]);
1732 : __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]);
1733 :
1734 : __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2);
1735 : __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3);
1736 : __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3);
1737 : __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4);
1738 : __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4);
1739 : __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4);
1740 :
1741 : h1 = tau1;
1742 : x1 = _mm256_mul_pd(x1, h1);
1743 :
1744 : h1 = tau2;
1745 : h2 = _mm256_mul_pd(h1, vs_1_2);
1746 : #ifdef __ELPA_USE_FMA__
1747 : y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2));
1748 : #else
1749 : y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2));
1750 : #endif
1751 :
1752 : h1 = tau3;
1753 : h2 = _mm256_mul_pd(h1, vs_1_3);
1754 : h3 = _mm256_mul_pd(h1, vs_2_3);
1755 : #ifdef __ELPA_USE_FMA__
1756 : z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)));
1757 : #else
1758 : z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)));
1759 : #endif
1760 :
1761 : h1 = tau4;
1762 : h2 = _mm256_mul_pd(h1, vs_1_4);
1763 : h3 = _mm256_mul_pd(h1, vs_2_4);
1764 : h4 = _mm256_mul_pd(h1, vs_3_4);
1765 : #ifdef __ELPA_USE_FMA__
1766 : w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))));
1767 : #else
1768 : w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))));
1769 : #endif
1770 :
1771 : q1 = _mm256_load_pd(&q[0]);
1772 : q1 = _mm256_sub_pd(q1, w1);
1773 : _mm256_store_pd(&q[0],q1);
1774 :
1775 : h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]);
1776 : q1 = _mm256_load_pd(&q[ldq]);
1777 : #ifdef __ELPA_USE_FMA__
1778 : q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1));
1779 : #else
1780 : q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4)));
1781 : #endif
1782 : _mm256_store_pd(&q[ldq],q1);
1783 :
1784 : h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]);
1785 : h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]);
1786 : q1 = _mm256_load_pd(&q[ldq*2]);
1787 : #ifdef __ELPA_USE_FMA__
1788 : q1 = _mm256_sub_pd(q1, y1);
1789 : q1 = _mm256_NFMA_pd(z1, h3, q1);
1790 : q1 = _mm256_NFMA_pd(w1, h4, q1);
1791 : #else
1792 : q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))));
1793 : #endif
1794 : _mm256_store_pd(&q[ldq*2],q1);
1795 :
1796 : h2 = _mm256_broadcast_sd(&hh[ldh+1]);
1797 : h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]);
1798 : h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]);
1799 : q1 = _mm256_load_pd(&q[ldq*3]);
1800 : #ifdef __ELPA_USE_FMA__
1801 : q1 = _mm256_sub_pd(q1, x1);
1802 : q1 = _mm256_NFMA_pd(y1, h2, q1);
1803 : q1 = _mm256_NFMA_pd(z1, h3, q1);
1804 : q1 = _mm256_NFMA_pd(w1, h4, q1);
1805 : #else
1806 : q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))));
1807 : #endif
1808 : _mm256_store_pd(&q[ldq*3], q1);
1809 :
1810 : for (i = 4; i < nb; i++)
1811 : {
1812 : h1 = _mm256_broadcast_sd(&hh[i-3]);
1813 : h2 = _mm256_broadcast_sd(&hh[ldh+i-2]);
1814 : h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]);
1815 : h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]);
1816 :
1817 : q1 = _mm256_load_pd(&q[i*ldq]);
1818 : #ifdef __ELPA_USE_FMA__
1819 : q1 = _mm256_NFMA_pd(x1, h1, q1);
1820 : q1 = _mm256_NFMA_pd(y1, h2, q1);
1821 : q1 = _mm256_NFMA_pd(z1, h3, q1);
1822 : q1 = _mm256_NFMA_pd(w1, h4, q1);
1823 : #else
1824 : q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))));
1825 : #endif
1826 : _mm256_store_pd(&q[i*ldq],q1);
1827 : }
1828 :
1829 : h1 = _mm256_broadcast_sd(&hh[nb-3]);
1830 : h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]);
1831 : h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]);
1832 : q1 = _mm256_load_pd(&q[nb*ldq]);
1833 : #ifdef __ELPA_USE_FMA__
1834 : q1 = _mm256_NFMA_pd(x1, h1, q1);
1835 : q1 = _mm256_NFMA_pd(y1, h2, q1);
1836 : q1 = _mm256_NFMA_pd(z1, h3, q1);
1837 : #else
1838 : q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1)));
1839 : #endif
1840 : _mm256_store_pd(&q[nb*ldq],q1);
1841 :
1842 : h1 = _mm256_broadcast_sd(&hh[nb-2]);
1843 : h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]);
1844 : q1 = _mm256_load_pd(&q[(nb+1)*ldq]);
1845 : #ifdef __ELPA_USE_FMA__
1846 : q1 = _mm256_NFMA_pd(x1, h1, q1);
1847 : q1 = _mm256_NFMA_pd(y1, h2, q1);
1848 : #else
1849 : q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1)));
1850 : #endif
1851 : _mm256_store_pd(&q[(nb+1)*ldq],q1);
1852 :
1853 : h1 = _mm256_broadcast_sd(&hh[nb-1]);
1854 : q1 = _mm256_load_pd(&q[(nb+2)*ldq]);
1855 : #ifdef __ELPA_USE_FMA__
1856 : q1 = _mm256_NFMA_pd(x1, h1, q1);
1857 : #else
1858 : q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1));
1859 : #endif
1860 : _mm256_store_pd(&q[(nb+2)*ldq],q1);
1861 : }
1862 : #endif
1863 :