Line data Source code
1 : // This file is part of ELPA.
2 : //
3 : // The ELPA library was originally created by the ELPA consortium,
4 : // consisting of the following organizations:
5 : //
6 : // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : // Informatik,
10 : // - Technische Universität München, Lehrstuhl für Informatik mit
11 : // Schwerpunkt Wissenschaftliches Rechnen ,
12 : // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : // - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14 : // Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
15 : // and
16 : // - IBM Deutschland GmbH
17 : //
18 : // This particular source code file contains additions, changes and
19 : // enhancements authored by Intel Corporation which is not part of
20 : // the ELPA consortium.
21 : //
22 : // More information can be found here:
23 : // http://elpa.mpcdf.mpg.de/
24 : //
25 : // ELPA is free software: you can redistribute it and/or modify
26 : // it under the terms of the version 3 of the license of the
27 : // GNU Lesser General Public License as published by the Free
28 : // Software Foundation.
29 : //
30 : // ELPA is distributed in the hope that it will be useful,
31 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : // GNU Lesser General Public License for more details.
34 : //
35 : // You should have received a copy of the GNU Lesser General Public License
36 : // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : //
38 : // ELPA reflects a substantial effort on the part of the original
39 : // ELPA consortium, and we ask you to respect the spirit of the
40 : // license that we chose: i.e., please contribute any changes you
41 : // may have back to the original ELPA library distribution, and keep
42 : // any derivatives of ELPA under the same license that we chose for
43 : // the original distribution, the GNU Lesser General Public License.
44 : //
45 : // Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
46 : // --------------------------------------------------------------------------------------------------
47 :
48 : #include "config-f90.h"
49 : #include <x86intrin.h>
50 : #include <stdio.h>
51 : #include <stdlib.h>
52 :
53 : #define __forceinline __attribute__((always_inline)) static
54 :
55 : #ifdef DOUBLE_PRECISION_REAL
56 : #define offset 8
57 : #define __AVX512_DATATYPE __m512d
58 : #define _AVX512_LOAD _mm512_load_pd
59 : #define _AVX512_STORE _mm512_store_pd
60 : #define _AVX512_SET1 _mm512_set1_pd
61 : #define _AVX512_MUL _mm512_mul_pd
62 : #define _AVX512_ADD _mm512_add_pd
63 : #define _AVX512_SUB _mm512_sub_pd
64 :
65 : #ifdef HAVE_AVX512
66 :
67 : #define __ELPA_USE_FMA__
68 : #define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
69 : #define _mm512_NFMA_pd(a,b,c) _mm512_fnmadd_pd(a,b,c)
70 : #define _mm512_FMSUB_pd(a,b,c) _mm512_fmsub_pd(a,b,c)
71 :
72 : #endif
73 :
74 : #define _AVX512_FMA _mm512_FMA_pd
75 : #define _AVX512_NFMA _mm512_NFMA_pd
76 : #define _AVX512_FMSUB _mm512_FMSUB_pd
77 : #endif /* DOUBLE_PRECISION_REAL */
78 :
79 : #ifdef SINGLE_PRECISION_REAL
80 : #define offset 16
81 : #define __AVX512_DATATYPE __m512
82 : #define _AVX512_LOAD _mm512_load_ps
83 : #define _AVX512_STORE _mm512_store_ps
84 : #define _AVX512_SET1 _mm512_set1_ps
85 : #define _AVX512_MUL _mm512_mul_ps
86 : #define _AVX512_ADD _mm512_add_ps
87 : #define _AVX512_SUB _mm512_sub_ps
88 :
89 : #ifdef HAVE_AVX512
90 :
91 : #define __ELPA_USE_FMA__
92 : #define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
93 : #define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)
94 : #define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c)
95 : #endif
96 :
97 : #define _AVX512_FMA _mm512_FMA_ps
98 : #define _AVX512_NFMA _mm512_NFMA_ps
99 : #define _AVX512_FMSUB _mm512_FMSUB_ps
100 : #endif /* SINGLE_PRECISION_REAL */
101 :
102 :
103 :
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
static void hh_trafo_kernel_8_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_16_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_24_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_32_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);

void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);

// Fixed: prototype previously read "hexa_hh_trafo_real_avx512_6hv_single_"
// (trailing underscore), which matched neither the definition below nor the
// Fortran bind(C) name in the !f> interface block.
void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);

#endif
123 :
124 : /*
125 : !f>#if defined(HAVE_AVX512)
126 : !f> interface
127 : !f> subroutine hexa_hh_trafo_real_avx512_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
128 : !f> bind(C, name="hexa_hh_trafo_real_avx512_6hv_double")
129 : !f> use, intrinsic :: iso_c_binding
130 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
131 : !f> type(c_ptr), value :: q
132 : !f> real(kind=c_double) :: hh(pnb,6)
133 : !f> end subroutine
134 : !f> end interface
135 : !f>#endif
136 : */
137 :
138 :
139 : /*
140 : !f>#if defined(HAVE_AVX512)
141 : !f> interface
142 : !f> subroutine hexa_hh_trafo_real_avx512_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
143 : !f> bind(C, name="hexa_hh_trafo_real_avx512_6hv_single")
144 : !f> use, intrinsic :: iso_c_binding
145 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
146 : !f> type(c_ptr), value :: q
147 : !f> real(kind=c_float) :: hh(pnb,6)
148 : !f> end subroutine
149 : !f> end interface
150 : !f>#endif
151 : */
152 :
153 :
#ifdef DOUBLE_PRECISION_REAL
/*
 * Entry point called from Fortran (see the !f> interface comment above).
 * Applies a block of six Householder reflectors (the columns of hh, each of
 * length nb, leading dimension ldh) to nq rows of the matrix q (leading
 * dimension ldq).  The pairwise inner products of the reflectors are
 * precomputed once here and handed to the SIMD kernels, which process
 * 32 / 24 / 16 / 8 rows at a time in double precision.
 */
void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
/* Single-precision variant: kernels process 64 / 48 / 32 / 16 rows at a time. */
void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif


{
	int i;
	int nb = *pnb;    // length of each Householder vector
	int nq = *pldq;   // rows to transform; NOTE(review): taken from *pldq, not *pnq — confirm intended (matches other ELPA kernel files)
	int ldq = *pldq;  // leading dimension of q
	int ldh = *pldh;  // leading dimension of hh
	int worked_on;    // rows actually dispatched; consistency-checked only under WITH_DEBUG

	worked_on = 0;

	// calculating scalar products to compute
	// 6 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
	double scalarprods[15];
#endif
#ifdef SINGLE_PRECISION_REAL
	float scalarprods[15];
#endif

	// scalarprods[k] accumulates the inner product s_{i,j} = <v_i, v_j>
	// (i < j) of the six Householder vectors, packed as:
	//   [0]  = s12,
	//   [1]  = s13, [2]  = s23,
	//   [3]  = s14, [4]  = s24, [5]  = s34,
	//   [6]  = s15, [7]  = s25, [8]  = s35, [9]  = s45,
	//   [10] = s16, [11] = s26, [12] = s36, [13] = s46, [14] = s56
	// The vectors are stored shifted by one row per column; these initial
	// terms account for the implicit leading 1 of each reflector.
	scalarprods[0] = hh[(ldh+1)];
	scalarprods[1] = hh[(ldh*2)+2];
	scalarprods[2] = hh[(ldh*2)+1];
	scalarprods[3] = hh[(ldh*3)+3];
	scalarprods[4] = hh[(ldh*3)+2];
	scalarprods[5] = hh[(ldh*3)+1];
	scalarprods[6] = hh[(ldh*4)+4];
	scalarprods[7] = hh[(ldh*4)+3];
	scalarprods[8] = hh[(ldh*4)+2];
	scalarprods[9] = hh[(ldh*4)+1];
	scalarprods[10] = hh[(ldh*5)+5];
	scalarprods[11] = hh[(ldh*5)+4];
	scalarprods[12] = hh[(ldh*5)+3];
	scalarprods[13] = hh[(ldh*5)+2];
	scalarprods[14] = hh[(ldh*5)+1];

	// Peeled head iterations of the dot-product loop below (only the
	// products whose indices are already in range exist at each step).
	// loop counter = 2
	scalarprods[0] += hh[1] * hh[(2+ldh)];
	scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];

	// loop counter = 3
	scalarprods[0] += hh[2] * hh[(3+ldh)];
	scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];

	scalarprods[1] += hh[1] * hh[3+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];

	// loop counter = 4
	scalarprods[0] += hh[3] * hh[(4+ldh)];
	scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];

	scalarprods[1] += hh[2] * hh[4+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];

	scalarprods[3] += hh[1] * hh[4+(ldh*3)];
	scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
	scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];

	// loop counter = 5
	scalarprods[0] += hh[4] * hh[(5+ldh)];
	scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];

	scalarprods[1] += hh[3] * hh[5+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];

	scalarprods[3] += hh[2] * hh[5+(ldh*3)];
	scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
	scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];

	scalarprods[6] += hh[1] * hh[5+(ldh*4)];
	scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];

	// Steady state: all 15 running dot products in a single pass.
	// #pragma ivdep is an Intel-compiler hint (no loop-carried dependence);
	// other compilers silently ignore it.
#pragma ivdep
	for (i = 6; i < nb; i++)
	{
		scalarprods[0] += hh[i-1] * hh[(i+ldh)];
		scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
		scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
		scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
		scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];

		scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
		scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
		scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
		scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];

		scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
		scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
		scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];

		scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
		scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];

		scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
	}


	// Production level kernel calls with padding:
	// bulk of the rows goes through the widest kernel ...
#ifdef DOUBLE_PRECISION_REAL
	for (i = 0; i < nq-24; i+=32)
	{
		hh_trafo_kernel_32_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 32;
	}
#endif
#ifdef SINGLE_PRECISION_REAL
	for (i = 0; i < nq-48; i+=64)
	{
		hh_trafo_kernel_64_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 64;
	}
#endif
	if (nq == i)
	{
		return;
	}
	// ... and the remainder (nq assumed to be a multiple of the narrowest
	// kernel width) is handled by exactly one of the narrower kernels below.
#ifdef DOUBLE_PRECISION_REAL
	if (nq-i == 24)
	{
		hh_trafo_kernel_24_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 24;
	}
#endif

#ifdef SINGLE_PRECISION_REAL
	if (nq-i ==48)
	{
		hh_trafo_kernel_48_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 48;
	}
#endif

#ifdef DOUBLE_PRECISION_REAL
	if (nq-i == 16)
	{
		hh_trafo_kernel_16_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 16;
	}
#endif

#ifdef SINGLE_PRECISION_REAL
	if (nq-i ==32)
	{
		hh_trafo_kernel_32_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 32;
	}
#endif

#ifdef DOUBLE_PRECISION_REAL
	if (nq-i == 8)
	{
		hh_trafo_kernel_8_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 8;
	}
#endif

#ifdef SINGLE_PRECISION_REAL
	if (nq-i == 16)
	{
		hh_trafo_kernel_16_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 16;
	}
#endif

#ifdef WITH_DEBUG
	if (worked_on != nq)
	{
		printf("ERROR in avx512 kernel\n");
		abort();
	}
#endif
}
352 :
353 : /**
354 : * Unrolled kernel that computes
355 : #ifdef DOUBLE_PRECISION_REAL
356 : * 8 rows of Q simultaneously, a
357 : #endif
358 : #ifdef SINGLE_PRECISION_REAL
359 : * 16 rows of Q simultaneously, a
360 : #endif
 * matrix Vector product with six householder
 * vectors + a rank 1 update is performed
363 : */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif

{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [8 x nb+5] * hh
	// hh contains six householder vectors
	/////////////////////////////////////////////////////
	int i;

	// First six rows of this column block of q.  Householder vector j is
	// stored shifted down by j-1 rows, so row r (= a{6-r}_1) pairs with
	// element r - (j-1) of vector j in the accumulations below.
	__AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
	__AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
	__AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
	__AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
	__AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
	__AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);

	// h_j_k = element of Householder vector j multiplying row k's
	// contribution (broadcast to all SIMD lanes).
	__AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
	__AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
	__AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
	__AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
	__AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);

	// t1 = running product q^T * v6
	// register __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
	__AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);

	t1 = _AVX512_FMA(a4_1, h_6_4, t1);
	t1 = _AVX512_FMA(a3_1, h_6_3, t1);
	t1 = _AVX512_FMA(a2_1, h_6_2, t1);
	t1 = _AVX512_FMA(a1_1, h_6_1, t1);

	__AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
	__AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
	__AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
	__AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);

	// v1 = running product q^T * v5
	// register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
	__AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);

	v1 = _AVX512_FMA(a3_1, h_5_3, v1);
	v1 = _AVX512_FMA(a2_1, h_5_2, v1);
	v1 = _AVX512_FMA(a1_1, h_5_1, v1);

	__AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
	__AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
	__AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);

	// w1 = running product q^T * v4
	// register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
	__AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);

	w1 = _AVX512_FMA(a2_1, h_4_2, w1);
	w1 = _AVX512_FMA(a1_1, h_4_1, w1);

	__AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
	__AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
	__AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);

	// z1 = running product q^T * v3
	// register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
	__AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);

	z1 = _AVX512_FMA(a1_1, h_3_1, z1);
	// y1 = running product q^T * v2
	// register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
	__AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);

	// x1 = running product q^T * v1
	// register __AVX512_DATATYPE x1 = a1_1;
	__AVX512_DATATYPE x1 = a1_1;

	__AVX512_DATATYPE q1;

	__AVX512_DATATYPE h1;
	__AVX512_DATATYPE h2;
	__AVX512_DATATYPE h3;
	__AVX512_DATATYPE h4;
	__AVX512_DATATYPE h5;
	__AVX512_DATATYPE h6;

	// Steady state: each row i of q contributes to all six products
	// (vector j is indexed at i - (j-1) relative to its own start).
	for(i = 6; i < nb; i++)
	{
		h1 = _AVX512_SET1(hh[i-5]);
		q1 = _AVX512_LOAD(&q[i*ldq]);

		x1 = _AVX512_FMA(q1, h1, x1);

		h2 = _AVX512_SET1(hh[ldh+i-4]);

		y1 = _AVX512_FMA(q1, h2, y1);
		h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);

		z1 = _AVX512_FMA(q1, h3, z1);

		h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);

		w1 = _AVX512_FMA(q1, h4, w1);

		h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);

		v1 = _AVX512_FMA(q1, h5, v1);

		h6 = _AVX512_SET1(hh[(ldh*5)+i]);

		t1 = _AVX512_FMA(q1, h6, t1);
	}

	// Tail rows nb .. nb+4: each row lies past the end of one more vector,
	// so one fewer product is updated per step.
	h1 = _AVX512_SET1(hh[nb-5]);
	q1 = _AVX512_LOAD(&q[nb*ldq]);

	x1 = _AVX512_FMA(q1, h1, x1);

	h2 = _AVX512_SET1(hh[ldh+nb-4]);

	y1 = _AVX512_FMA(q1, h2, y1);

	h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);

	z1 = _AVX512_FMA(q1, h3, z1);

	h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);

	w1 = _AVX512_FMA(q1, h4, w1);

	h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);

	v1 = _AVX512_FMA(q1, h5, v1);

	h1 = _AVX512_SET1(hh[nb-4]);

	q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);

	x1 = _AVX512_FMA(q1, h1, x1);

	h2 = _AVX512_SET1(hh[ldh+nb-3]);

	y1 = _AVX512_FMA(q1, h2, y1);

	h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);

	z1 = _AVX512_FMA(q1, h3, z1);

	h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);

	w1 = _AVX512_FMA(q1, h4, w1);

	h1 = _AVX512_SET1(hh[nb-3]);
	q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);

	x1 = _AVX512_FMA(q1, h1, x1);

	h2 = _AVX512_SET1(hh[ldh+nb-2]);

	y1 = _AVX512_FMA(q1, h2, y1);

	h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);

	z1 = _AVX512_FMA(q1, h3, z1);

	h1 = _AVX512_SET1(hh[nb-2]);
	q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);

	x1 = _AVX512_FMA(q1, h1, x1);

	h2 = _AVX512_SET1(hh[ldh+nb-1]);

	y1 = _AVX512_FMA(q1, h2, y1);

	h1 = _AVX512_SET1(hh[nb-1]);
	q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);

	x1 = _AVX512_FMA(q1, h1, x1);

	/////////////////////////////////////////////////////
	// Apply tau, correct wrong calculation using pre-calculated scalar products
	// (x1..t1 are orthogonalized against each other via scalarprods[],
	// see the index map in the driver routine above).
	/////////////////////////////////////////////////////

	__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
	x1 = _AVX512_MUL(x1, tau1);

	__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
	__AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
	h2 = _AVX512_MUL(tau2, vs_1_2);

	y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));

	__AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
	__AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
	__AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);

	h2 = _AVX512_MUL(tau3, vs_1_3);
	h3 = _AVX512_MUL(tau3, vs_2_3);

	z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));

	__AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
	__AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
	__AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);

	h2 = _AVX512_MUL(tau4, vs_1_4);
	h3 = _AVX512_MUL(tau4, vs_2_4);

	__AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
	h4 = _AVX512_MUL(tau4, vs_3_4);

	w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));

	__AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
	__AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
	__AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);

	h2 = _AVX512_MUL(tau5, vs_1_5);
	h3 = _AVX512_MUL(tau5, vs_2_5);

	__AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
	__AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);

	h4 = _AVX512_MUL(tau5, vs_3_5);
	h5 = _AVX512_MUL(tau5, vs_4_5);

	v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));

	__AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
	__AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
	__AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
	h2 = _AVX512_MUL(tau6, vs_1_6);
	h3 = _AVX512_MUL(tau6, vs_2_6);

	__AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
	__AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
	__AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);

	h4 = _AVX512_MUL(tau6, vs_3_6);
	h5 = _AVX512_MUL(tau6, vs_4_6);
	h6 = _AVX512_MUL(tau6, vs_5_6);

	t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));

	/////////////////////////////////////////////////////
	// Rank-1 update of Q [8 x nb+5]
	// (mirror of the accumulation phase: subtract v_j * (q^T v_j) from
	// each row, one vector dropping in per row for the first six rows,
	// one dropping out per row for the last five).
	/////////////////////////////////////////////////////

	q1 = _AVX512_LOAD(&q[0]);
	q1 = _AVX512_SUB(q1, t1);
	_AVX512_STORE(&q[0],q1);

	h6 = _AVX512_SET1(hh[(ldh*5)+1]);
	q1 = _AVX512_LOAD(&q[ldq]);
	q1 = _AVX512_SUB(q1, v1);

	q1 = _AVX512_NFMA(t1, h6, q1);

	_AVX512_STORE(&q[ldq],q1);

	h5 = _AVX512_SET1(hh[(ldh*4)+1]);
	q1 = _AVX512_LOAD(&q[ldq*2]);
	q1 = _AVX512_SUB(q1, w1);

	q1 = _AVX512_NFMA(v1, h5, q1);

	h6 = _AVX512_SET1(hh[(ldh*5)+2]);

	q1 = _AVX512_NFMA(t1, h6, q1);

	_AVX512_STORE(&q[ldq*2],q1);

	h4 = _AVX512_SET1(hh[(ldh*3)+1]);
	q1 = _AVX512_LOAD(&q[ldq*3]);
	q1 = _AVX512_SUB(q1, z1);

	q1 = _AVX512_NFMA(w1, h4, q1);

	h5 = _AVX512_SET1(hh[(ldh*4)+2]);

	q1 = _AVX512_NFMA(v1, h5, q1);

	h6 = _AVX512_SET1(hh[(ldh*5)+3]);

	q1 = _AVX512_NFMA(t1, h6, q1);

	_AVX512_STORE(&q[ldq*3],q1);

	h3 = _AVX512_SET1(hh[(ldh*2)+1]);
	q1 = _AVX512_LOAD(&q[ldq*4]);
	q1 = _AVX512_SUB(q1, y1);

	q1 = _AVX512_NFMA(z1, h3, q1);

	h4 = _AVX512_SET1(hh[(ldh*3)+2]);

	q1 = _AVX512_NFMA(w1, h4, q1);

	h5 = _AVX512_SET1(hh[(ldh*4)+3]);

	q1 = _AVX512_NFMA(v1, h5, q1);

	h6 = _AVX512_SET1(hh[(ldh*5)+4]);

	q1 = _AVX512_NFMA(t1, h6, q1);

	_AVX512_STORE(&q[ldq*4],q1);

	h2 = _AVX512_SET1(hh[(ldh)+1]);
	q1 = _AVX512_LOAD(&q[ldq*5]);
	q1 = _AVX512_SUB(q1, x1);

	q1 = _AVX512_NFMA(y1, h2, q1);

	h3 = _AVX512_SET1(hh[(ldh*2)+2]);

	q1 = _AVX512_NFMA(z1, h3, q1);

	h4 = _AVX512_SET1(hh[(ldh*3)+3]);

	q1 = _AVX512_NFMA(w1, h4, q1);

	h5 = _AVX512_SET1(hh[(ldh*4)+4]);

	q1 = _AVX512_NFMA(v1, h5, q1);

	h6 = _AVX512_SET1(hh[(ldh*5)+5]);

	q1 = _AVX512_NFMA(t1, h6, q1);

	_AVX512_STORE(&q[ldq*5],q1);

	// Steady state of the update: all six vectors touch row i.
	for (i = 6; i < nb; i++)
	{
		q1 = _AVX512_LOAD(&q[i*ldq]);
		h1 = _AVX512_SET1(hh[i-5]);

		q1 = _AVX512_NFMA(x1, h1, q1);

		h2 = _AVX512_SET1(hh[ldh+i-4]);

		q1 = _AVX512_NFMA(y1, h2, q1);

		h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);

		q1 = _AVX512_NFMA(z1, h3, q1);

		h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);

		q1 = _AVX512_NFMA(w1, h4, q1);

		h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);

		q1 = _AVX512_NFMA(v1, h5, q1);

		h6 = _AVX512_SET1(hh[(ldh*5)+i]);

		q1 = _AVX512_NFMA(t1, h6, q1);

		_AVX512_STORE(&q[i*ldq],q1);
	}

	// Tail rows nb .. nb+4: one vector drops out per row.
	h1 = _AVX512_SET1(hh[nb-5]);
	q1 = _AVX512_LOAD(&q[nb*ldq]);

	q1 = _AVX512_NFMA(x1, h1, q1);

	h2 = _AVX512_SET1(hh[ldh+nb-4]);

	q1 = _AVX512_NFMA(y1, h2, q1);

	h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);

	q1 = _AVX512_NFMA(z1, h3, q1);

	h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);

	q1 = _AVX512_NFMA(w1, h4, q1);

	h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);

	q1 = _AVX512_NFMA(v1, h5, q1);

	_AVX512_STORE(&q[nb*ldq],q1);

	h1 = _AVX512_SET1(hh[nb-4]);
	q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);

	q1 = _AVX512_NFMA(x1, h1, q1);

	h2 = _AVX512_SET1(hh[ldh+nb-3]);

	q1 = _AVX512_NFMA(y1, h2, q1);

	h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);

	q1 = _AVX512_NFMA(z1, h3, q1);

	h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);

	q1 = _AVX512_NFMA(w1, h4, q1);

	_AVX512_STORE(&q[(nb+1)*ldq],q1);

	h1 = _AVX512_SET1(hh[nb-3]);
	q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);

	q1 = _AVX512_NFMA(x1, h1, q1);

	h2 = _AVX512_SET1(hh[ldh+nb-2]);

	q1 = _AVX512_NFMA(y1, h2, q1);

	h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);

	q1 = _AVX512_NFMA(z1, h3, q1);

	_AVX512_STORE(&q[(nb+2)*ldq],q1);

	h1 = _AVX512_SET1(hh[nb-2]);
	q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);

	q1 = _AVX512_NFMA(x1, h1, q1);

	h2 = _AVX512_SET1(hh[ldh+nb-1]);

	q1 = _AVX512_NFMA(y1, h2, q1);

	_AVX512_STORE(&q[(nb+3)*ldq],q1);

	h1 = _AVX512_SET1(hh[nb-1]);
	q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);

	q1 = _AVX512_NFMA(x1, h1, q1);

	_AVX512_STORE(&q[(nb+4)*ldq],q1);
}
795 :
796 : /**
797 : * Unrolled kernel that computes
798 : #ifdef DOUBLE_PRECISION_REAL
799 : * 16 rows of Q simultaneously, a
800 : #endif
801 : #ifdef SINGLE_PRECISION_REAL
802 : * 32 rows of Q simultaneously, a
803 : #endif
 * matrix Vector product with six householder
 * vectors + a rank 1 update is performed
806 : */
807 : #ifdef DOUBLE_PRECISION_REAL
808 : __forceinline void hh_trafo_kernel_16_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
809 : #endif
810 : #ifdef SINGLE_PRECISION_REAL
811 : __forceinline void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
812 : #endif
813 : {
814 : /////////////////////////////////////////////////////
815 : // Matrix Vector Multiplication, Q [8 x nb+3] * hh
816 : // hh contains four householder vectors
817 : /////////////////////////////////////////////////////
818 : int i;
819 :
820 0 : __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
821 0 : __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
822 0 : __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
823 0 : __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
824 0 : __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
825 0 : __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);
826 :
827 0 : __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
828 0 : __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
829 0 : __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
830 0 : __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
831 0 : __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);
832 :
833 : // register__AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
834 0 : __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
835 :
836 0 : t1 = _AVX512_FMA(a4_1, h_6_4, t1);
837 0 : t1 = _AVX512_FMA(a3_1, h_6_3, t1);
838 0 : t1 = _AVX512_FMA(a2_1, h_6_2, t1);
839 0 : t1 = _AVX512_FMA(a1_1, h_6_1, t1);
840 :
841 0 : __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
842 0 : __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
843 0 : __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
844 0 : __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
845 :
846 : // register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
847 0 : __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
848 :
849 0 : v1 = _AVX512_FMA(a3_1, h_5_3, v1);
850 0 : v1 = _AVX512_FMA(a2_1, h_5_2, v1);
851 0 : v1 = _AVX512_FMA(a1_1, h_5_1, v1);
852 :
853 0 : __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
854 0 : __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
855 0 : __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
856 :
857 : // register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
858 0 : __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
859 :
860 0 : w1 = _AVX512_FMA(a2_1, h_4_2, w1);
861 0 : w1 = _AVX512_FMA(a1_1, h_4_1, w1);
862 :
863 0 : __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
864 0 : __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
865 0 : __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
866 :
867 : // register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
868 0 : __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
869 :
870 0 : z1 = _AVX512_FMA(a1_1, h_3_1, z1);
871 : // register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
872 0 : __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
873 :
874 :
875 : // register __AVX512_DATATYPE x1 = a1_1;
876 0 : __AVX512_DATATYPE x1 = a1_1;
877 :
878 :
879 0 : __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
880 0 : __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
881 0 : __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
882 0 : __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
883 0 : __AVX512_DATATYPE a5_2 = _AVX512_LOAD(&q[(ldq)+offset]);
884 0 : __AVX512_DATATYPE a6_2 = _AVX512_LOAD(&q[0+offset]);
885 :
886 : // register __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
887 0 : __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
888 :
889 0 : t2 = _AVX512_FMA(a4_2, h_6_4, t2);
890 0 : t2 = _AVX512_FMA(a3_2, h_6_3, t2);
891 0 : t2 = _AVX512_FMA(a2_2, h_6_2, t2);
892 0 : t2 = _AVX512_FMA(a1_2, h_6_1, t2);
893 :
894 : // register __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
895 0 : __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
896 :
897 0 : v2 = _AVX512_FMA(a3_2, h_5_3, v2);
898 0 : v2 = _AVX512_FMA(a2_2, h_5_2, v2);
899 0 : v2 = _AVX512_FMA(a1_2, h_5_1, v2);
900 :
901 : // register __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
902 0 : __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
903 :
904 0 : w2 = _AVX512_FMA(a2_2, h_4_2, w2);
905 0 : w2 = _AVX512_FMA(a1_2, h_4_1, w2);
906 :
907 : // register __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
908 0 : __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
909 :
910 0 : z2 = _AVX512_FMA(a1_2, h_3_1, z2);
911 : // register __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
912 0 : __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
913 :
914 :
915 : // register __AVX512_DATATYPE x2 = a1_2;
916 0 : __AVX512_DATATYPE x2 = a1_2;
917 :
918 : __AVX512_DATATYPE q1;
919 : __AVX512_DATATYPE q2;
920 :
921 : __AVX512_DATATYPE h1;
922 : __AVX512_DATATYPE h2;
923 : __AVX512_DATATYPE h3;
924 : __AVX512_DATATYPE h4;
925 : __AVX512_DATATYPE h5;
926 : __AVX512_DATATYPE h6;
927 :
928 0 : for(i = 6; i < nb; i++)
929 : {
930 0 : h1 = _AVX512_SET1(hh[i-5]);
931 0 : q1 = _AVX512_LOAD(&q[i*ldq]);
932 0 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
933 :
934 0 : x1 = _AVX512_FMA(q1, h1, x1);
935 0 : x2 = _AVX512_FMA(q2, h1, x2);
936 :
937 0 : h2 = _AVX512_SET1(hh[ldh+i-4]);
938 :
939 0 : y1 = _AVX512_FMA(q1, h2, y1);
940 0 : y2 = _AVX512_FMA(q2, h2, y2);
941 :
942 0 : h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
943 :
944 0 : z1 = _AVX512_FMA(q1, h3, z1);
945 0 : z2 = _AVX512_FMA(q2, h3, z2);
946 :
947 0 : h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
948 :
949 0 : w1 = _AVX512_FMA(q1, h4, w1);
950 0 : w2 = _AVX512_FMA(q2, h4, w2);
951 :
952 0 : h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
953 :
954 0 : v1 = _AVX512_FMA(q1, h5, v1);
955 0 : v2 = _AVX512_FMA(q2, h5, v2);
956 :
957 0 : h6 = _AVX512_SET1(hh[(ldh*5)+i]);
958 :
959 0 : t1 = _AVX512_FMA(q1, h6, t1);
960 0 : t2 = _AVX512_FMA(q2, h6, t2);
961 : }
962 :
963 0 : h1 = _AVX512_SET1(hh[nb-5]);
964 0 : q1 = _AVX512_LOAD(&q[nb*ldq]);
965 0 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
966 :
967 0 : x1 = _AVX512_FMA(q1, h1, x1);
968 0 : x2 = _AVX512_FMA(q2, h1, x2);
969 :
970 0 : h2 = _AVX512_SET1(hh[ldh+nb-4]);
971 :
972 0 : y1 = _AVX512_FMA(q1, h2, y1);
973 0 : y2 = _AVX512_FMA(q2, h2, y2);
974 :
975 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
976 :
977 0 : z1 = _AVX512_FMA(q1, h3, z1);
978 0 : z2 = _AVX512_FMA(q2, h3, z2);
979 :
980 0 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
981 :
982 0 : w1 = _AVX512_FMA(q1, h4, w1);
983 0 : w2 = _AVX512_FMA(q2, h4, w2);
984 :
985 0 : h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
986 :
987 0 : v1 = _AVX512_FMA(q1, h5, v1);
988 0 : v2 = _AVX512_FMA(q2, h5, v2);
989 :
990 0 : h1 = _AVX512_SET1(hh[nb-4]);
991 :
992 0 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
993 0 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
994 :
995 0 : x1 = _AVX512_FMA(q1, h1, x1);
996 0 : x2 = _AVX512_FMA(q2, h1, x2);
997 :
998 0 : h2 = _AVX512_SET1(hh[ldh+nb-3]);
999 :
1000 0 : y1 = _AVX512_FMA(q1, h2, y1);
1001 0 : y2 = _AVX512_FMA(q2, h2, y2);
1002 :
1003 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
1004 :
1005 0 : z1 = _AVX512_FMA(q1, h3, z1);
1006 0 : z2 = _AVX512_FMA(q2, h3, z2);
1007 :
1008 0 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
1009 :
1010 0 : w1 = _AVX512_FMA(q1, h4, w1);
1011 0 : w2 = _AVX512_FMA(q2, h4, w2);
1012 :
1013 0 : h1 = _AVX512_SET1(hh[nb-3]);
1014 0 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
1015 0 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
1016 :
1017 0 : x1 = _AVX512_FMA(q1, h1, x1);
1018 0 : x2 = _AVX512_FMA(q2, h1, x2);
1019 :
1020 0 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
1021 :
1022 0 : y1 = _AVX512_FMA(q1, h2, y1);
1023 0 : y2 = _AVX512_FMA(q2, h2, y2);
1024 :
1025 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1026 :
1027 0 : z1 = _AVX512_FMA(q1, h3, z1);
1028 0 : z2 = _AVX512_FMA(q2, h3, z2);
1029 :
1030 0 : h1 = _AVX512_SET1(hh[nb-2]);
1031 0 : q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
1032 0 : q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
1033 :
1034 0 : x1 = _AVX512_FMA(q1, h1, x1);
1035 0 : x2 = _AVX512_FMA(q2, h1, x2);
1036 :
1037 0 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
1038 :
1039 0 : y1 = _AVX512_FMA(q1, h2, y1);
1040 0 : y2 = _AVX512_FMA(q2, h2, y2);
1041 :
1042 0 : h1 = _AVX512_SET1(hh[nb-1]);
1043 0 : q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
1044 0 : q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
1045 :
1046 0 : x1 = _AVX512_FMA(q1, h1, x1);
1047 0 : x2 = _AVX512_FMA(q2, h1, x2);
1048 :
1049 : /////////////////////////////////////////////////////
1050 : // Apply tau, correct wrong calculation using pre-calculated scalar products
1051 : /////////////////////////////////////////////////////
1052 :
1053 0 : __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
1054 0 : x1 = _AVX512_MUL(x1, tau1);
1055 0 : x2 = _AVX512_MUL(x2, tau1);
1056 :
1057 0 : __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
1058 0 : __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
1059 0 : h2 = _AVX512_MUL(tau2, vs_1_2);
1060 :
1061 0 : y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
1062 0 : y2 = _AVX512_FMSUB(y2, tau2, _AVX512_MUL(x2,h2));
1063 :
1064 0 : __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
1065 0 : __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
1066 0 : __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
1067 :
1068 0 : h2 = _AVX512_MUL(tau3, vs_1_3);
1069 0 : h3 = _AVX512_MUL(tau3, vs_2_3);
1070 :
1071 0 : z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
1072 0 : z2 = _AVX512_FMSUB(z2, tau3, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
1073 :
1074 0 : __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
1075 0 : __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
1076 0 : __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
1077 :
1078 0 : h2 = _AVX512_MUL(tau4, vs_1_4);
1079 0 : h3 = _AVX512_MUL(tau4, vs_2_4);
1080 :
1081 0 : __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
1082 0 : h4 = _AVX512_MUL(tau4, vs_3_4);
1083 :
1084 0 : w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
1085 0 : w2 = _AVX512_FMSUB(w2, tau4, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
1086 :
1087 0 : __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
1088 0 : __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
1089 0 : __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
1090 :
1091 0 : h2 = _AVX512_MUL(tau5, vs_1_5);
1092 0 : h3 = _AVX512_MUL(tau5, vs_2_5);
1093 :
1094 0 : __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
1095 0 : __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
1096 :
1097 0 : h4 = _AVX512_MUL(tau5, vs_3_5);
1098 0 : h5 = _AVX512_MUL(tau5, vs_4_5);
1099 :
1100 0 : v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
1101 0 : v2 = _AVX512_FMSUB(v2, tau5, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
1102 :
1103 0 : __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
1104 0 : __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
1105 0 : __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
1106 0 : h2 = _AVX512_MUL(tau6, vs_1_6);
1107 0 : h3 = _AVX512_MUL(tau6, vs_2_6);
1108 :
1109 0 : __AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
1110 0 : __AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
1111 0 : __AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);
1112 :
1113 0 : h4 = _AVX512_MUL(tau6, vs_3_6);
1114 0 : h5 = _AVX512_MUL(tau6, vs_4_6);
1115 0 : h6 = _AVX512_MUL(tau6, vs_5_6);
1116 :
1117 0 : t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));
1118 0 : t2 = _AVX512_FMSUB(t2, tau6, _AVX512_FMA(v2, h6, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)))));
1119 :
1120 : /////////////////////////////////////////////////////
1121 : // Rank-1 update of Q [8 x nb+3]
1122 : /////////////////////////////////////////////////////
1123 :
1124 0 : q1 = _AVX512_LOAD(&q[0]);
1125 0 : q2 = _AVX512_LOAD(&q[0+offset]);
1126 :
1127 0 : q1 = _AVX512_SUB(q1, t1);
1128 0 : q2 = _AVX512_SUB(q2, t2);
1129 :
1130 : _AVX512_STORE(&q[0],q1);
1131 0 : _AVX512_STORE(&q[0+offset],q2);
1132 :
1133 0 : h6 = _AVX512_SET1(hh[(ldh*5)+1]);
1134 0 : q1 = _AVX512_LOAD(&q[ldq]);
1135 0 : q2 = _AVX512_LOAD(&q[ldq+offset]);
1136 :
1137 0 : q1 = _AVX512_SUB(q1, v1);
1138 0 : q2 = _AVX512_SUB(q2, v2);
1139 :
1140 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1141 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1142 :
1143 0 : _AVX512_STORE(&q[ldq],q1);
1144 0 : _AVX512_STORE(&q[ldq+offset],q2);
1145 :
1146 0 : h5 = _AVX512_SET1(hh[(ldh*4)+1]);
1147 0 : q1 = _AVX512_LOAD(&q[ldq*2]);
1148 0 : q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
1149 :
1150 0 : q1 = _AVX512_SUB(q1, w1);
1151 0 : q2 = _AVX512_SUB(q2, w2);
1152 :
1153 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1154 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1155 :
1156 0 : h6 = _AVX512_SET1(hh[(ldh*5)+2]);
1157 :
1158 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1159 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1160 :
1161 0 : _AVX512_STORE(&q[ldq*2],q1);
1162 0 : _AVX512_STORE(&q[(ldq*2)+offset],q2);
1163 :
1164 0 : h4 = _AVX512_SET1(hh[(ldh*3)+1]);
1165 0 : q1 = _AVX512_LOAD(&q[ldq*3]);
1166 0 : q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
1167 :
1168 0 : q1 = _AVX512_SUB(q1, z1);
1169 0 : q2 = _AVX512_SUB(q2, z2);
1170 :
1171 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1172 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1173 :
1174 0 : h5 = _AVX512_SET1(hh[(ldh*4)+2]);
1175 :
1176 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1177 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1178 :
1179 0 : h6 = _AVX512_SET1(hh[(ldh*5)+3]);
1180 :
1181 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1182 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1183 :
1184 0 : _AVX512_STORE(&q[ldq*3],q1);
1185 0 : _AVX512_STORE(&q[(ldq*3)+offset],q2);
1186 :
1187 0 : h3 = _AVX512_SET1(hh[(ldh*2)+1]);
1188 0 : q1 = _AVX512_LOAD(&q[ldq*4]);
1189 0 : q2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
1190 :
1191 0 : q1 = _AVX512_SUB(q1, y1);
1192 0 : q2 = _AVX512_SUB(q2, y2);
1193 :
1194 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1195 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1196 :
1197 0 : h4 = _AVX512_SET1(hh[(ldh*3)+2]);
1198 :
1199 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1200 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1201 :
1202 0 : h5 = _AVX512_SET1(hh[(ldh*4)+3]);
1203 :
1204 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1205 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1206 :
1207 0 : h6 = _AVX512_SET1(hh[(ldh*5)+4]);
1208 :
1209 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1210 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1211 :
1212 0 : _AVX512_STORE(&q[ldq*4],q1);
1213 0 : _AVX512_STORE(&q[(ldq*4)+offset],q2);
1214 :
1215 0 : h2 = _AVX512_SET1(hh[(ldh)+1]);
1216 0 : q1 = _AVX512_LOAD(&q[ldq*5]);
1217 0 : q2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
1218 :
1219 0 : q1 = _AVX512_SUB(q1, x1);
1220 0 : q2 = _AVX512_SUB(q2, x2);
1221 :
1222 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1223 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1224 :
1225 0 : h3 = _AVX512_SET1(hh[(ldh*2)+2]);
1226 :
1227 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1228 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1229 :
1230 0 : h4 = _AVX512_SET1(hh[(ldh*3)+3]);
1231 :
1232 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1233 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1234 :
1235 0 : h5 = _AVX512_SET1(hh[(ldh*4)+4]);
1236 :
1237 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1238 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1239 :
1240 0 : h6 = _AVX512_SET1(hh[(ldh*5)+5]);
1241 :
1242 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1243 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1244 :
1245 0 : _AVX512_STORE(&q[ldq*5],q1);
1246 0 : _AVX512_STORE(&q[(ldq*5)+offset],q2);
1247 :
1248 0 : for (i = 6; i < nb; i++)
1249 : {
1250 0 : q1 = _AVX512_LOAD(&q[i*ldq]);
1251 0 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
1252 :
1253 0 : h1 = _AVX512_SET1(hh[i-5]);
1254 :
1255 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1256 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1257 :
1258 0 : h2 = _AVX512_SET1(hh[ldh+i-4]);
1259 :
1260 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1261 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1262 :
1263 0 : h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
1264 :
1265 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1266 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1267 :
1268 0 : h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
1269 :
1270 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1271 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1272 :
1273 0 : h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
1274 :
1275 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1276 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1277 :
1278 0 : h6 = _AVX512_SET1(hh[(ldh*5)+i]);
1279 :
1280 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1281 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1282 :
1283 0 : _AVX512_STORE(&q[i*ldq],q1);
1284 0 : _AVX512_STORE(&q[(i*ldq)+offset],q2);
1285 :
1286 : }
1287 :
1288 0 : h1 = _AVX512_SET1(hh[nb-5]);
1289 0 : q1 = _AVX512_LOAD(&q[nb*ldq]);
1290 0 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
1291 :
1292 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1293 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1294 :
1295 0 : h2 = _AVX512_SET1(hh[ldh+nb-4]);
1296 :
1297 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1298 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1299 :
1300 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
1301 :
1302 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1303 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1304 :
1305 0 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
1306 :
1307 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1308 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1309 :
1310 0 : h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
1311 :
1312 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1313 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1314 :
1315 0 : _AVX512_STORE(&q[nb*ldq],q1);
1316 0 : _AVX512_STORE(&q[(nb*ldq)+offset],q2);
1317 :
1318 0 : h1 = _AVX512_SET1(hh[nb-4]);
1319 0 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
1320 0 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
1321 :
1322 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1323 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1324 :
1325 0 : h2 = _AVX512_SET1(hh[ldh+nb-3]);
1326 :
1327 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1328 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1329 :
1330 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
1331 :
1332 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1333 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1334 :
1335 0 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
1336 :
1337 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1338 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1339 :
1340 0 : _AVX512_STORE(&q[(nb+1)*ldq],q1);
1341 0 : _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
1342 :
1343 0 : h1 = _AVX512_SET1(hh[nb-3]);
1344 0 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
1345 0 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
1346 :
1347 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1348 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1349 :
1350 0 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
1351 :
1352 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1353 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1354 :
1355 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1356 :
1357 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1358 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1359 :
1360 0 : _AVX512_STORE(&q[(nb+2)*ldq],q1);
1361 0 : _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
1362 :
1363 0 : h1 = _AVX512_SET1(hh[nb-2]);
1364 0 : q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
1365 0 : q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
1366 :
1367 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1368 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1369 :
1370 0 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
1371 :
1372 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1373 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1374 :
1375 0 : _AVX512_STORE(&q[(nb+3)*ldq],q1);
1376 0 : _AVX512_STORE(&q[((nb+3)*ldq)+offset],q2);
1377 :
1378 0 : h1 = _AVX512_SET1(hh[nb-1]);
1379 0 : q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
1380 0 : q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
1381 :
1382 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1383 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1384 :
1385 0 : _AVX512_STORE(&q[(nb+4)*ldq],q1);
1386 0 : _AVX512_STORE(&q[((nb+4)*ldq)+offset],q2);
1387 :
1388 : }
1389 :
1390 : /**
1391 : * Unrolled kernel that computes
1392 : #ifdef DOUBLE_PRECISION_REAL
1393 : * 24 rows of Q simultaneously, a
1394 : #endif
1395 : #ifdef SINGLE_PRECISION_REAL
1396 : * 48 rows of Q simultaneously, a
1397 : #endif
1398 :
1399 : * matrix vector product with six householder
1400 : * vectors + a rank 1 update is performed
1401 : */
1402 : #ifdef DOUBLE_PRECISION_REAL
1403 : __forceinline void hh_trafo_kernel_24_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
1404 : #endif
1405 : #ifdef SINGLE_PRECISION_REAL
1406 : __forceinline void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
1407 : #endif
1408 : {
1409 : /////////////////////////////////////////////////////
1410 : // Matrix Vector Multiplication, Q [8 x nb+3] * hh
1411 : // hh contains six householder vectors
1412 : /////////////////////////////////////////////////////
1413 : int i;
1414 :
1415 0 : __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
1416 0 : __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
1417 0 : __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
1418 0 : __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
1419 0 : __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
1420 0 : __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);
1421 :
1422 0 : __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
1423 0 : __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
1424 0 : __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
1425 0 : __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
1426 0 : __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);
1427 :
1428 : // register __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
1429 0 : __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
1430 :
1431 0 : t1 = _AVX512_FMA(a4_1, h_6_4, t1);
1432 0 : t1 = _AVX512_FMA(a3_1, h_6_3, t1);
1433 0 : t1 = _AVX512_FMA(a2_1, h_6_2, t1);
1434 0 : t1 = _AVX512_FMA(a1_1, h_6_1, t1);
1435 :
1436 0 : __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
1437 0 : __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
1438 0 : __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
1439 0 : __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
1440 :
1441 : // register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
1442 0 : __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
1443 :
1444 0 : v1 = _AVX512_FMA(a3_1, h_5_3, v1);
1445 0 : v1 = _AVX512_FMA(a2_1, h_5_2, v1);
1446 0 : v1 = _AVX512_FMA(a1_1, h_5_1, v1);
1447 :
1448 0 : __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
1449 0 : __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
1450 0 : __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
1451 :
1452 : // register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
1453 0 : __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
1454 :
1455 0 : w1 = _AVX512_FMA(a2_1, h_4_2, w1);
1456 0 : w1 = _AVX512_FMA(a1_1, h_4_1, w1);
1457 :
1458 0 : __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
1459 0 : __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
1460 0 : __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
1461 :
1462 : // register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
1463 0 : __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
1464 :
1465 0 : z1 = _AVX512_FMA(a1_1, h_3_1, z1);
1466 : // register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
1467 0 : __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
1468 :
1469 :
1470 : // register __AVX512_DATATYPE x1 = a1_1;
1471 0 : __AVX512_DATATYPE x1 = a1_1;
1472 :
1473 :
1474 0 : __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
1475 0 : __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
1476 0 : __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
1477 0 : __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
1478 0 : __AVX512_DATATYPE a5_2 = _AVX512_LOAD(&q[(ldq)+offset]);
1479 0 : __AVX512_DATATYPE a6_2 = _AVX512_LOAD(&q[0+offset]);
1480 :
1481 : // register __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
1482 0 : __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
1483 :
1484 0 : t2 = _AVX512_FMA(a4_2, h_6_4, t2);
1485 0 : t2 = _AVX512_FMA(a3_2, h_6_3, t2);
1486 0 : t2 = _AVX512_FMA(a2_2, h_6_2, t2);
1487 0 : t2 = _AVX512_FMA(a1_2, h_6_1, t2);
1488 :
1489 : // register __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
1490 0 : __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
1491 :
1492 0 : v2 = _AVX512_FMA(a3_2, h_5_3, v2);
1493 0 : v2 = _AVX512_FMA(a2_2, h_5_2, v2);
1494 0 : v2 = _AVX512_FMA(a1_2, h_5_1, v2);
1495 :
1496 : // register __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
1497 0 : __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
1498 :
1499 0 : w2 = _AVX512_FMA(a2_2, h_4_2, w2);
1500 0 : w2 = _AVX512_FMA(a1_2, h_4_1, w2);
1501 :
1502 : // register __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
1503 0 : __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
1504 :
1505 0 : z2 = _AVX512_FMA(a1_2, h_3_1, z2);
1506 : // register __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
1507 0 : __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
1508 :
1509 :
1510 : // register __AVX512_DATATYPE x2 = a1_2;
1511 0 : __AVX512_DATATYPE x2 = a1_2;
1512 :
1513 :
1514 0 : __AVX512_DATATYPE a1_3 = _AVX512_LOAD(&q[(ldq*5)+2*offset]);
1515 0 : __AVX512_DATATYPE a2_3 = _AVX512_LOAD(&q[(ldq*4)+2*offset]);
1516 0 : __AVX512_DATATYPE a3_3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
1517 0 : __AVX512_DATATYPE a4_3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
1518 0 : __AVX512_DATATYPE a5_3 = _AVX512_LOAD(&q[(ldq)+2*offset]);
1519 0 : __AVX512_DATATYPE a6_3 = _AVX512_LOAD(&q[0+2*offset]);
1520 :
1521 : // register __AVX512_DATATYPE t3 = _AVX512_FMA(a5_3, h_6_5, a6_3);
1522 0 : __AVX512_DATATYPE t3 = _AVX512_FMA(a5_3, h_6_5, a6_3);
1523 :
1524 0 : t3 = _AVX512_FMA(a4_3, h_6_4, t3);
1525 0 : t3 = _AVX512_FMA(a3_3, h_6_3, t3);
1526 0 : t3 = _AVX512_FMA(a2_3, h_6_2, t3);
1527 0 : t3 = _AVX512_FMA(a1_3, h_6_1, t3);
1528 :
1529 : // register __AVX512_DATATYPE v3 = _AVX512_FMA(a4_3, h_5_4, a5_3);
1530 0 : __AVX512_DATATYPE v3 = _AVX512_FMA(a4_3, h_5_4, a5_3);
1531 :
1532 0 : v3 = _AVX512_FMA(a3_3, h_5_3, v3);
1533 0 : v3 = _AVX512_FMA(a2_3, h_5_2, v3);
1534 0 : v3 = _AVX512_FMA(a1_3, h_5_1, v3);
1535 :
1536 : // register __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
1537 0 : __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
1538 :
1539 0 : w3 = _AVX512_FMA(a2_3, h_4_2, w3);
1540 0 : w3 = _AVX512_FMA(a1_3, h_4_1, w3);
1541 :
1542 : // register __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
1543 0 : __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
1544 :
1545 0 : z3 = _AVX512_FMA(a1_3, h_3_1, z3);
1546 : // register __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
1547 0 : __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
1548 :
1549 :
1550 : // register __AVX512_DATATYPE x3 = a1_3;
1551 0 : __AVX512_DATATYPE x3 = a1_3;
1552 :
1553 :
1554 : __AVX512_DATATYPE q1;
1555 : __AVX512_DATATYPE q2;
1556 : __AVX512_DATATYPE q3;
1557 :
1558 : __AVX512_DATATYPE h1;
1559 : __AVX512_DATATYPE h2;
1560 : __AVX512_DATATYPE h3;
1561 : __AVX512_DATATYPE h4;
1562 : __AVX512_DATATYPE h5;
1563 : __AVX512_DATATYPE h6;
1564 :
1565 0 : for(i = 6; i < nb; i++)
1566 : {
1567 0 : h1 = _AVX512_SET1(hh[i-5]);
1568 0 : q1 = _AVX512_LOAD(&q[i*ldq]);
1569 0 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
1570 0 : q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
1571 :
1572 0 : x1 = _AVX512_FMA(q1, h1, x1);
1573 0 : x2 = _AVX512_FMA(q2, h1, x2);
1574 0 : x3 = _AVX512_FMA(q3, h1, x3);
1575 :
1576 0 : h2 = _AVX512_SET1(hh[ldh+i-4]);
1577 :
1578 0 : y1 = _AVX512_FMA(q1, h2, y1);
1579 0 : y2 = _AVX512_FMA(q2, h2, y2);
1580 0 : y3 = _AVX512_FMA(q3, h2, y3);
1581 :
1582 0 : h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
1583 :
1584 0 : z1 = _AVX512_FMA(q1, h3, z1);
1585 0 : z2 = _AVX512_FMA(q2, h3, z2);
1586 0 : z3 = _AVX512_FMA(q3, h3, z3);
1587 :
1588 0 : h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
1589 :
1590 0 : w1 = _AVX512_FMA(q1, h4, w1);
1591 0 : w2 = _AVX512_FMA(q2, h4, w2);
1592 0 : w3 = _AVX512_FMA(q3, h4, w3);
1593 :
1594 0 : h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
1595 :
1596 0 : v1 = _AVX512_FMA(q1, h5, v1);
1597 0 : v2 = _AVX512_FMA(q2, h5, v2);
1598 0 : v3 = _AVX512_FMA(q3, h5, v3);
1599 :
1600 0 : h6 = _AVX512_SET1(hh[(ldh*5)+i]);
1601 :
1602 0 : t1 = _AVX512_FMA(q1, h6, t1);
1603 0 : t2 = _AVX512_FMA(q2, h6, t2);
1604 0 : t3 = _AVX512_FMA(q3, h6, t3);
1605 : }
1606 :
1607 0 : h1 = _AVX512_SET1(hh[nb-5]);
1608 0 : q1 = _AVX512_LOAD(&q[nb*ldq]);
1609 0 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
1610 0 : q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
1611 :
1612 0 : x1 = _AVX512_FMA(q1, h1, x1);
1613 0 : x2 = _AVX512_FMA(q2, h1, x2);
1614 0 : x3 = _AVX512_FMA(q3, h1, x3);
1615 :
1616 0 : h2 = _AVX512_SET1(hh[ldh+nb-4]);
1617 :
1618 0 : y1 = _AVX512_FMA(q1, h2, y1);
1619 0 : y2 = _AVX512_FMA(q2, h2, y2);
1620 0 : y3 = _AVX512_FMA(q3, h2, y3);
1621 :
1622 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
1623 :
1624 0 : z1 = _AVX512_FMA(q1, h3, z1);
1625 0 : z2 = _AVX512_FMA(q2, h3, z2);
1626 0 : z3 = _AVX512_FMA(q3, h3, z3);
1627 :
1628 0 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
1629 :
1630 0 : w1 = _AVX512_FMA(q1, h4, w1);
1631 0 : w2 = _AVX512_FMA(q2, h4, w2);
1632 0 : w3 = _AVX512_FMA(q3, h4, w3);
1633 :
1634 0 : h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
1635 :
1636 0 : v1 = _AVX512_FMA(q1, h5, v1);
1637 0 : v2 = _AVX512_FMA(q2, h5, v2);
1638 0 : v3 = _AVX512_FMA(q3, h5, v3);
1639 :
1640 0 : h1 = _AVX512_SET1(hh[nb-4]);
1641 :
1642 0 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
1643 0 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
1644 0 : q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
1645 :
1646 0 : x1 = _AVX512_FMA(q1, h1, x1);
1647 0 : x2 = _AVX512_FMA(q2, h1, x2);
1648 0 : x3 = _AVX512_FMA(q3, h1, x3);
1649 :
1650 0 : h2 = _AVX512_SET1(hh[ldh+nb-3]);
1651 :
1652 0 : y1 = _AVX512_FMA(q1, h2, y1);
1653 0 : y2 = _AVX512_FMA(q2, h2, y2);
1654 0 : y3 = _AVX512_FMA(q3, h2, y3);
1655 :
1656 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
1657 :
1658 0 : z1 = _AVX512_FMA(q1, h3, z1);
1659 0 : z2 = _AVX512_FMA(q2, h3, z2);
1660 0 : z3 = _AVX512_FMA(q3, h3, z3);
1661 :
1662 0 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
1663 :
1664 0 : w1 = _AVX512_FMA(q1, h4, w1);
1665 0 : w2 = _AVX512_FMA(q2, h4, w2);
1666 0 : w3 = _AVX512_FMA(q3, h4, w3);
1667 :
1668 0 : h1 = _AVX512_SET1(hh[nb-3]);
1669 0 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
1670 0 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
1671 0 : q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
1672 :
1673 0 : x1 = _AVX512_FMA(q1, h1, x1);
1674 0 : x2 = _AVX512_FMA(q2, h1, x2);
1675 0 : x3 = _AVX512_FMA(q3, h1, x3);
1676 :
1677 0 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
1678 :
1679 0 : y1 = _AVX512_FMA(q1, h2, y1);
1680 0 : y2 = _AVX512_FMA(q2, h2, y2);
1681 0 : y3 = _AVX512_FMA(q3, h2, y3);
1682 :
1683 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1684 :
1685 0 : z1 = _AVX512_FMA(q1, h3, z1);
1686 0 : z2 = _AVX512_FMA(q2, h3, z2);
1687 0 : z3 = _AVX512_FMA(q3, h3, z3);
1688 :
1689 0 : h1 = _AVX512_SET1(hh[nb-2]);
1690 0 : q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
1691 0 : q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
1692 0 : q3 = _AVX512_LOAD(&q[((nb+3)*ldq)+2*offset]);
1693 :
1694 0 : x1 = _AVX512_FMA(q1, h1, x1);
1695 0 : x2 = _AVX512_FMA(q2, h1, x2);
1696 0 : x3 = _AVX512_FMA(q3, h1, x3);
1697 :
1698 0 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
1699 :
1700 0 : y1 = _AVX512_FMA(q1, h2, y1);
1701 0 : y2 = _AVX512_FMA(q2, h2, y2);
1702 0 : y3 = _AVX512_FMA(q3, h2, y3);
1703 :
1704 0 : h1 = _AVX512_SET1(hh[nb-1]);
1705 0 : q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
1706 0 : q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
1707 0 : q3 = _AVX512_LOAD(&q[((nb+4)*ldq)+2*offset]);
1708 :
1709 0 : x1 = _AVX512_FMA(q1, h1, x1);
1710 0 : x2 = _AVX512_FMA(q2, h1, x2);
1711 0 : x3 = _AVX512_FMA(q3, h1, x3);
1712 :
1713 : /////////////////////////////////////////////////////
1714 : // Apply tau, correct wrong calculation using pre-calculated scalar products
1715 : /////////////////////////////////////////////////////
1716 :
1717 0 : __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
1718 0 : x1 = _AVX512_MUL(x1, tau1);
1719 0 : x2 = _AVX512_MUL(x2, tau1);
1720 0 : x3 = _AVX512_MUL(x3, tau1);
1721 :
1722 0 : __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
1723 0 : __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
1724 0 : h2 = _AVX512_MUL(tau2, vs_1_2);
1725 :
1726 0 : y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
1727 0 : y2 = _AVX512_FMSUB(y2, tau2, _AVX512_MUL(x2,h2));
1728 0 : y3 = _AVX512_FMSUB(y3, tau2, _AVX512_MUL(x3,h2));
1729 :
1730 0 : __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
1731 0 : __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
1732 0 : __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
1733 :
1734 0 : h2 = _AVX512_MUL(tau3, vs_1_3);
1735 0 : h3 = _AVX512_MUL(tau3, vs_2_3);
1736 :
1737 0 : z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
1738 0 : z2 = _AVX512_FMSUB(z2, tau3, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
1739 0 : z3 = _AVX512_FMSUB(z3, tau3, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)));
1740 :
1741 0 : __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
1742 0 : __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
1743 0 : __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
1744 :
1745 0 : h2 = _AVX512_MUL(tau4, vs_1_4);
1746 0 : h3 = _AVX512_MUL(tau4, vs_2_4);
1747 :
1748 0 : __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
1749 0 : h4 = _AVX512_MUL(tau4, vs_3_4);
1750 :
1751 0 : w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
1752 0 : w2 = _AVX512_FMSUB(w2, tau4, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
1753 0 : w3 = _AVX512_FMSUB(w3, tau4, _AVX512_FMA(z3, h4, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
1754 :
1755 0 : __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
1756 0 : __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
1757 0 : __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
1758 :
1759 0 : h2 = _AVX512_MUL(tau5, vs_1_5);
1760 0 : h3 = _AVX512_MUL(tau5, vs_2_5);
1761 :
1762 0 : __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
1763 0 : __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
1764 :
1765 0 : h4 = _AVX512_MUL(tau5, vs_3_5);
1766 0 : h5 = _AVX512_MUL(tau5, vs_4_5);
1767 :
1768 0 : v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
1769 0 : v2 = _AVX512_FMSUB(v2, tau5, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
1770 0 : v3 = _AVX512_FMSUB(v3, tau5, _AVX512_ADD(_AVX512_FMA(w3, h5, _AVX512_MUL(z3,h4)), _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
1771 :
1772 0 : __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
1773 0 : __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
1774 0 : __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
1775 0 : h2 = _AVX512_MUL(tau6, vs_1_6);
1776 0 : h3 = _AVX512_MUL(tau6, vs_2_6);
1777 :
1778 0 : __AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
1779 0 : __AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
1780 0 : __AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);
1781 :
1782 0 : h4 = _AVX512_MUL(tau6, vs_3_6);
1783 0 : h5 = _AVX512_MUL(tau6, vs_4_6);
1784 0 : h6 = _AVX512_MUL(tau6, vs_5_6);
1785 :
1786 0 : t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));
1787 0 : t2 = _AVX512_FMSUB(t2, tau6, _AVX512_FMA(v2, h6, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)))));
1788 0 : t3 = _AVX512_FMSUB(t3, tau6, _AVX512_FMA(v3, h6, _AVX512_ADD(_AVX512_FMA(w3, h5, _AVX512_MUL(z3,h4)), _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)))));
1789 :
1790 : /////////////////////////////////////////////////////
1791 : // Rank-1 update of Q [8 x nb+3]
1792 : /////////////////////////////////////////////////////
1793 :
1794 0 : q1 = _AVX512_LOAD(&q[0]);
1795 0 : q2 = _AVX512_LOAD(&q[0+offset]);
1796 0 : q3 = _AVX512_LOAD(&q[0+2*offset]);
1797 :
1798 0 : q1 = _AVX512_SUB(q1, t1);
1799 0 : q2 = _AVX512_SUB(q2, t2);
1800 0 : q3 = _AVX512_SUB(q3, t3);
1801 :
1802 : _AVX512_STORE(&q[0],q1);
1803 0 : _AVX512_STORE(&q[0+offset],q2);
1804 0 : _AVX512_STORE(&q[0+2*offset],q3);
1805 :
1806 0 : h6 = _AVX512_SET1(hh[(ldh*5)+1]);
1807 0 : q1 = _AVX512_LOAD(&q[ldq]);
1808 0 : q2 = _AVX512_LOAD(&q[ldq+offset]);
1809 0 : q3 = _AVX512_LOAD(&q[ldq+2*offset]);
1810 :
1811 0 : q1 = _AVX512_SUB(q1, v1);
1812 0 : q2 = _AVX512_SUB(q2, v2);
1813 0 : q3 = _AVX512_SUB(q3, v3);
1814 :
1815 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1816 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1817 0 : q3 = _AVX512_NFMA(t3, h6, q3);
1818 :
1819 0 : _AVX512_STORE(&q[ldq],q1);
1820 0 : _AVX512_STORE(&q[ldq+offset],q2);
1821 0 : _AVX512_STORE(&q[ldq+2*offset],q3);
1822 :
1823 0 : h5 = _AVX512_SET1(hh[(ldh*4)+1]);
1824 0 : q1 = _AVX512_LOAD(&q[ldq*2]);
1825 0 : q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
1826 0 : q3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
1827 :
1828 0 : q1 = _AVX512_SUB(q1, w1);
1829 0 : q2 = _AVX512_SUB(q2, w2);
1830 0 : q3 = _AVX512_SUB(q3, w3);
1831 :
1832 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1833 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1834 0 : q3 = _AVX512_NFMA(v3, h5, q3);
1835 :
1836 0 : h6 = _AVX512_SET1(hh[(ldh*5)+2]);
1837 :
1838 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1839 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1840 0 : q3 = _AVX512_NFMA(t3, h6, q3);
1841 :
1842 0 : _AVX512_STORE(&q[ldq*2],q1);
1843 0 : _AVX512_STORE(&q[(ldq*2)+offset],q2);
1844 0 : _AVX512_STORE(&q[(ldq*2)+2*offset],q3);
1845 :
1846 0 : h4 = _AVX512_SET1(hh[(ldh*3)+1]);
1847 0 : q1 = _AVX512_LOAD(&q[ldq*3]);
1848 0 : q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
1849 0 : q3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
1850 :
1851 0 : q1 = _AVX512_SUB(q1, z1);
1852 0 : q2 = _AVX512_SUB(q2, z2);
1853 0 : q3 = _AVX512_SUB(q3, z3);
1854 :
1855 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1856 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1857 0 : q3 = _AVX512_NFMA(w3, h4, q3);
1858 :
1859 0 : h5 = _AVX512_SET1(hh[(ldh*4)+2]);
1860 :
1861 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1862 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1863 0 : q3 = _AVX512_NFMA(v3, h5, q3);
1864 :
1865 0 : h6 = _AVX512_SET1(hh[(ldh*5)+3]);
1866 :
1867 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1868 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1869 0 : q3 = _AVX512_NFMA(t3, h6, q3);
1870 :
1871 0 : _AVX512_STORE(&q[ldq*3],q1);
1872 0 : _AVX512_STORE(&q[(ldq*3)+offset],q2);
1873 0 : _AVX512_STORE(&q[(ldq*3)+2*offset],q3);
1874 :
1875 0 : h3 = _AVX512_SET1(hh[(ldh*2)+1]);
1876 0 : q1 = _AVX512_LOAD(&q[ldq*4]);
1877 0 : q2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
1878 0 : q3 = _AVX512_LOAD(&q[(ldq*4)+2*offset]);
1879 :
1880 0 : q1 = _AVX512_SUB(q1, y1);
1881 0 : q2 = _AVX512_SUB(q2, y2);
1882 0 : q3 = _AVX512_SUB(q3, y3);
1883 :
1884 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1885 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1886 0 : q3 = _AVX512_NFMA(z3, h3, q3);
1887 :
1888 0 : h4 = _AVX512_SET1(hh[(ldh*3)+2]);
1889 :
1890 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1891 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1892 0 : q3 = _AVX512_NFMA(w3, h4, q3);
1893 :
1894 0 : h5 = _AVX512_SET1(hh[(ldh*4)+3]);
1895 :
1896 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1897 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1898 0 : q3 = _AVX512_NFMA(v3, h5, q3);
1899 :
1900 0 : h6 = _AVX512_SET1(hh[(ldh*5)+4]);
1901 :
1902 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1903 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1904 0 : q3 = _AVX512_NFMA(t3, h6, q3);
1905 :
1906 0 : _AVX512_STORE(&q[ldq*4],q1);
1907 0 : _AVX512_STORE(&q[(ldq*4)+offset],q2);
1908 0 : _AVX512_STORE(&q[(ldq*4)+2*offset],q3);
1909 :
1910 0 : h2 = _AVX512_SET1(hh[(ldh)+1]);
1911 0 : q1 = _AVX512_LOAD(&q[ldq*5]);
1912 0 : q2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
1913 0 : q3 = _AVX512_LOAD(&q[(ldq*5)+2*offset]);
1914 :
1915 0 : q1 = _AVX512_SUB(q1, x1);
1916 0 : q2 = _AVX512_SUB(q2, x2);
1917 0 : q3 = _AVX512_SUB(q3, x3);
1918 :
1919 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1920 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1921 0 : q3 = _AVX512_NFMA(y3, h2, q3);
1922 :
1923 0 : h3 = _AVX512_SET1(hh[(ldh*2)+2]);
1924 :
1925 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1926 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1927 0 : q3 = _AVX512_NFMA(z3, h3, q3);
1928 :
1929 0 : h4 = _AVX512_SET1(hh[(ldh*3)+3]);
1930 :
1931 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1932 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1933 0 : q3 = _AVX512_NFMA(w3, h4, q3);
1934 :
1935 0 : h5 = _AVX512_SET1(hh[(ldh*4)+4]);
1936 :
1937 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1938 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1939 0 : q3 = _AVX512_NFMA(v3, h5, q3);
1940 :
1941 0 : h6 = _AVX512_SET1(hh[(ldh*5)+5]);
1942 :
1943 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1944 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1945 0 : q3 = _AVX512_NFMA(t3, h6, q3);
1946 :
1947 0 : _AVX512_STORE(&q[ldq*5],q1);
1948 0 : _AVX512_STORE(&q[(ldq*5)+offset],q2);
1949 0 : _AVX512_STORE(&q[(ldq*5)+2*offset],q3);
1950 :
1951 0 : for (i = 6; i < nb; i++)
1952 : {
1953 0 : q1 = _AVX512_LOAD(&q[i*ldq]);
1954 0 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
1955 0 : q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
1956 :
1957 0 : h1 = _AVX512_SET1(hh[i-5]);
1958 :
1959 0 : q1 = _AVX512_NFMA(x1, h1, q1);
1960 0 : q2 = _AVX512_NFMA(x2, h1, q2);
1961 0 : q3 = _AVX512_NFMA(x3, h1, q3);
1962 :
1963 0 : h2 = _AVX512_SET1(hh[ldh+i-4]);
1964 :
1965 0 : q1 = _AVX512_NFMA(y1, h2, q1);
1966 0 : q2 = _AVX512_NFMA(y2, h2, q2);
1967 0 : q3 = _AVX512_NFMA(y3, h2, q3);
1968 :
1969 0 : h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
1970 :
1971 0 : q1 = _AVX512_NFMA(z1, h3, q1);
1972 0 : q2 = _AVX512_NFMA(z2, h3, q2);
1973 0 : q3 = _AVX512_NFMA(z3, h3, q3);
1974 :
1975 0 : h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
1976 :
1977 0 : q1 = _AVX512_NFMA(w1, h4, q1);
1978 0 : q2 = _AVX512_NFMA(w2, h4, q2);
1979 0 : q3 = _AVX512_NFMA(w3, h4, q3);
1980 :
1981 0 : h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
1982 :
1983 0 : q1 = _AVX512_NFMA(v1, h5, q1);
1984 0 : q2 = _AVX512_NFMA(v2, h5, q2);
1985 0 : q3 = _AVX512_NFMA(v3, h5, q3);
1986 :
1987 0 : h6 = _AVX512_SET1(hh[(ldh*5)+i]);
1988 :
1989 0 : q1 = _AVX512_NFMA(t1, h6, q1);
1990 0 : q2 = _AVX512_NFMA(t2, h6, q2);
1991 0 : q3 = _AVX512_NFMA(t3, h6, q3);
1992 :
1993 0 : _AVX512_STORE(&q[i*ldq],q1);
1994 0 : _AVX512_STORE(&q[(i*ldq)+offset],q2);
1995 0 : _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
1996 :
1997 : }
1998 :
1999 0 : h1 = _AVX512_SET1(hh[nb-5]);
2000 0 : q1 = _AVX512_LOAD(&q[nb*ldq]);
2001 0 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
2002 0 : q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
2003 :
2004 0 : q1 = _AVX512_NFMA(x1, h1, q1);
2005 0 : q2 = _AVX512_NFMA(x2, h1, q2);
2006 0 : q3 = _AVX512_NFMA(x3, h1, q3);
2007 :
2008 0 : h2 = _AVX512_SET1(hh[ldh+nb-4]);
2009 :
2010 0 : q1 = _AVX512_NFMA(y1, h2, q1);
2011 0 : q2 = _AVX512_NFMA(y2, h2, q2);
2012 0 : q3 = _AVX512_NFMA(y3, h2, q3);
2013 :
2014 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
2015 :
2016 0 : q1 = _AVX512_NFMA(z1, h3, q1);
2017 0 : q2 = _AVX512_NFMA(z2, h3, q2);
2018 0 : q3 = _AVX512_NFMA(z3, h3, q3);
2019 :
2020 0 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
2021 :
2022 0 : q1 = _AVX512_NFMA(w1, h4, q1);
2023 0 : q2 = _AVX512_NFMA(w2, h4, q2);
2024 0 : q3 = _AVX512_NFMA(w3, h4, q3);
2025 :
2026 0 : h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
2027 :
2028 0 : q1 = _AVX512_NFMA(v1, h5, q1);
2029 0 : q2 = _AVX512_NFMA(v2, h5, q2);
2030 0 : q3 = _AVX512_NFMA(v3, h5, q3);
2031 :
2032 0 : _AVX512_STORE(&q[nb*ldq],q1);
2033 0 : _AVX512_STORE(&q[(nb*ldq)+offset],q2);
2034 0 : _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
2035 :
2036 0 : h1 = _AVX512_SET1(hh[nb-4]);
2037 0 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
2038 0 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
2039 0 : q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
2040 :
2041 0 : q1 = _AVX512_NFMA(x1, h1, q1);
2042 0 : q2 = _AVX512_NFMA(x2, h1, q2);
2043 0 : q3 = _AVX512_NFMA(x3, h1, q3);
2044 :
2045 0 : h2 = _AVX512_SET1(hh[ldh+nb-3]);
2046 :
2047 0 : q1 = _AVX512_NFMA(y1, h2, q1);
2048 0 : q2 = _AVX512_NFMA(y2, h2, q2);
2049 0 : q3 = _AVX512_NFMA(y3, h2, q3);
2050 :
2051 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
2052 :
2053 0 : q1 = _AVX512_NFMA(z1, h3, q1);
2054 0 : q2 = _AVX512_NFMA(z2, h3, q2);
2055 0 : q3 = _AVX512_NFMA(z3, h3, q3);
2056 :
2057 0 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
2058 :
2059 0 : q1 = _AVX512_NFMA(w1, h4, q1);
2060 0 : q2 = _AVX512_NFMA(w2, h4, q2);
2061 0 : q3 = _AVX512_NFMA(w3, h4, q3);
2062 :
2063 0 : _AVX512_STORE(&q[(nb+1)*ldq],q1);
2064 0 : _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
2065 0 : _AVX512_STORE(&q[((nb+1)*ldq)+2*offset],q3);
2066 :
2067 0 : h1 = _AVX512_SET1(hh[nb-3]);
2068 0 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
2069 0 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
2070 0 : q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
2071 :
2072 0 : q1 = _AVX512_NFMA(x1, h1, q1);
2073 0 : q2 = _AVX512_NFMA(x2, h1, q2);
2074 0 : q3 = _AVX512_NFMA(x3, h1, q3);
2075 :
2076 0 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
2077 :
2078 0 : q1 = _AVX512_NFMA(y1, h2, q1);
2079 0 : q2 = _AVX512_NFMA(y2, h2, q2);
2080 0 : q3 = _AVX512_NFMA(y3, h2, q3);
2081 :
2082 0 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
2083 :
2084 0 : q1 = _AVX512_NFMA(z1, h3, q1);
2085 0 : q2 = _AVX512_NFMA(z2, h3, q2);
2086 0 : q3 = _AVX512_NFMA(z3, h3, q3);
2087 :
2088 0 : _AVX512_STORE(&q[(nb+2)*ldq],q1);
2089 0 : _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
2090 0 : _AVX512_STORE(&q[((nb+2)*ldq)+2*offset],q3);
2091 :
2092 0 : h1 = _AVX512_SET1(hh[nb-2]);
2093 0 : q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
2094 0 : q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
2095 0 : q3 = _AVX512_LOAD(&q[((nb+3)*ldq)+2*offset]);
2096 :
2097 0 : q1 = _AVX512_NFMA(x1, h1, q1);
2098 0 : q2 = _AVX512_NFMA(x2, h1, q2);
2099 0 : q3 = _AVX512_NFMA(x3, h1, q3);
2100 :
2101 0 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
2102 :
2103 0 : q1 = _AVX512_NFMA(y1, h2, q1);
2104 0 : q2 = _AVX512_NFMA(y2, h2, q2);
2105 0 : q3 = _AVX512_NFMA(y3, h2, q3);
2106 :
2107 0 : _AVX512_STORE(&q[(nb+3)*ldq],q1);
2108 0 : _AVX512_STORE(&q[((nb+3)*ldq)+offset],q2);
2109 0 : _AVX512_STORE(&q[((nb+3)*ldq)+2*offset],q3);
2110 :
2111 0 : h1 = _AVX512_SET1(hh[nb-1]);
2112 0 : q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
2113 0 : q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
2114 0 : q3 = _AVX512_LOAD(&q[((nb+4)*ldq)+2*offset]);
2115 :
2116 0 : q1 = _AVX512_NFMA(x1, h1, q1);
2117 0 : q2 = _AVX512_NFMA(x2, h1, q2);
2118 0 : q3 = _AVX512_NFMA(x3, h1, q3);
2119 :
2120 0 : _AVX512_STORE(&q[(nb+4)*ldq],q1);
2121 0 : _AVX512_STORE(&q[((nb+4)*ldq)+offset],q2);
2122 0 : _AVX512_STORE(&q[((nb+4)*ldq)+2*offset],q3);
2123 :
2124 : }
2125 :
2126 : /**
2127 : * Unrolled kernel that computes
2128 : #ifdef DOUBLE_PRECISION_REAL
2129 : * 32 rows of Q simultaneously, a
2130 : #endif
2131 : #ifdef SINGLE_PRECISION_REAL
2132 : * 64 rows of Q simultaneously, a
2133 : #endif
2134 : * matrix Vector product with six householder
2135 : * vectors + a rank 1 update is performed
2136 : */
2137 : #ifdef DOUBLE_PRECISION_REAL
2138 : __forceinline void hh_trafo_kernel_32_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
2139 : #endif
2140 : #ifdef SINGLE_PRECISION_REAL
2141 : __forceinline void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
2142 : #endif
2143 : {
2144 : /////////////////////////////////////////////////////
2145 : // Matrix Vector Multiplication, Q [8 x nb+3] * hh
2146 : // hh contains six householder vectors
2147 : /////////////////////////////////////////////////////
2148 : int i;
2149 :
2150 194560 : __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
2151 194560 : __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
2152 194560 : __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
2153 194560 : __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
2154 194560 : __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
2155 97280 : __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);
2156 :
2157 194560 : __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
2158 194560 : __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
2159 194560 : __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
2160 194560 : __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
2161 194560 : __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);
2162 :
2163 : // register __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
2164 97280 : __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
2165 :
2166 97280 : t1 = _AVX512_FMA(a4_1, h_6_4, t1);
2167 97280 : t1 = _AVX512_FMA(a3_1, h_6_3, t1);
2168 97280 : t1 = _AVX512_FMA(a2_1, h_6_2, t1);
2169 97280 : t1 = _AVX512_FMA(a1_1, h_6_1, t1);
2170 :
2171 194560 : __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
2172 194560 : __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
2173 194560 : __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
2174 194560 : __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
2175 :
2176 : // register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
2177 97280 : __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
2178 :
2179 97280 : v1 = _AVX512_FMA(a3_1, h_5_3, v1);
2180 97280 : v1 = _AVX512_FMA(a2_1, h_5_2, v1);
2181 97280 : v1 = _AVX512_FMA(a1_1, h_5_1, v1);
2182 :
2183 194560 : __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
2184 194560 : __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
2185 194560 : __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
2186 :
2187 : // register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
2188 97280 : __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
2189 :
2190 97280 : w1 = _AVX512_FMA(a2_1, h_4_2, w1);
2191 97280 : w1 = _AVX512_FMA(a1_1, h_4_1, w1);
2192 :
2193 194560 : __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
2194 194560 : __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
2195 194560 : __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
2196 :
2197 : // register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
2198 97280 : __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
2199 :
2200 97280 : z1 = _AVX512_FMA(a1_1, h_3_1, z1);
2201 : // register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
2202 97280 : __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
2203 :
2204 :
2205 : // register __AVX512_DATATYPE x1 = a1_1;
2206 97280 : __AVX512_DATATYPE x1 = a1_1;
2207 :
2208 :
2209 :
2210 194560 : __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
2211 194560 : __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
2212 194560 : __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
2213 194560 : __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
2214 194560 : __AVX512_DATATYPE a5_2 = _AVX512_LOAD(&q[(ldq)+offset]);
2215 194560 : __AVX512_DATATYPE a6_2 = _AVX512_LOAD(&q[0+offset]);
2216 :
2217 : // register __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
2218 97280 : __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
2219 :
2220 97280 : t2 = _AVX512_FMA(a4_2, h_6_4, t2);
2221 97280 : t2 = _AVX512_FMA(a3_2, h_6_3, t2);
2222 97280 : t2 = _AVX512_FMA(a2_2, h_6_2, t2);
2223 97280 : t2 = _AVX512_FMA(a1_2, h_6_1, t2);
2224 :
2225 : // register __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
2226 97280 : __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
2227 :
2228 97280 : v2 = _AVX512_FMA(a3_2, h_5_3, v2);
2229 97280 : v2 = _AVX512_FMA(a2_2, h_5_2, v2);
2230 97280 : v2 = _AVX512_FMA(a1_2, h_5_1, v2);
2231 :
2232 : // register __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
2233 97280 : __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
2234 :
2235 97280 : w2 = _AVX512_FMA(a2_2, h_4_2, w2);
2236 97280 : w2 = _AVX512_FMA(a1_2, h_4_1, w2);
2237 :
2238 : // register __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
2239 97280 : __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
2240 :
2241 97280 : z2 = _AVX512_FMA(a1_2, h_3_1, z2);
2242 : // register __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
2243 97280 : __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
2244 :
2245 :
2246 : // register __AVX512_DATATYPE x2 = a1_2;
2247 97280 : __AVX512_DATATYPE x2 = a1_2;
2248 :
2249 :
2250 194560 : __AVX512_DATATYPE a1_3 = _AVX512_LOAD(&q[(ldq*5)+2*offset]);
2251 194560 : __AVX512_DATATYPE a2_3 = _AVX512_LOAD(&q[(ldq*4)+2*offset]);
2252 194560 : __AVX512_DATATYPE a3_3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
2253 194560 : __AVX512_DATATYPE a4_3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
2254 194560 : __AVX512_DATATYPE a5_3 = _AVX512_LOAD(&q[(ldq)+2*offset]);
2255 194560 : __AVX512_DATATYPE a6_3 = _AVX512_LOAD(&q[0+2*offset]);
2256 :
2257 : // register __AVX512_DATATYPE t3 = _AVX512_FMA(a5_3, h_6_5, a6_3);
2258 97280 : __AVX512_DATATYPE t3 = _AVX512_FMA(a5_3, h_6_5, a6_3);
2259 :
2260 97280 : t3 = _AVX512_FMA(a4_3, h_6_4, t3);
2261 97280 : t3 = _AVX512_FMA(a3_3, h_6_3, t3);
2262 97280 : t3 = _AVX512_FMA(a2_3, h_6_2, t3);
2263 97280 : t3 = _AVX512_FMA(a1_3, h_6_1, t3);
2264 :
2265 : // register __AVX512_DATATYPE v3 = _AVX512_FMA(a4_3, h_5_4, a5_3);
2266 97280 : __AVX512_DATATYPE v3 = _AVX512_FMA(a4_3, h_5_4, a5_3);
2267 :
2268 97280 : v3 = _AVX512_FMA(a3_3, h_5_3, v3);
2269 97280 : v3 = _AVX512_FMA(a2_3, h_5_2, v3);
2270 97280 : v3 = _AVX512_FMA(a1_3, h_5_1, v3);
2271 :
2272 : // register __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
2273 97280 : __AVX512_DATATYPE w3 = _AVX512_FMA(a3_3, h_4_3, a4_3);
2274 :
2275 97280 : w3 = _AVX512_FMA(a2_3, h_4_2, w3);
2276 97280 : w3 = _AVX512_FMA(a1_3, h_4_1, w3);
2277 :
2278 : // register __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
2279 97280 : __AVX512_DATATYPE z3 = _AVX512_FMA(a2_3, h_3_2, a3_3);
2280 :
2281 97280 : z3 = _AVX512_FMA(a1_3, h_3_1, z3);
2282 : // register __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
2283 97280 : __AVX512_DATATYPE y3 = _AVX512_FMA(a1_3, h_2_1, a2_3);
2284 :
2285 :
2286 : // register __AVX512_DATATYPE x3 = a1_3;
2287 97280 : __AVX512_DATATYPE x3 = a1_3;
2288 :
2289 :
2290 194560 : __AVX512_DATATYPE a1_4 = _AVX512_LOAD(&q[(ldq*5)+3*offset]);
2291 194560 : __AVX512_DATATYPE a2_4 = _AVX512_LOAD(&q[(ldq*4)+3*offset]);
2292 194560 : __AVX512_DATATYPE a3_4 = _AVX512_LOAD(&q[(ldq*3)+3*offset]);
2293 194560 : __AVX512_DATATYPE a4_4 = _AVX512_LOAD(&q[(ldq*2)+3*offset]);
2294 194560 : __AVX512_DATATYPE a5_4 = _AVX512_LOAD(&q[(ldq)+3*offset]);
2295 194560 : __AVX512_DATATYPE a6_4 = _AVX512_LOAD(&q[0+3*offset]);
2296 :
2297 : // register __AVX512_DATATYPE t4 = _AVX512_FMA(a5_4, h_6_5, a6_4);
2298 97280 : __AVX512_DATATYPE t4 = _AVX512_FMA(a5_4, h_6_5, a6_4);
2299 :
2300 97280 : t4 = _AVX512_FMA(a4_4, h_6_4, t4);
2301 97280 : t4 = _AVX512_FMA(a3_4, h_6_3, t4);
2302 97280 : t4 = _AVX512_FMA(a2_4, h_6_2, t4);
2303 97280 : t4 = _AVX512_FMA(a1_4, h_6_1, t4);
2304 :
2305 : // register __AVX512_DATATYPE v4 = _AVX512_FMA(a4_4, h_5_4, a5_4);
2306 97280 : __AVX512_DATATYPE v4 = _AVX512_FMA(a4_4, h_5_4, a5_4);
2307 :
2308 97280 : v4 = _AVX512_FMA(a3_4, h_5_3, v4);
2309 97280 : v4 = _AVX512_FMA(a2_4, h_5_2, v4);
2310 97280 : v4 = _AVX512_FMA(a1_4, h_5_1, v4);
2311 :
2312 : // register __AVX512_DATATYPE w4 = _AVX512_FMA(a3_4, h_4_3, a4_4);
2313 97280 : __AVX512_DATATYPE w4 = _AVX512_FMA(a3_4, h_4_3, a4_4);
2314 :
2315 97280 : w4 = _AVX512_FMA(a2_4, h_4_2, w4);
2316 97280 : w4 = _AVX512_FMA(a1_4, h_4_1, w4);
2317 :
2318 : // register __AVX512_DATATYPE z4 = _AVX512_FMA(a2_4, h_3_2, a3_4);
2319 97280 : __AVX512_DATATYPE z4 = _AVX512_FMA(a2_4, h_3_2, a3_4);
2320 :
2321 97280 : z4 = _AVX512_FMA(a1_4, h_3_1, z4);
2322 : // register __AVX512_DATATYPE y4 = _AVX512_FMA(a1_4, h_2_1, a2_4);
2323 97280 : __AVX512_DATATYPE y4 = _AVX512_FMA(a1_4, h_2_1, a2_4);
2324 :
2325 :
2326 : // register __AVX512_DATATYPE x4 = a1_4;
2327 97280 : __AVX512_DATATYPE x4 = a1_4;
2328 :
2329 :
2330 : __AVX512_DATATYPE q1;
2331 : __AVX512_DATATYPE q2;
2332 : __AVX512_DATATYPE q3;
2333 : __AVX512_DATATYPE q4;
2334 :
2335 : __AVX512_DATATYPE h1;
2336 : __AVX512_DATATYPE h2;
2337 : __AVX512_DATATYPE h3;
2338 : __AVX512_DATATYPE h4;
2339 : __AVX512_DATATYPE h5;
2340 : __AVX512_DATATYPE h6;
2341 :
2342 5739520 : for(i = 6; i < nb; i++)
2343 : {
2344 11284480 : h1 = _AVX512_SET1(hh[i-5]);
2345 11284480 : q1 = _AVX512_LOAD(&q[i*ldq]);
2346 11284480 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
2347 11284480 : q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
2348 11284480 : q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
2349 :
2350 5642240 : x1 = _AVX512_FMA(q1, h1, x1);
2351 5642240 : x2 = _AVX512_FMA(q2, h1, x2);
2352 5642240 : x3 = _AVX512_FMA(q3, h1, x3);
2353 5642240 : x4 = _AVX512_FMA(q4, h1, x4);
2354 :
2355 11284480 : h2 = _AVX512_SET1(hh[ldh+i-4]);
2356 :
2357 5642240 : y1 = _AVX512_FMA(q1, h2, y1);
2358 5642240 : y2 = _AVX512_FMA(q2, h2, y2);
2359 5642240 : y3 = _AVX512_FMA(q3, h2, y3);
2360 5642240 : y4 = _AVX512_FMA(q4, h2, y4);
2361 :
2362 11284480 : h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
2363 :
2364 5642240 : z1 = _AVX512_FMA(q1, h3, z1);
2365 5642240 : z2 = _AVX512_FMA(q2, h3, z2);
2366 5642240 : z3 = _AVX512_FMA(q3, h3, z3);
2367 5642240 : z4 = _AVX512_FMA(q4, h3, z4);
2368 :
2369 11284480 : h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
2370 :
2371 5642240 : w1 = _AVX512_FMA(q1, h4, w1);
2372 5642240 : w2 = _AVX512_FMA(q2, h4, w2);
2373 5642240 : w3 = _AVX512_FMA(q3, h4, w3);
2374 5642240 : w4 = _AVX512_FMA(q4, h4, w4);
2375 :
2376 11284480 : h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
2377 :
2378 5642240 : v1 = _AVX512_FMA(q1, h5, v1);
2379 5642240 : v2 = _AVX512_FMA(q2, h5, v2);
2380 5642240 : v3 = _AVX512_FMA(q3, h5, v3);
2381 5642240 : v4 = _AVX512_FMA(q4, h5, v4);
2382 :
2383 11284480 : h6 = _AVX512_SET1(hh[(ldh*5)+i]);
2384 :
2385 5642240 : t1 = _AVX512_FMA(q1, h6, t1);
2386 5642240 : t2 = _AVX512_FMA(q2, h6, t2);
2387 5642240 : t3 = _AVX512_FMA(q3, h6, t3);
2388 5642240 : t4 = _AVX512_FMA(q4, h6, t4);
2389 : }
2390 :
2391 194560 : h1 = _AVX512_SET1(hh[nb-5]);
2392 194560 : q1 = _AVX512_LOAD(&q[nb*ldq]);
2393 194560 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
2394 194560 : q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
2395 194560 : q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
2396 :
2397 97280 : x1 = _AVX512_FMA(q1, h1, x1);
2398 97280 : x2 = _AVX512_FMA(q2, h1, x2);
2399 97280 : x3 = _AVX512_FMA(q3, h1, x3);
2400 97280 : x4 = _AVX512_FMA(q4, h1, x4);
2401 :
2402 194560 : h2 = _AVX512_SET1(hh[ldh+nb-4]);
2403 :
2404 97280 : y1 = _AVX512_FMA(q1, h2, y1);
2405 97280 : y2 = _AVX512_FMA(q2, h2, y2);
2406 97280 : y3 = _AVX512_FMA(q3, h2, y3);
2407 97280 : y4 = _AVX512_FMA(q4, h2, y4);
2408 :
2409 194560 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
2410 :
2411 97280 : z1 = _AVX512_FMA(q1, h3, z1);
2412 97280 : z2 = _AVX512_FMA(q2, h3, z2);
2413 97280 : z3 = _AVX512_FMA(q3, h3, z3);
2414 97280 : z4 = _AVX512_FMA(q4, h3, z4);
2415 :
2416 194560 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
2417 :
2418 97280 : w1 = _AVX512_FMA(q1, h4, w1);
2419 97280 : w2 = _AVX512_FMA(q2, h4, w2);
2420 97280 : w3 = _AVX512_FMA(q3, h4, w3);
2421 97280 : w4 = _AVX512_FMA(q4, h4, w4);
2422 :
2423 194560 : h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
2424 :
2425 97280 : v1 = _AVX512_FMA(q1, h5, v1);
2426 97280 : v2 = _AVX512_FMA(q2, h5, v2);
2427 97280 : v3 = _AVX512_FMA(q3, h5, v3);
2428 97280 : v4 = _AVX512_FMA(q4, h5, v4);
2429 :
2430 194560 : h1 = _AVX512_SET1(hh[nb-4]);
2431 :
2432 194560 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
2433 194560 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
2434 194560 : q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
2435 194560 : q4 = _AVX512_LOAD(&q[((nb+1)*ldq)+3*offset]);
2436 :
2437 97280 : x1 = _AVX512_FMA(q1, h1, x1);
2438 97280 : x2 = _AVX512_FMA(q2, h1, x2);
2439 97280 : x3 = _AVX512_FMA(q3, h1, x3);
2440 97280 : x4 = _AVX512_FMA(q4, h1, x4);
2441 :
2442 194560 : h2 = _AVX512_SET1(hh[ldh+nb-3]);
2443 :
2444 97280 : y1 = _AVX512_FMA(q1, h2, y1);
2445 97280 : y2 = _AVX512_FMA(q2, h2, y2);
2446 97280 : y3 = _AVX512_FMA(q3, h2, y3);
2447 97280 : y4 = _AVX512_FMA(q4, h2, y4);
2448 :
2449 194560 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
2450 :
2451 97280 : z1 = _AVX512_FMA(q1, h3, z1);
2452 97280 : z2 = _AVX512_FMA(q2, h3, z2);
2453 97280 : z3 = _AVX512_FMA(q3, h3, z3);
2454 97280 : z4 = _AVX512_FMA(q4, h3, z4);
2455 :
2456 194560 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
2457 :
2458 97280 : w1 = _AVX512_FMA(q1, h4, w1);
2459 97280 : w2 = _AVX512_FMA(q2, h4, w2);
2460 97280 : w3 = _AVX512_FMA(q3, h4, w3);
2461 97280 : w4 = _AVX512_FMA(q4, h4, w4);
2462 :
2463 194560 : h1 = _AVX512_SET1(hh[nb-3]);
2464 194560 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
2465 194560 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
2466 194560 : q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
2467 194560 : q4 = _AVX512_LOAD(&q[((nb+2)*ldq)+3*offset]);
2468 :
2469 97280 : x1 = _AVX512_FMA(q1, h1, x1);
2470 97280 : x2 = _AVX512_FMA(q2, h1, x2);
2471 97280 : x3 = _AVX512_FMA(q3, h1, x3);
2472 97280 : x4 = _AVX512_FMA(q4, h1, x4);
2473 :
2474 194560 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
2475 :
2476 97280 : y1 = _AVX512_FMA(q1, h2, y1);
2477 97280 : y2 = _AVX512_FMA(q2, h2, y2);
2478 97280 : y3 = _AVX512_FMA(q3, h2, y3);
2479 97280 : y4 = _AVX512_FMA(q4, h2, y4);
2480 :
2481 194560 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
2482 :
2483 97280 : z1 = _AVX512_FMA(q1, h3, z1);
2484 97280 : z2 = _AVX512_FMA(q2, h3, z2);
2485 97280 : z3 = _AVX512_FMA(q3, h3, z3);
2486 97280 : z4 = _AVX512_FMA(q4, h3, z4);
2487 :
2488 194560 : h1 = _AVX512_SET1(hh[nb-2]);
2489 194560 : q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
2490 194560 : q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
2491 194560 : q3 = _AVX512_LOAD(&q[((nb+3)*ldq)+2*offset]);
2492 194560 : q4 = _AVX512_LOAD(&q[((nb+3)*ldq)+3*offset]);
2493 :
2494 97280 : x1 = _AVX512_FMA(q1, h1, x1);
2495 97280 : x2 = _AVX512_FMA(q2, h1, x2);
2496 97280 : x3 = _AVX512_FMA(q3, h1, x3);
2497 97280 : x4 = _AVX512_FMA(q4, h1, x4);
2498 :
2499 194560 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
2500 :
2501 97280 : y1 = _AVX512_FMA(q1, h2, y1);
2502 97280 : y2 = _AVX512_FMA(q2, h2, y2);
2503 97280 : y3 = _AVX512_FMA(q3, h2, y3);
2504 97280 : y4 = _AVX512_FMA(q4, h2, y4);
2505 :
2506 194560 : h1 = _AVX512_SET1(hh[nb-1]);
2507 194560 : q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
2508 194560 : q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
2509 194560 : q3 = _AVX512_LOAD(&q[((nb+4)*ldq)+2*offset]);
2510 194560 : q4 = _AVX512_LOAD(&q[((nb+4)*ldq)+3*offset]);
2511 :
2512 97280 : x1 = _AVX512_FMA(q1, h1, x1);
2513 97280 : x2 = _AVX512_FMA(q2, h1, x2);
2514 97280 : x3 = _AVX512_FMA(q3, h1, x3);
2515 97280 : x4 = _AVX512_FMA(q4, h1, x4);
2516 :
2517 : /////////////////////////////////////////////////////
2518 : // Apply tau, correct wrong calculation using pre-calculated scalar products
2519 : /////////////////////////////////////////////////////
2520 :
2521 194560 : __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
2522 97280 : x1 = _AVX512_MUL(x1, tau1);
2523 97280 : x2 = _AVX512_MUL(x2, tau1);
2524 97280 : x3 = _AVX512_MUL(x3, tau1);
2525 97280 : x4 = _AVX512_MUL(x4, tau1);
2526 :
2527 194560 : __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
2528 194560 : __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
2529 97280 : h2 = _AVX512_MUL(tau2, vs_1_2);
2530 :
2531 194560 : y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
2532 194560 : y2 = _AVX512_FMSUB(y2, tau2, _AVX512_MUL(x2,h2));
2533 194560 : y3 = _AVX512_FMSUB(y3, tau2, _AVX512_MUL(x3,h2));
2534 194560 : y4 = _AVX512_FMSUB(y4, tau2, _AVX512_MUL(x4,h2));
2535 :
2536 194560 : __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
2537 194560 : __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
2538 194560 : __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
2539 :
2540 97280 : h2 = _AVX512_MUL(tau3, vs_1_3);
2541 97280 : h3 = _AVX512_MUL(tau3, vs_2_3);
2542 :
2543 291840 : z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
2544 291840 : z2 = _AVX512_FMSUB(z2, tau3, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
2545 291840 : z3 = _AVX512_FMSUB(z3, tau3, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)));
2546 291840 : z4 = _AVX512_FMSUB(z4, tau3, _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2)));
2547 :
2548 194560 : __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
2549 194560 : __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
2550 194560 : __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
2551 :
2552 97280 : h2 = _AVX512_MUL(tau4, vs_1_4);
2553 97280 : h3 = _AVX512_MUL(tau4, vs_2_4);
2554 :
2555 194560 : __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
2556 97280 : h4 = _AVX512_MUL(tau4, vs_3_4);
2557 :
2558 389120 : w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
2559 389120 : w2 = _AVX512_FMSUB(w2, tau4, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
2560 389120 : w3 = _AVX512_FMSUB(w3, tau4, _AVX512_FMA(z3, h4, _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
2561 389120 : w4 = _AVX512_FMSUB(w4, tau4, _AVX512_FMA(z4, h4, _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2))));
2562 :
2563 194560 : __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
2564 194560 : __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
2565 194560 : __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
2566 :
2567 97280 : h2 = _AVX512_MUL(tau5, vs_1_5);
2568 97280 : h3 = _AVX512_MUL(tau5, vs_2_5);
2569 :
2570 194560 : __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
2571 194560 : __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
2572 :
2573 97280 : h4 = _AVX512_MUL(tau5, vs_3_5);
2574 97280 : h5 = _AVX512_MUL(tau5, vs_4_5);
2575 :
2576 583680 : v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
2577 583680 : v2 = _AVX512_FMSUB(v2, tau5, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
2578 583680 : v3 = _AVX512_FMSUB(v3, tau5, _AVX512_ADD(_AVX512_FMA(w3, h5, _AVX512_MUL(z3,h4)), _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2))));
2579 583680 : v4 = _AVX512_FMSUB(v4, tau5, _AVX512_ADD(_AVX512_FMA(w4, h5, _AVX512_MUL(z4,h4)), _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2))));
2580 :
2581 194560 : __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
2582 194560 : __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
2583 194560 : __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
2584 97280 : h2 = _AVX512_MUL(tau6, vs_1_6);
2585 97280 : h3 = _AVX512_MUL(tau6, vs_2_6);
2586 :
2587 194560 : __AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
2588 194560 : __AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
2589 194560 : __AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);
2590 :
2591 97280 : h4 = _AVX512_MUL(tau6, vs_3_6);
2592 97280 : h5 = _AVX512_MUL(tau6, vs_4_6);
2593 97280 : h6 = _AVX512_MUL(tau6, vs_5_6);
2594 :
2595 680960 : t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));
2596 680960 : t2 = _AVX512_FMSUB(t2, tau6, _AVX512_FMA(v2, h6, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)))));
2597 680960 : t3 = _AVX512_FMSUB(t3, tau6, _AVX512_FMA(v3, h6, _AVX512_ADD(_AVX512_FMA(w3, h5, _AVX512_MUL(z3,h4)), _AVX512_FMA(y3, h3, _AVX512_MUL(x3,h2)))));
2598 680960 : t4 = _AVX512_FMSUB(t4, tau6, _AVX512_FMA(v4, h6, _AVX512_ADD(_AVX512_FMA(w4, h5, _AVX512_MUL(z4,h4)), _AVX512_FMA(y4, h3, _AVX512_MUL(x4,h2)))));
2599 :
2600 :
2601 : /////////////////////////////////////////////////////
2602 : // Rank-1 update of Q [8 x nb+3]
2603 : /////////////////////////////////////////////////////
2604 :
2605 97280 : q1 = _AVX512_LOAD(&q[0]);
2606 194560 : q2 = _AVX512_LOAD(&q[0+offset]);
2607 194560 : q3 = _AVX512_LOAD(&q[0+2*offset]);
2608 194560 : q4 = _AVX512_LOAD(&q[0+3*offset]);
2609 :
2610 97280 : q1 = _AVX512_SUB(q1, t1);
2611 97280 : q2 = _AVX512_SUB(q2, t2);
2612 97280 : q3 = _AVX512_SUB(q3, t3);
2613 97280 : q4 = _AVX512_SUB(q4, t4);
2614 :
2615 : _AVX512_STORE(&q[0],q1);
2616 97280 : _AVX512_STORE(&q[0+offset],q2);
2617 97280 : _AVX512_STORE(&q[0+2*offset],q3);
2618 97280 : _AVX512_STORE(&q[0+3*offset],q4);
2619 :
2620 194560 : h6 = _AVX512_SET1(hh[(ldh*5)+1]);
2621 194560 : q1 = _AVX512_LOAD(&q[ldq]);
2622 194560 : q2 = _AVX512_LOAD(&q[ldq+offset]);
2623 194560 : q3 = _AVX512_LOAD(&q[ldq+2*offset]);
2624 194560 : q4 = _AVX512_LOAD(&q[ldq+3*offset]);
2625 :
2626 97280 : q1 = _AVX512_SUB(q1, v1);
2627 97280 : q2 = _AVX512_SUB(q2, v2);
2628 97280 : q3 = _AVX512_SUB(q3, v3);
2629 97280 : q4 = _AVX512_SUB(q4, v4);
2630 :
2631 97280 : q1 = _AVX512_NFMA(t1, h6, q1);
2632 97280 : q2 = _AVX512_NFMA(t2, h6, q2);
2633 97280 : q3 = _AVX512_NFMA(t3, h6, q3);
2634 97280 : q4 = _AVX512_NFMA(t4, h6, q4);
2635 :
2636 97280 : _AVX512_STORE(&q[ldq],q1);
2637 97280 : _AVX512_STORE(&q[ldq+offset],q2);
2638 97280 : _AVX512_STORE(&q[ldq+2*offset],q3);
2639 97280 : _AVX512_STORE(&q[ldq+3*offset],q4);
2640 :
2641 194560 : h5 = _AVX512_SET1(hh[(ldh*4)+1]);
2642 194560 : q1 = _AVX512_LOAD(&q[ldq*2]);
2643 194560 : q2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
2644 194560 : q3 = _AVX512_LOAD(&q[(ldq*2)+2*offset]);
2645 194560 : q4 = _AVX512_LOAD(&q[(ldq*2)+3*offset]);
2646 :
2647 97280 : q1 = _AVX512_SUB(q1, w1);
2648 97280 : q2 = _AVX512_SUB(q2, w2);
2649 97280 : q3 = _AVX512_SUB(q3, w3);
2650 97280 : q4 = _AVX512_SUB(q4, w4);
2651 :
2652 97280 : q1 = _AVX512_NFMA(v1, h5, q1);
2653 97280 : q2 = _AVX512_NFMA(v2, h5, q2);
2654 97280 : q3 = _AVX512_NFMA(v3, h5, q3);
2655 97280 : q4 = _AVX512_NFMA(v4, h5, q4);
2656 :
2657 194560 : h6 = _AVX512_SET1(hh[(ldh*5)+2]);
2658 :
2659 97280 : q1 = _AVX512_NFMA(t1, h6, q1);
2660 97280 : q2 = _AVX512_NFMA(t2, h6, q2);
2661 97280 : q3 = _AVX512_NFMA(t3, h6, q3);
2662 97280 : q4 = _AVX512_NFMA(t4, h6, q4);
2663 :
2664 97280 : _AVX512_STORE(&q[ldq*2],q1);
2665 97280 : _AVX512_STORE(&q[(ldq*2)+offset],q2);
2666 97280 : _AVX512_STORE(&q[(ldq*2)+2*offset],q3);
2667 97280 : _AVX512_STORE(&q[(ldq*2)+3*offset],q4);
2668 :
2669 194560 : h4 = _AVX512_SET1(hh[(ldh*3)+1]);
2670 194560 : q1 = _AVX512_LOAD(&q[ldq*3]);
2671 194560 : q2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
2672 194560 : q3 = _AVX512_LOAD(&q[(ldq*3)+2*offset]);
2673 194560 : q4 = _AVX512_LOAD(&q[(ldq*3)+3*offset]);
2674 :
2675 97280 : q1 = _AVX512_SUB(q1, z1);
2676 97280 : q2 = _AVX512_SUB(q2, z2);
2677 97280 : q3 = _AVX512_SUB(q3, z3);
2678 97280 : q4 = _AVX512_SUB(q4, z4);
2679 :
2680 97280 : q1 = _AVX512_NFMA(w1, h4, q1);
2681 97280 : q2 = _AVX512_NFMA(w2, h4, q2);
2682 97280 : q3 = _AVX512_NFMA(w3, h4, q3);
2683 97280 : q4 = _AVX512_NFMA(w4, h4, q4);
2684 :
2685 194560 : h5 = _AVX512_SET1(hh[(ldh*4)+2]);
2686 :
2687 97280 : q1 = _AVX512_NFMA(v1, h5, q1);
2688 97280 : q2 = _AVX512_NFMA(v2, h5, q2);
2689 97280 : q3 = _AVX512_NFMA(v3, h5, q3);
2690 97280 : q4 = _AVX512_NFMA(v4, h5, q4);
2691 :
2692 194560 : h6 = _AVX512_SET1(hh[(ldh*5)+3]);
2693 :
2694 97280 : q1 = _AVX512_NFMA(t1, h6, q1);
2695 97280 : q2 = _AVX512_NFMA(t2, h6, q2);
2696 97280 : q3 = _AVX512_NFMA(t3, h6, q3);
2697 97280 : q4 = _AVX512_NFMA(t4, h6, q4);
2698 :
2699 97280 : _AVX512_STORE(&q[ldq*3],q1);
2700 97280 : _AVX512_STORE(&q[(ldq*3)+offset],q2);
2701 97280 : _AVX512_STORE(&q[(ldq*3)+2*offset],q3);
2702 97280 : _AVX512_STORE(&q[(ldq*3)+3*offset],q4);
2703 :
2704 194560 : h3 = _AVX512_SET1(hh[(ldh*2)+1]);
2705 194560 : q1 = _AVX512_LOAD(&q[ldq*4]);
2706 194560 : q2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
2707 194560 : q3 = _AVX512_LOAD(&q[(ldq*4)+2*offset]);
2708 194560 : q4 = _AVX512_LOAD(&q[(ldq*4)+3*offset]);
2709 :
2710 97280 : q1 = _AVX512_SUB(q1, y1);
2711 97280 : q2 = _AVX512_SUB(q2, y2);
2712 97280 : q3 = _AVX512_SUB(q3, y3);
2713 97280 : q4 = _AVX512_SUB(q4, y4);
2714 :
2715 97280 : q1 = _AVX512_NFMA(z1, h3, q1);
2716 97280 : q2 = _AVX512_NFMA(z2, h3, q2);
2717 97280 : q3 = _AVX512_NFMA(z3, h3, q3);
2718 97280 : q4 = _AVX512_NFMA(z4, h3, q4);
2719 :
2720 194560 : h4 = _AVX512_SET1(hh[(ldh*3)+2]);
2721 :
2722 97280 : q1 = _AVX512_NFMA(w1, h4, q1);
2723 97280 : q2 = _AVX512_NFMA(w2, h4, q2);
2724 97280 : q3 = _AVX512_NFMA(w3, h4, q3);
2725 97280 : q4 = _AVX512_NFMA(w4, h4, q4);
2726 :
2727 194560 : h5 = _AVX512_SET1(hh[(ldh*4)+3]);
2728 :
2729 97280 : q1 = _AVX512_NFMA(v1, h5, q1);
2730 97280 : q2 = _AVX512_NFMA(v2, h5, q2);
2731 97280 : q3 = _AVX512_NFMA(v3, h5, q3);
2732 97280 : q4 = _AVX512_NFMA(v4, h5, q4);
2733 :
2734 194560 : h6 = _AVX512_SET1(hh[(ldh*5)+4]);
2735 :
2736 97280 : q1 = _AVX512_NFMA(t1, h6, q1);
2737 97280 : q2 = _AVX512_NFMA(t2, h6, q2);
2738 97280 : q3 = _AVX512_NFMA(t3, h6, q3);
2739 97280 : q4 = _AVX512_NFMA(t4, h6, q4);
2740 :
2741 97280 : _AVX512_STORE(&q[ldq*4],q1);
2742 97280 : _AVX512_STORE(&q[(ldq*4)+offset],q2);
2743 97280 : _AVX512_STORE(&q[(ldq*4)+2*offset],q3);
2744 97280 : _AVX512_STORE(&q[(ldq*4)+3*offset],q4);
2745 :
2746 194560 : h2 = _AVX512_SET1(hh[(ldh)+1]);
2747 194560 : q1 = _AVX512_LOAD(&q[ldq*5]);
2748 194560 : q2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
2749 194560 : q3 = _AVX512_LOAD(&q[(ldq*5)+2*offset]);
2750 194560 : q4 = _AVX512_LOAD(&q[(ldq*5)+3*offset]);
2751 :
2752 97280 : q1 = _AVX512_SUB(q1, x1);
2753 97280 : q2 = _AVX512_SUB(q2, x2);
2754 97280 : q3 = _AVX512_SUB(q3, x3);
2755 97280 : q4 = _AVX512_SUB(q4, x4);
2756 :
2757 97280 : q1 = _AVX512_NFMA(y1, h2, q1);
2758 97280 : q2 = _AVX512_NFMA(y2, h2, q2);
2759 97280 : q3 = _AVX512_NFMA(y3, h2, q3);
2760 97280 : q4 = _AVX512_NFMA(y4, h2, q4);
2761 :
2762 194560 : h3 = _AVX512_SET1(hh[(ldh*2)+2]);
2763 :
2764 97280 : q1 = _AVX512_NFMA(z1, h3, q1);
2765 97280 : q2 = _AVX512_NFMA(z2, h3, q2);
2766 97280 : q3 = _AVX512_NFMA(z3, h3, q3);
2767 97280 : q4 = _AVX512_NFMA(z4, h3, q4);
2768 :
2769 194560 : h4 = _AVX512_SET1(hh[(ldh*3)+3]);
2770 :
2771 97280 : q1 = _AVX512_NFMA(w1, h4, q1);
2772 97280 : q2 = _AVX512_NFMA(w2, h4, q2);
2773 97280 : q3 = _AVX512_NFMA(w3, h4, q3);
2774 97280 : q4 = _AVX512_NFMA(w4, h4, q4);
2775 :
2776 194560 : h5 = _AVX512_SET1(hh[(ldh*4)+4]);
2777 :
2778 97280 : q1 = _AVX512_NFMA(v1, h5, q1);
2779 97280 : q2 = _AVX512_NFMA(v2, h5, q2);
2780 97280 : q3 = _AVX512_NFMA(v3, h5, q3);
2781 97280 : q4 = _AVX512_NFMA(v4, h5, q4);
2782 :
2783 194560 : h6 = _AVX512_SET1(hh[(ldh*5)+5]);
2784 :
2785 97280 : q1 = _AVX512_NFMA(t1, h6, q1);
2786 97280 : q2 = _AVX512_NFMA(t2, h6, q2);
2787 97280 : q3 = _AVX512_NFMA(t3, h6, q3);
2788 97280 : q4 = _AVX512_NFMA(t4, h6, q4);
2789 :
2790 97280 : _AVX512_STORE(&q[ldq*5],q1);
2791 97280 : _AVX512_STORE(&q[(ldq*5)+offset],q2);
2792 97280 : _AVX512_STORE(&q[(ldq*5)+2*offset],q3);
2793 97280 : _AVX512_STORE(&q[(ldq*5)+3*offset],q4);
2794 :
2795 5739520 : for (i = 6; i < nb; i++)
2796 : {
2797 11284480 : q1 = _AVX512_LOAD(&q[i*ldq]);
2798 11284480 : q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
2799 11284480 : q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
2800 11284480 : q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
2801 :
2802 11284480 : h1 = _AVX512_SET1(hh[i-5]);
2803 :
2804 5642240 : q1 = _AVX512_NFMA(x1, h1, q1);
2805 5642240 : q2 = _AVX512_NFMA(x2, h1, q2);
2806 5642240 : q3 = _AVX512_NFMA(x3, h1, q3);
2807 5642240 : q4 = _AVX512_NFMA(x4, h1, q4);
2808 :
2809 11284480 : h2 = _AVX512_SET1(hh[ldh+i-4]);
2810 :
2811 5642240 : q1 = _AVX512_NFMA(y1, h2, q1);
2812 5642240 : q2 = _AVX512_NFMA(y2, h2, q2);
2813 5642240 : q3 = _AVX512_NFMA(y3, h2, q3);
2814 5642240 : q4 = _AVX512_NFMA(y4, h2, q4);
2815 :
2816 11284480 : h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
2817 :
2818 5642240 : q1 = _AVX512_NFMA(z1, h3, q1);
2819 5642240 : q2 = _AVX512_NFMA(z2, h3, q2);
2820 5642240 : q3 = _AVX512_NFMA(z3, h3, q3);
2821 5642240 : q4 = _AVX512_NFMA(z4, h3, q4);
2822 :
2823 11284480 : h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
2824 :
2825 5642240 : q1 = _AVX512_NFMA(w1, h4, q1);
2826 5642240 : q2 = _AVX512_NFMA(w2, h4, q2);
2827 5642240 : q3 = _AVX512_NFMA(w3, h4, q3);
2828 5642240 : q4 = _AVX512_NFMA(w4, h4, q4);
2829 :
2830 11284480 : h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
2831 :
2832 5642240 : q1 = _AVX512_NFMA(v1, h5, q1);
2833 5642240 : q2 = _AVX512_NFMA(v2, h5, q2);
2834 5642240 : q3 = _AVX512_NFMA(v3, h5, q3);
2835 5642240 : q4 = _AVX512_NFMA(v4, h5, q4);
2836 :
2837 11284480 : h6 = _AVX512_SET1(hh[(ldh*5)+i]);
2838 :
2839 5642240 : q1 = _AVX512_NFMA(t1, h6, q1);
2840 5642240 : q2 = _AVX512_NFMA(t2, h6, q2);
2841 5642240 : q3 = _AVX512_NFMA(t3, h6, q3);
2842 5642240 : q4 = _AVX512_NFMA(t4, h6, q4);
2843 :
2844 5642240 : _AVX512_STORE(&q[i*ldq],q1);
2845 5642240 : _AVX512_STORE(&q[(i*ldq)+offset],q2);
2846 5642240 : _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
2847 5642240 : _AVX512_STORE(&q[(i*ldq)+3*offset],q4);
2848 :
2849 : }
2850 :
2851 194560 : h1 = _AVX512_SET1(hh[nb-5]);
2852 194560 : q1 = _AVX512_LOAD(&q[nb*ldq]);
2853 194560 : q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
2854 194560 : q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
2855 194560 : q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
2856 :
2857 97280 : q1 = _AVX512_NFMA(x1, h1, q1);
2858 97280 : q2 = _AVX512_NFMA(x2, h1, q2);
2859 97280 : q3 = _AVX512_NFMA(x3, h1, q3);
2860 97280 : q4 = _AVX512_NFMA(x4, h1, q4);
2861 :
2862 194560 : h2 = _AVX512_SET1(hh[ldh+nb-4]);
2863 :
2864 97280 : q1 = _AVX512_NFMA(y1, h2, q1);
2865 97280 : q2 = _AVX512_NFMA(y2, h2, q2);
2866 97280 : q3 = _AVX512_NFMA(y3, h2, q3);
2867 97280 : q4 = _AVX512_NFMA(y4, h2, q4);
2868 :
2869 194560 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
2870 :
2871 97280 : q1 = _AVX512_NFMA(z1, h3, q1);
2872 97280 : q2 = _AVX512_NFMA(z2, h3, q2);
2873 97280 : q3 = _AVX512_NFMA(z3, h3, q3);
2874 97280 : q4 = _AVX512_NFMA(z4, h3, q4);
2875 :
2876 194560 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
2877 :
2878 97280 : q1 = _AVX512_NFMA(w1, h4, q1);
2879 97280 : q2 = _AVX512_NFMA(w2, h4, q2);
2880 97280 : q3 = _AVX512_NFMA(w3, h4, q3);
2881 97280 : q4 = _AVX512_NFMA(w4, h4, q4);
2882 :
2883 194560 : h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
2884 :
2885 97280 : q1 = _AVX512_NFMA(v1, h5, q1);
2886 97280 : q2 = _AVX512_NFMA(v2, h5, q2);
2887 97280 : q3 = _AVX512_NFMA(v3, h5, q3);
2888 97280 : q4 = _AVX512_NFMA(v4, h5, q4);
2889 :
2890 97280 : _AVX512_STORE(&q[nb*ldq],q1);
2891 97280 : _AVX512_STORE(&q[(nb*ldq)+offset],q2);
2892 97280 : _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
2893 97280 : _AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
2894 :
2895 194560 : h1 = _AVX512_SET1(hh[nb-4]);
2896 194560 : q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
2897 194560 : q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
2898 194560 : q3 = _AVX512_LOAD(&q[((nb+1)*ldq)+2*offset]);
2899 194560 : q4 = _AVX512_LOAD(&q[((nb+1)*ldq)+3*offset]);
2900 :
2901 97280 : q1 = _AVX512_NFMA(x1, h1, q1);
2902 97280 : q2 = _AVX512_NFMA(x2, h1, q2);
2903 97280 : q3 = _AVX512_NFMA(x3, h1, q3);
2904 97280 : q4 = _AVX512_NFMA(x4, h1, q4);
2905 :
2906 194560 : h2 = _AVX512_SET1(hh[ldh+nb-3]);
2907 :
2908 97280 : q1 = _AVX512_NFMA(y1, h2, q1);
2909 97280 : q2 = _AVX512_NFMA(y2, h2, q2);
2910 97280 : q3 = _AVX512_NFMA(y3, h2, q3);
2911 97280 : q4 = _AVX512_NFMA(y4, h2, q4);
2912 :
2913 194560 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
2914 :
2915 97280 : q1 = _AVX512_NFMA(z1, h3, q1);
2916 97280 : q2 = _AVX512_NFMA(z2, h3, q2);
2917 97280 : q3 = _AVX512_NFMA(z3, h3, q3);
2918 97280 : q4 = _AVX512_NFMA(z4, h3, q4);
2919 :
2920 194560 : h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
2921 :
2922 97280 : q1 = _AVX512_NFMA(w1, h4, q1);
2923 97280 : q2 = _AVX512_NFMA(w2, h4, q2);
2924 97280 : q3 = _AVX512_NFMA(w3, h4, q3);
2925 97280 : q4 = _AVX512_NFMA(w4, h4, q4);
2926 :
2927 97280 : _AVX512_STORE(&q[(nb+1)*ldq],q1);
2928 97280 : _AVX512_STORE(&q[((nb+1)*ldq)+offset],q2);
2929 97280 : _AVX512_STORE(&q[((nb+1)*ldq)+2*offset],q3);
2930 97280 : _AVX512_STORE(&q[((nb+1)*ldq)+3*offset],q4);
2931 :
2932 194560 : h1 = _AVX512_SET1(hh[nb-3]);
2933 194560 : q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
2934 194560 : q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
2935 194560 : q3 = _AVX512_LOAD(&q[((nb+2)*ldq)+2*offset]);
2936 194560 : q4 = _AVX512_LOAD(&q[((nb+2)*ldq)+3*offset]);
2937 :
2938 97280 : q1 = _AVX512_NFMA(x1, h1, q1);
2939 97280 : q2 = _AVX512_NFMA(x2, h1, q2);
2940 97280 : q3 = _AVX512_NFMA(x3, h1, q3);
2941 97280 : q4 = _AVX512_NFMA(x4, h1, q4);
2942 :
2943 194560 : h2 = _AVX512_SET1(hh[ldh+nb-2]);
2944 :
2945 97280 : q1 = _AVX512_NFMA(y1, h2, q1);
2946 97280 : q2 = _AVX512_NFMA(y2, h2, q2);
2947 97280 : q3 = _AVX512_NFMA(y3, h2, q3);
2948 97280 : q4 = _AVX512_NFMA(y4, h2, q4);
2949 :
2950 194560 : h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
2951 :
2952 97280 : q1 = _AVX512_NFMA(z1, h3, q1);
2953 97280 : q2 = _AVX512_NFMA(z2, h3, q2);
2954 97280 : q3 = _AVX512_NFMA(z3, h3, q3);
2955 97280 : q4 = _AVX512_NFMA(z4, h3, q4);
2956 :
2957 97280 : _AVX512_STORE(&q[(nb+2)*ldq],q1);
2958 97280 : _AVX512_STORE(&q[((nb+2)*ldq)+offset],q2);
2959 97280 : _AVX512_STORE(&q[((nb+2)*ldq)+2*offset],q3);
2960 97280 : _AVX512_STORE(&q[((nb+2)*ldq)+3*offset],q4);
2961 :
2962 194560 : h1 = _AVX512_SET1(hh[nb-2]);
2963 194560 : q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
2964 194560 : q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
2965 194560 : q3 = _AVX512_LOAD(&q[((nb+3)*ldq)+2*offset]);
2966 194560 : q4 = _AVX512_LOAD(&q[((nb+3)*ldq)+3*offset]);
2967 :
2968 97280 : q1 = _AVX512_NFMA(x1, h1, q1);
2969 97280 : q2 = _AVX512_NFMA(x2, h1, q2);
2970 97280 : q3 = _AVX512_NFMA(x3, h1, q3);
2971 97280 : q4 = _AVX512_NFMA(x4, h1, q4);
2972 :
2973 194560 : h2 = _AVX512_SET1(hh[ldh+nb-1]);
2974 :
2975 97280 : q1 = _AVX512_NFMA(y1, h2, q1);
2976 97280 : q2 = _AVX512_NFMA(y2, h2, q2);
2977 97280 : q3 = _AVX512_NFMA(y3, h2, q3);
2978 97280 : q4 = _AVX512_NFMA(y4, h2, q4);
2979 :
2980 97280 : _AVX512_STORE(&q[(nb+3)*ldq],q1);
2981 97280 : _AVX512_STORE(&q[((nb+3)*ldq)+offset],q2);
2982 97280 : _AVX512_STORE(&q[((nb+3)*ldq)+2*offset],q3);
2983 97280 : _AVX512_STORE(&q[((nb+3)*ldq)+3*offset],q4);
2984 :
2985 194560 : h1 = _AVX512_SET1(hh[nb-1]);
2986 194560 : q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
2987 194560 : q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
2988 194560 : q3 = _AVX512_LOAD(&q[((nb+4)*ldq)+2*offset]);
2989 194560 : q4 = _AVX512_LOAD(&q[((nb+4)*ldq)+3*offset]);
2990 :
2991 97280 : q1 = _AVX512_NFMA(x1, h1, q1);
2992 97280 : q2 = _AVX512_NFMA(x2, h1, q2);
2993 97280 : q3 = _AVX512_NFMA(x3, h1, q3);
2994 97280 : q4 = _AVX512_NFMA(x4, h1, q4);
2995 :
2996 97280 : _AVX512_STORE(&q[(nb+4)*ldq],q1);
2997 97280 : _AVX512_STORE(&q[((nb+4)*ldq)+offset],q2);
2998 97280 : _AVX512_STORE(&q[((nb+4)*ldq)+2*offset],q3);
2999 97280 : _AVX512_STORE(&q[((nb+4)*ldq)+3*offset],q4);
3000 :
3001 : }
3002 :
|