// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
//   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//   Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
//   Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//   and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute-intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

#define __forceinline __attribute__((always_inline)) static

#ifdef DOUBLE_PRECISION_REAL

#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_MUL _mm256_mul_pd
#define _AVX_BROADCAST _mm256_broadcast_sd
#define _AVX_XOR _mm256_xor_pd

#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#endif
#endif

#define _AVX_FMA _mm256_FMA_pd

#endif /* DOUBLE_PRECISION_REAL */

#ifdef SINGLE_PRECISION_REAL

#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_BROADCAST _mm256_broadcast_ss
#define _AVX_XOR _mm256_xor_ps

#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
#endif
#endif

#define _AVX_FMA _mm256_FMA_ps
#endif /* SINGLE_PRECISION_REAL */
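
/* Illustration (not part of the build): with DOUBLE_PRECISION_REAL defined,
   a statement such as
       __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
   expands to
       __m256d x1 = _mm256_load_pd(&q[ldq]);
   i.e. one 256-bit register holding `offset` == 4 doubles (8 floats in the
   single-precision build). Note that _mm256_load_pd/_mm256_load_ps require
   32-byte-aligned addresses, so q and the stride ldq (counted in elements)
   must preserve that alignment from row to row. */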

#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_20_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif

#ifdef SINGLE_PRECISION_REAL
//Forward declaration
__forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_32_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_40_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_48_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif

#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef DOUBLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_real_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>              bind(C, name="double_hh_trafo_real_avx_avx2_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value :: q
!f>     real(kind=c_double) :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_real_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>              bind(C, name="double_hh_trafo_real_avx_avx2_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value :: q
!f>     real(kind=c_float) :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
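
/* Hypothetical C-side call, mirroring the Fortran binding above (a sketch;
   the sizes and the allocation of q and hh are the caller's responsibility):

     int nb = 64, nq = 24, ldq = 24, ldh = 64;
     double_hh_trafo_real_avx_avx2_2hv_double(q, hh, &nb, &nq, &ldq, &ldh);

   All integer arguments are passed by pointer to match Fortran's
   pass-by-reference convention. */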

#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
  int i;
  int nb = *pnb;
  int nq = *pldq;
  int ldq = *pldq;
  int ldh = *pldh;
  int worked_on;

  worked_on = 0;

  // Calculate the scalar product needed to apply the
  // two Householder vectors simultaneously.
#ifdef DOUBLE_PRECISION_REAL
  double s = hh[(ldh)+1]*1.0;
#endif
#ifdef SINGLE_PRECISION_REAL
  float s = hh[(ldh)+1]*1.0f;
#endif
#pragma ivdep
  for (i = 2; i < nb; i++)
  {
    s += hh[i-1] * hh[(i+ldh)];
  }
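
  /* At this point s holds the coupling scalar of the two Householder
     vectors,
         s = hh[ldh+1] + sum_{i=2..nb-1} hh[i-1] * hh[ldh+i],
     i.e. the dot product of the first vector with the (shifted) second
     one; it is what allows both reflectors to be applied in one fused
     rank-2 update inside the kernels. */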

  // Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
  for (i = 0; i < nq-20; i+=24)
  {
    hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 24;
  }
#endif
#ifdef SINGLE_PRECISION_REAL
  for (i = 0; i < nq-40; i+=48)
  {
    hh_trafo_kernel_48_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 48;
  }
#endif
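
  /* The loops above stride over full 24-row (48-row in the single-precision
     build) blocks; the chain of `nq - i == ...` tests below dispatches any
     remaining rows to the matching narrower kernel. */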

  if (nq == i)
  {
    return;
  }
#ifdef DOUBLE_PRECISION_REAL
  if (nq-i == 20)
  {
    hh_trafo_kernel_20_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 20;
  }
#endif

#ifdef SINGLE_PRECISION_REAL
  if (nq-i == 40)
  {
    hh_trafo_kernel_40_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 40;
  }
#endif

#ifdef DOUBLE_PRECISION_REAL
  if (nq-i == 16)
  {
    hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 16;
  }
#endif

#ifdef SINGLE_PRECISION_REAL
  if (nq-i == 32)
  {
    hh_trafo_kernel_32_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 32;
  }
#endif

#ifdef DOUBLE_PRECISION_REAL
  if (nq-i == 12)
  {
    hh_trafo_kernel_12_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 12;
  }
#endif

#ifdef SINGLE_PRECISION_REAL
  if (nq-i == 24)
  {
    hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 24;
  }
#endif

#ifdef DOUBLE_PRECISION_REAL
  if (nq-i == 8)
  {
    hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 8;
  }
#endif

#ifdef SINGLE_PRECISION_REAL
  if (nq-i == 16)
  {
    hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 16;
  }
#endif

#ifdef DOUBLE_PRECISION_REAL
  if (nq-i == 4)
  {
    hh_trafo_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 4;
  }
#endif

#ifdef SINGLE_PRECISION_REAL
  if (nq-i == 8)
  {
    hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 8;
  }
#endif
#ifdef WITH_DEBUG
  if (worked_on != nq)
  {
    printf("Error in real avx/avx2 BLOCK 2 kernel \n");
    abort();
  }
#endif
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 24 rows of Q simultaneously;
#endif
#ifdef SINGLE_PRECISION_REAL
 * 48 rows of Q simultaneously;
#endif
 * a matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
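/* In the double-precision build this kernel keeps six __m256d accumulators
   per column of Q (6 x 4 doubles = 24 rows); the single-precision build
   reuses the same source for 48 rows (6 x 8 floats). The narrower kernels
   further down are identical except for the number of registers carried. */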
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_48_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
  /////////////////////////////////////////////////////
  // Matrix Vector Multiplication, Q [24 x nb+1] * hh
  // hh contains two householder vectors, with offset 1
  /////////////////////////////////////////////////////
  int i;
  // Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
#endif

  __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
  __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
  __AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
  __AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
  __AVX_DATATYPE x5 = _AVX_LOAD(&q[ldq+4*offset]);
  __AVX_DATATYPE x6 = _AVX_LOAD(&q[ldq+5*offset]);

  __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
  __AVX_DATATYPE h2;

#ifdef __ELPA_USE_FMA__
  __AVX_DATATYPE q1 = _AVX_LOAD(q);
  __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
  __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
  __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
  __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
  __AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
  __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
  __AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
  __AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
  __AVX_DATATYPE y5 = _AVX_FMA(x5, h1, q5);
  __AVX_DATATYPE q6 = _AVX_LOAD(&q[5*offset]);
  __AVX_DATATYPE y6 = _AVX_FMA(x6, h1, q6);

#else
  __AVX_DATATYPE q1 = _AVX_LOAD(q);
  __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
  __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
  __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
  __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
  __AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
  __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
  __AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
  __AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
  __AVX_DATATYPE y5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
  __AVX_DATATYPE q6 = _AVX_LOAD(&q[5*offset]);
  __AVX_DATATYPE y6 = _AVX_ADD(q6, _AVX_MUL(x6, h1));

#endif

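  /* The loop below accumulates x ~ Q^T v1 and y ~ Q^T v2 in a single pass
     over the columns of Q; the two Householder vectors are offset by one
     row, which is why hh[i-1] (first vector) is paired with hh[ldh+i]
     (second vector). */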
  for(i = 2; i < nb; i++)
  {
    h1 = _AVX_BROADCAST(&hh[i-1]);
    h2 = _AVX_BROADCAST(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_LOAD(&q[i*ldq]);
    x1 = _AVX_FMA(q1, h1, x1);
    y1 = _AVX_FMA(q1, h2, y1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX_FMA(q2, h1, x2);
    y2 = _AVX_FMA(q2, h2, y2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX_FMA(q3, h1, x3);
    y3 = _AVX_FMA(q3, h2, y3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    x4 = _AVX_FMA(q4, h1, x4);
    y4 = _AVX_FMA(q4, h2, y4);
    q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
    x5 = _AVX_FMA(q5, h1, x5);
    y5 = _AVX_FMA(q5, h2, y5);
    q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
    x6 = _AVX_FMA(q6, h1, x6);
    y6 = _AVX_FMA(q6, h2, y6);

#else
    q1 = _AVX_LOAD(&q[i*ldq]);
    x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
    y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
    y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
    q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
    x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
    y5 = _AVX_ADD(y5, _AVX_MUL(q5,h2));
    q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
    x6 = _AVX_ADD(x6, _AVX_MUL(q6,h1));
    y6 = _AVX_ADD(y6, _AVX_MUL(q6,h2));

#endif
  }

  h1 = _AVX_BROADCAST(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[nb*ldq]);
  x1 = _AVX_FMA(q1, h1, x1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX_FMA(q2, h1, x2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX_FMA(q3, h1, x3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  x4 = _AVX_FMA(q4, h1, x4);
  q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
  x5 = _AVX_FMA(q5, h1, x5);
  q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
  x6 = _AVX_FMA(q6, h1, x6);

#else
  q1 = _AVX_LOAD(&q[nb*ldq]);
  x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
  q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
  x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
  q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
  x6 = _AVX_ADD(x6, _AVX_MUL(q6,h1));

#endif

  /////////////////////////////////////////////////////
  // Rank-2 update of Q [24 x nb+1]
  /////////////////////////////////////////////////////
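
  /* Sketch of the algebra implemented below (a reading aid, not extra code):
       x := -tau1 * x                     // x = -tau1 * (Q^T v1)
       y := -tau2 * y - (tau2 * s) * x    // couples the two reflectors via s
     after which each column j of Q is updated as
       Q(:,j) += x * v1(j) + y * v2(j)
     with v1 and v2 stored implicitly in hh. */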

  __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
  __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
  __AVX_DATATYPE vs = _AVX_BROADCAST(&s);

  h1 = _AVX_XOR(tau1, sign);
  x1 = _AVX_MUL(x1, h1);
  x2 = _AVX_MUL(x2, h1);
  x3 = _AVX_MUL(x3, h1);
  x4 = _AVX_MUL(x4, h1);
  x5 = _AVX_MUL(x5, h1);
  x6 = _AVX_MUL(x6, h1);

  h1 = _AVX_XOR(tau2, sign);
  h2 = _AVX_MUL(h1, vs);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
  y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
  y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
  y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
  y5 = _AVX_FMA(y5, h1, _AVX_MUL(x5,h2));
  y6 = _AVX_FMA(y6, h1, _AVX_MUL(x6,h2));

#else
  y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
  y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
  y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
  y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
  y5 = _AVX_ADD(_AVX_MUL(y5,h1), _AVX_MUL(x5,h2));
  y6 = _AVX_ADD(_AVX_MUL(y6,h1), _AVX_MUL(x6,h2));

#endif

  q1 = _AVX_LOAD(q);
  q1 = _AVX_ADD(q1, y1);
  _AVX_STORE(q,q1);
  q2 = _AVX_LOAD(&q[offset]);
  q2 = _AVX_ADD(q2, y2);
  _AVX_STORE(&q[offset],q2);
  q3 = _AVX_LOAD(&q[2*offset]);
  q3 = _AVX_ADD(q3, y3);
  _AVX_STORE(&q[2*offset],q3);
  q4 = _AVX_LOAD(&q[3*offset]);
  q4 = _AVX_ADD(q4, y4);
  _AVX_STORE(&q[3*offset],q4);
  q5 = _AVX_LOAD(&q[4*offset]);
  q5 = _AVX_ADD(q5, y5);
  _AVX_STORE(&q[4*offset],q5);
  q6 = _AVX_LOAD(&q[5*offset]);
  q6 = _AVX_ADD(q6, y6);
  _AVX_STORE(&q[5*offset],q6);

  h2 = _AVX_BROADCAST(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[ldq]);
  q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
  _AVX_STORE(&q[ldq],q1);
  q2 = _AVX_LOAD(&q[ldq+offset]);
  q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
  _AVX_STORE(&q[ldq+offset],q2);
  q3 = _AVX_LOAD(&q[ldq+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
  _AVX_STORE(&q[ldq+2*offset],q3);
  q4 = _AVX_LOAD(&q[ldq+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_FMA(y4, h2, x4));
  _AVX_STORE(&q[ldq+3*offset],q4);
  q5 = _AVX_LOAD(&q[ldq+4*offset]);
  q5 = _AVX_ADD(q5, _AVX_FMA(y5, h2, x5));
  _AVX_STORE(&q[ldq+4*offset],q5);
  q6 = _AVX_LOAD(&q[ldq+5*offset]);
  q6 = _AVX_ADD(q6, _AVX_FMA(y6, h2, x6));
  _AVX_STORE(&q[ldq+5*offset],q6);

#else
  q1 = _AVX_LOAD(&q[ldq]);
  q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
  _AVX_STORE(&q[ldq],q1);
  q2 = _AVX_LOAD(&q[ldq+offset]);
  q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
  _AVX_STORE(&q[ldq+offset],q2);
  q3 = _AVX_LOAD(&q[ldq+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
  _AVX_STORE(&q[ldq+2*offset],q3);
  q4 = _AVX_LOAD(&q[ldq+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_ADD(x4, _AVX_MUL(y4, h2)));
  _AVX_STORE(&q[ldq+3*offset],q4);
  q5 = _AVX_LOAD(&q[ldq+4*offset]);
  q5 = _AVX_ADD(q5, _AVX_ADD(x5, _AVX_MUL(y5, h2)));
  _AVX_STORE(&q[ldq+4*offset],q5);
  q6 = _AVX_LOAD(&q[ldq+5*offset]);
  q6 = _AVX_ADD(q6, _AVX_ADD(x6, _AVX_MUL(y6, h2)));
  _AVX_STORE(&q[ldq+5*offset],q6);

#endif

  for (i = 2; i < nb; i++)
  {
    h1 = _AVX_BROADCAST(&hh[i-1]);
    h2 = _AVX_BROADCAST(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_LOAD(&q[i*ldq]);
    q1 = _AVX_FMA(x1, h1, q1);
    q1 = _AVX_FMA(y1, h2, q1);
    _AVX_STORE(&q[i*ldq],q1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX_FMA(x2, h1, q2);
    q2 = _AVX_FMA(y2, h2, q2);
    _AVX_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX_FMA(x3, h1, q3);
    q3 = _AVX_FMA(y3, h2, q3);
    _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    q4 = _AVX_FMA(x4, h1, q4);
    q4 = _AVX_FMA(y4, h2, q4);
    _AVX_STORE(&q[(i*ldq)+3*offset],q4);
    q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
    q5 = _AVX_FMA(x5, h1, q5);
    q5 = _AVX_FMA(y5, h2, q5);
    _AVX_STORE(&q[(i*ldq)+4*offset],q5);
    q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
    q6 = _AVX_FMA(x6, h1, q6);
    q6 = _AVX_FMA(y6, h2, q6);
    _AVX_STORE(&q[(i*ldq)+5*offset],q6);

#else
    q1 = _AVX_LOAD(&q[i*ldq]);
    q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
    _AVX_STORE(&q[i*ldq],q1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
    _AVX_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
    _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    q4 = _AVX_ADD(q4, _AVX_ADD(_AVX_MUL(x4,h1), _AVX_MUL(y4, h2)));
    _AVX_STORE(&q[(i*ldq)+3*offset],q4);
    q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
    q5 = _AVX_ADD(q5, _AVX_ADD(_AVX_MUL(x5,h1), _AVX_MUL(y5, h2)));
    _AVX_STORE(&q[(i*ldq)+4*offset],q5);
    q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
    q6 = _AVX_ADD(q6, _AVX_ADD(_AVX_MUL(x6,h1), _AVX_MUL(y6, h2)));
    _AVX_STORE(&q[(i*ldq)+5*offset],q6);

#endif
  }

  h1 = _AVX_BROADCAST(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[nb*ldq]);
  q1 = _AVX_FMA(x1, h1, q1);
  _AVX_STORE(&q[nb*ldq],q1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX_FMA(x2, h1, q2);
  _AVX_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX_FMA(x3, h1, q3);
  _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  q4 = _AVX_FMA(x4, h1, q4);
  _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
  q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
  q5 = _AVX_FMA(x5, h1, q5);
  _AVX_STORE(&q[(nb*ldq)+4*offset],q5);
  q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
  q6 = _AVX_FMA(x6, h1, q6);
  _AVX_STORE(&q[(nb*ldq)+5*offset],q6);

#else
  q1 = _AVX_LOAD(&q[nb*ldq]);
  q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
  _AVX_STORE(&q[nb*ldq],q1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
  _AVX_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
  _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
  _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
  q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
  q5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
  _AVX_STORE(&q[(nb*ldq)+4*offset],q5);
  q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
  q6 = _AVX_ADD(q6, _AVX_MUL(x6, h1));
  _AVX_STORE(&q[(nb*ldq)+5*offset],q6);

#endif
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 20 rows of Q simultaneously;
#endif
#ifdef SINGLE_PRECISION_REAL
 * 40 rows of Q simultaneously;
#endif
 * a matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_20_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_40_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
  /////////////////////////////////////////////////////
  // Matrix Vector Multiplication, Q [20 x nb+1] * hh
  // hh contains two householder vectors, with offset 1
  /////////////////////////////////////////////////////
  int i;
  // Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
#endif

  __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
  __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
  __AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
  __AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
  __AVX_DATATYPE x5 = _AVX_LOAD(&q[ldq+4*offset]);

  __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
  __AVX_DATATYPE h2;

#ifdef __ELPA_USE_FMA__
  __AVX_DATATYPE q1 = _AVX_LOAD(q);
  __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
  __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
  __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
  __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
  __AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
  __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
  __AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
  __AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
  __AVX_DATATYPE y5 = _AVX_FMA(x5, h1, q5);

#else
  __AVX_DATATYPE q1 = _AVX_LOAD(q);
  __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
  __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
  __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
  __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
  __AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
  __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
  __AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
  __AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
  __AVX_DATATYPE y5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));

#endif

  for(i = 2; i < nb; i++)
  {
    h1 = _AVX_BROADCAST(&hh[i-1]);
    h2 = _AVX_BROADCAST(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_LOAD(&q[i*ldq]);
    x1 = _AVX_FMA(q1, h1, x1);
    y1 = _AVX_FMA(q1, h2, y1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX_FMA(q2, h1, x2);
    y2 = _AVX_FMA(q2, h2, y2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX_FMA(q3, h1, x3);
    y3 = _AVX_FMA(q3, h2, y3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    x4 = _AVX_FMA(q4, h1, x4);
    y4 = _AVX_FMA(q4, h2, y4);
    q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
    x5 = _AVX_FMA(q5, h1, x5);
    y5 = _AVX_FMA(q5, h2, y5);

#else
    q1 = _AVX_LOAD(&q[i*ldq]);
    x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
    y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
    y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
    q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
    x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
    y5 = _AVX_ADD(y5, _AVX_MUL(q5,h2));

#endif
  }

  h1 = _AVX_BROADCAST(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[nb*ldq]);
  x1 = _AVX_FMA(q1, h1, x1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX_FMA(q2, h1, x2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX_FMA(q3, h1, x3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  x4 = _AVX_FMA(q4, h1, x4);
  q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
  x5 = _AVX_FMA(q5, h1, x5);

#else
  q1 = _AVX_LOAD(&q[nb*ldq]);
  x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
  q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
  x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));

#endif

  /////////////////////////////////////////////////////
  // Rank-2 update of Q [20 x nb+1]
  /////////////////////////////////////////////////////

  __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
  __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
  __AVX_DATATYPE vs = _AVX_BROADCAST(&s);

  h1 = _AVX_XOR(tau1, sign);
  x1 = _AVX_MUL(x1, h1);
  x2 = _AVX_MUL(x2, h1);
  x3 = _AVX_MUL(x3, h1);
  x4 = _AVX_MUL(x4, h1);
  x5 = _AVX_MUL(x5, h1);

  h1 = _AVX_XOR(tau2, sign);
  h2 = _AVX_MUL(h1, vs);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
  y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
  y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
  y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
  y5 = _AVX_FMA(y5, h1, _AVX_MUL(x5,h2));

#else
  y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
  y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
  y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
  y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
  y5 = _AVX_ADD(_AVX_MUL(y5,h1), _AVX_MUL(x5,h2));

#endif

  q1 = _AVX_LOAD(q);
  q1 = _AVX_ADD(q1, y1);
  _AVX_STORE(q,q1);
  q2 = _AVX_LOAD(&q[offset]);
  q2 = _AVX_ADD(q2, y2);
  _AVX_STORE(&q[offset],q2);
  q3 = _AVX_LOAD(&q[2*offset]);
  q3 = _AVX_ADD(q3, y3);
  _AVX_STORE(&q[2*offset],q3);
  q4 = _AVX_LOAD(&q[3*offset]);
  q4 = _AVX_ADD(q4, y4);
  _AVX_STORE(&q[3*offset],q4);
  q5 = _AVX_LOAD(&q[4*offset]);
  q5 = _AVX_ADD(q5, y5);
  _AVX_STORE(&q[4*offset],q5);

  h2 = _AVX_BROADCAST(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[ldq]);
  q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
  _AVX_STORE(&q[ldq],q1);
  q2 = _AVX_LOAD(&q[ldq+offset]);
  q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
  _AVX_STORE(&q[ldq+offset],q2);
  q3 = _AVX_LOAD(&q[ldq+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
  _AVX_STORE(&q[ldq+2*offset],q3);
  q4 = _AVX_LOAD(&q[ldq+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_FMA(y4, h2, x4));
  _AVX_STORE(&q[ldq+3*offset],q4);
  q5 = _AVX_LOAD(&q[ldq+4*offset]);
  q5 = _AVX_ADD(q5, _AVX_FMA(y5, h2, x5));
  _AVX_STORE(&q[ldq+4*offset],q5);

#else
  q1 = _AVX_LOAD(&q[ldq]);
  q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
  _AVX_STORE(&q[ldq],q1);
  q2 = _AVX_LOAD(&q[ldq+offset]);
  q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
  _AVX_STORE(&q[ldq+offset],q2);
  q3 = _AVX_LOAD(&q[ldq+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
  _AVX_STORE(&q[ldq+2*offset],q3);
  q4 = _AVX_LOAD(&q[ldq+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_ADD(x4, _AVX_MUL(y4, h2)));
  _AVX_STORE(&q[ldq+3*offset],q4);
  q5 = _AVX_LOAD(&q[ldq+4*offset]);
  q5 = _AVX_ADD(q5, _AVX_ADD(x5, _AVX_MUL(y5, h2)));
  _AVX_STORE(&q[ldq+4*offset],q5);

#endif

  for (i = 2; i < nb; i++)
  {
    h1 = _AVX_BROADCAST(&hh[i-1]);
    h2 = _AVX_BROADCAST(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_LOAD(&q[i*ldq]);
    q1 = _AVX_FMA(x1, h1, q1);
    q1 = _AVX_FMA(y1, h2, q1);
    _AVX_STORE(&q[i*ldq],q1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX_FMA(x2, h1, q2);
    q2 = _AVX_FMA(y2, h2, q2);
    _AVX_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX_FMA(x3, h1, q3);
    q3 = _AVX_FMA(y3, h2, q3);
    _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    q4 = _AVX_FMA(x4, h1, q4);
    q4 = _AVX_FMA(y4, h2, q4);
    _AVX_STORE(&q[(i*ldq)+3*offset],q4);
    q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
    q5 = _AVX_FMA(x5, h1, q5);
    q5 = _AVX_FMA(y5, h2, q5);
    _AVX_STORE(&q[(i*ldq)+4*offset],q5);

#else
    q1 = _AVX_LOAD(&q[i*ldq]);
    q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
    _AVX_STORE(&q[i*ldq],q1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
    _AVX_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
    _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    q4 = _AVX_ADD(q4, _AVX_ADD(_AVX_MUL(x4,h1), _AVX_MUL(y4, h2)));
    _AVX_STORE(&q[(i*ldq)+3*offset],q4);
    q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
    q5 = _AVX_ADD(q5, _AVX_ADD(_AVX_MUL(x5,h1), _AVX_MUL(y5, h2)));
    _AVX_STORE(&q[(i*ldq)+4*offset],q5);

#endif
  }

  h1 = _AVX_BROADCAST(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[nb*ldq]);
  q1 = _AVX_FMA(x1, h1, q1);
  _AVX_STORE(&q[nb*ldq],q1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX_FMA(x2, h1, q2);
  _AVX_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX_FMA(x3, h1, q3);
  _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  q4 = _AVX_FMA(x4, h1, q4);
  _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
  q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
  q5 = _AVX_FMA(x5, h1, q5);
  _AVX_STORE(&q[(nb*ldq)+4*offset],q5);

#else
  q1 = _AVX_LOAD(&q[nb*ldq]);
  q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
  _AVX_STORE(&q[nb*ldq],q1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
  _AVX_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
  _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
  _AVX_STORE(&q[(nb*ldq)+3*offset],q4);
  q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
  q5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
  _AVX_STORE(&q[(nb*ldq)+4*offset],q5);

#endif
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 16 rows of Q simultaneously;
#endif
#ifdef SINGLE_PRECISION_REAL
 * 32 rows of Q simultaneously;
#endif
 * a matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_32_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
  /////////////////////////////////////////////////////
  // Matrix Vector Multiplication, Q [16 x nb+1] * hh
  // hh contains two householder vectors, with offset 1
  /////////////////////////////////////////////////////
  int i;
  // Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
#endif

  __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
  __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
  __AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
  __AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);

  __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
  __AVX_DATATYPE h2;

#ifdef __ELPA_USE_FMA__
  __AVX_DATATYPE q1 = _AVX_LOAD(q);
  __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
  __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
  __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
  __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
  __AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
  __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
  __AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);

#else
  __AVX_DATATYPE q1 = _AVX_LOAD(q);
  __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
  __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
  __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
  __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
  __AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
  __AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
  __AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));

#endif

  for(i = 2; i < nb; i++)
  {
    h1 = _AVX_BROADCAST(&hh[i-1]);
    h2 = _AVX_BROADCAST(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_LOAD(&q[i*ldq]);
    x1 = _AVX_FMA(q1, h1, x1);
    y1 = _AVX_FMA(q1, h2, y1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX_FMA(q2, h1, x2);
    y2 = _AVX_FMA(q2, h2, y2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX_FMA(q3, h1, x3);
    y3 = _AVX_FMA(q3, h2, y3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    x4 = _AVX_FMA(q4, h1, x4);
    y4 = _AVX_FMA(q4, h2, y4);

#else
    q1 = _AVX_LOAD(&q[i*ldq]);
    x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
    y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
    y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));

#endif
  }

  h1 = _AVX_BROADCAST(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[nb*ldq]);
  x1 = _AVX_FMA(q1, h1, x1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX_FMA(q2, h1, x2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX_FMA(q3, h1, x3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  x4 = _AVX_FMA(q4, h1, x4);

#else
  q1 = _AVX_LOAD(&q[nb*ldq]);
  x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));

#endif

  /////////////////////////////////////////////////////
  // Rank-2 update of Q [16 x nb+1]
  /////////////////////////////////////////////////////

  __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
  __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
  __AVX_DATATYPE vs = _AVX_BROADCAST(&s);

  h1 = _AVX_XOR(tau1, sign);
  x1 = _AVX_MUL(x1, h1);
  x2 = _AVX_MUL(x2, h1);
  x3 = _AVX_MUL(x3, h1);
  x4 = _AVX_MUL(x4, h1);
  h1 = _AVX_XOR(tau2, sign);
  h2 = _AVX_MUL(h1, vs);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
  y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
  y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
  y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));

#else
  y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
  y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
  y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
  y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));

#endif

  q1 = _AVX_LOAD(q);
  q1 = _AVX_ADD(q1, y1);
  _AVX_STORE(q,q1);
  q2 = _AVX_LOAD(&q[offset]);
  q2 = _AVX_ADD(q2, y2);
  _AVX_STORE(&q[offset],q2);
  q3 = _AVX_LOAD(&q[2*offset]);
  q3 = _AVX_ADD(q3, y3);
  _AVX_STORE(&q[2*offset],q3);
  q4 = _AVX_LOAD(&q[3*offset]);
  q4 = _AVX_ADD(q4, y4);
  _AVX_STORE(&q[3*offset],q4);

  h2 = _AVX_BROADCAST(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[ldq]);
  q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
  _AVX_STORE(&q[ldq],q1);
  q2 = _AVX_LOAD(&q[ldq+offset]);
  q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
  _AVX_STORE(&q[ldq+offset],q2);
  q3 = _AVX_LOAD(&q[ldq+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
  _AVX_STORE(&q[ldq+2*offset],q3);
  q4 = _AVX_LOAD(&q[ldq+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_FMA(y4, h2, x4));
  _AVX_STORE(&q[ldq+3*offset],q4);

#else
  q1 = _AVX_LOAD(&q[ldq]);
  q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
  _AVX_STORE(&q[ldq],q1);
  q2 = _AVX_LOAD(&q[ldq+offset]);
  q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
  _AVX_STORE(&q[ldq+offset],q2);
  q3 = _AVX_LOAD(&q[ldq+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
  _AVX_STORE(&q[ldq+2*offset],q3);
  q4 = _AVX_LOAD(&q[ldq+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_ADD(x4, _AVX_MUL(y4, h2)));
  _AVX_STORE(&q[ldq+3*offset],q4);

#endif

  for (i = 2; i < nb; i++)
  {
    h1 = _AVX_BROADCAST(&hh[i-1]);
    h2 = _AVX_BROADCAST(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_LOAD(&q[i*ldq]);
    q1 = _AVX_FMA(x1, h1, q1);
    q1 = _AVX_FMA(y1, h2, q1);
    _AVX_STORE(&q[i*ldq],q1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX_FMA(x2, h1, q2);
    q2 = _AVX_FMA(y2, h2, q2);
    _AVX_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX_FMA(x3, h1, q3);
    q3 = _AVX_FMA(y3, h2, q3);
    _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    q4 = _AVX_FMA(x4, h1, q4);
    q4 = _AVX_FMA(y4, h2, q4);
    _AVX_STORE(&q[(i*ldq)+3*offset],q4);

#else
    q1 = _AVX_LOAD(&q[i*ldq]);
    q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
    _AVX_STORE(&q[i*ldq],q1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
    _AVX_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
    _AVX_STORE(&q[(i*ldq)+2*offset],q3);
    q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
    q4 = _AVX_ADD(q4, _AVX_ADD(_AVX_MUL(x4,h1), _AVX_MUL(y4, h2)));
    _AVX_STORE(&q[(i*ldq)+3*offset],q4);

#endif
  }

  h1 = _AVX_BROADCAST(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[nb*ldq]);
  q1 = _AVX_FMA(x1, h1, q1);
  _AVX_STORE(&q[nb*ldq],q1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX_FMA(x2, h1, q2);
  _AVX_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX_FMA(x3, h1, q3);
  _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  q4 = _AVX_FMA(x4, h1, q4);
  _AVX_STORE(&q[(nb*ldq)+3*offset],q4);

#else
  q1 = _AVX_LOAD(&q[nb*ldq]);
  q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
  _AVX_STORE(&q[nb*ldq],q1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
  _AVX_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
  _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
  q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
  q4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
  _AVX_STORE(&q[(nb*ldq)+3*offset],q4);

#endif
}


/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 12 rows of Q simultaneously;
#endif
#ifdef SINGLE_PRECISION_REAL
 * 24 rows of Q simultaneously;
#endif
 * a matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
  /////////////////////////////////////////////////////
  // Matrix Vector Multiplication, Q [12 x nb+1] * hh
  // hh contains two householder vectors, with offset 1
  /////////////////////////////////////////////////////
  int i;
  // Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
#endif

  __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
  __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
  __AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);

  __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
  __AVX_DATATYPE h2;

#ifdef __ELPA_USE_FMA__
  __AVX_DATATYPE q1 = _AVX_LOAD(q);
  __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
  __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
  __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
  __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
  __AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);

#else
  __AVX_DATATYPE q1 = _AVX_LOAD(q);
  __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
  __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
  __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
  __AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
  __AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
#endif

  for(i = 2; i < nb; i++)
  {
    h1 = _AVX_BROADCAST(&hh[i-1]);
    h2 = _AVX_BROADCAST(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_LOAD(&q[i*ldq]);
    x1 = _AVX_FMA(q1, h1, x1);
    y1 = _AVX_FMA(q1, h2, y1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX_FMA(q2, h1, x2);
    y2 = _AVX_FMA(q2, h2, y2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX_FMA(q3, h1, x3);
    y3 = _AVX_FMA(q3, h2, y3);

#else
    q1 = _AVX_LOAD(&q[i*ldq]);
    x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
    y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
    y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
    y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));

#endif
  }

  h1 = _AVX_BROADCAST(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[nb*ldq]);
  x1 = _AVX_FMA(q1, h1, x1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX_FMA(q2, h1, x2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX_FMA(q3, h1, x3);

#else
  q1 = _AVX_LOAD(&q[nb*ldq]);
  x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));

#endif

  /////////////////////////////////////////////////////
  // Rank-2 update of Q [12 x nb+1]
  /////////////////////////////////////////////////////

  __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
  __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
  __AVX_DATATYPE vs = _AVX_BROADCAST(&s);

  h1 = _AVX_XOR(tau1, sign);
  x1 = _AVX_MUL(x1, h1);
  x2 = _AVX_MUL(x2, h1);
  x3 = _AVX_MUL(x3, h1);
  h1 = _AVX_XOR(tau2, sign);
  h2 = _AVX_MUL(h1, vs);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
  y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
  y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));

#else
  y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
  y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
  y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));

#endif

  q1 = _AVX_LOAD(q);
  q1 = _AVX_ADD(q1, y1);
  _AVX_STORE(q,q1);
  q2 = _AVX_LOAD(&q[offset]);
  q2 = _AVX_ADD(q2, y2);
  _AVX_STORE(&q[offset],q2);
  q3 = _AVX_LOAD(&q[2*offset]);
  q3 = _AVX_ADD(q3, y3);
  _AVX_STORE(&q[2*offset],q3);

  h2 = _AVX_BROADCAST(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[ldq]);
  q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
  _AVX_STORE(&q[ldq],q1);
  q2 = _AVX_LOAD(&q[ldq+offset]);
  q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
  _AVX_STORE(&q[ldq+offset],q2);
  q3 = _AVX_LOAD(&q[ldq+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
  _AVX_STORE(&q[ldq+2*offset],q3);

#else
  q1 = _AVX_LOAD(&q[ldq]);
  q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
  _AVX_STORE(&q[ldq],q1);
  q2 = _AVX_LOAD(&q[ldq+offset]);
  q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
  _AVX_STORE(&q[ldq+offset],q2);
  q3 = _AVX_LOAD(&q[ldq+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
  _AVX_STORE(&q[ldq+2*offset],q3);

#endif

  for (i = 2; i < nb; i++)
  {
    h1 = _AVX_BROADCAST(&hh[i-1]);
    h2 = _AVX_BROADCAST(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
    q1 = _AVX_LOAD(&q[i*ldq]);
    q1 = _AVX_FMA(x1, h1, q1);
    q1 = _AVX_FMA(y1, h2, q1);
    _AVX_STORE(&q[i*ldq],q1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX_FMA(x2, h1, q2);
    q2 = _AVX_FMA(y2, h2, q2);
    _AVX_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX_FMA(x3, h1, q3);
    q3 = _AVX_FMA(y3, h2, q3);
    _AVX_STORE(&q[(i*ldq)+2*offset],q3);

#else
    q1 = _AVX_LOAD(&q[i*ldq]);
    q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
    _AVX_STORE(&q[i*ldq],q1);
    q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
    q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
    _AVX_STORE(&q[(i*ldq)+offset],q2);
    q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
    q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
    _AVX_STORE(&q[(i*ldq)+2*offset],q3);

#endif
  }

  h1 = _AVX_BROADCAST(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
  q1 = _AVX_LOAD(&q[nb*ldq]);
  q1 = _AVX_FMA(x1, h1, q1);
  _AVX_STORE(&q[nb*ldq],q1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX_FMA(x2, h1, q2);
  _AVX_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX_FMA(x3, h1, q3);
  _AVX_STORE(&q[(nb*ldq)+2*offset],q3);

#else
  q1 = _AVX_LOAD(&q[nb*ldq]);
  q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
  _AVX_STORE(&q[nb*ldq],q1);
  q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
  q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
  _AVX_STORE(&q[(nb*ldq)+offset],q2);
  q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
  q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
  _AVX_STORE(&q[(nb*ldq)+2*offset],q3);

#endif
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 8 rows of Q simultaneously;
#endif
#ifdef SINGLE_PRECISION_REAL
 * 16 rows of Q simultaneously;
#endif
 * a matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
1468 : #ifdef DOUBLE_PRECISION_REAL
1469 : __forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
1470 : #endif
1471 : #ifdef SINGLE_PRECISION_REAL
1472 : __forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
1473 : #endif
1474 : {
1475 : /////////////////////////////////////////////////////
1476 : // Matrix-vector multiplication, Q [8 x nb+1] * hh
1477 : // hh contains two Householder vectors, applied with an offset of one row
1478 : /////////////////////////////////////////////////////
1479 : int i;
1480 : // Bit mask needed for the floating-point sign flip
1481 : #ifdef DOUBLE_PRECISION_REAL
1482 0 : __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
1483 : #endif
1484 : #ifdef SINGLE_PRECISION_REAL
1485 0 : __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
1486 : #endif
1487 :
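// Both Householder vectors carry an implicit leading 1, so x starts from
// q[ldq] and y from q[0] + x*hh[ldh+1] rather than from explicit hh entries.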
1488 0 : __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
1489 0 : __AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
1490 0 : __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
1491 : __AVX_DATATYPE h2;
1492 :
1493 : #ifdef __ELPA_USE_FMA__
1494 0 : __AVX_DATATYPE q1 = _AVX_LOAD(q);
1495 0 : __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
1496 0 : __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
1497 0 : __AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
1498 :
1499 : #else
1500 : __AVX_DATATYPE q1 = _AVX_LOAD(q);
1501 : __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
1502 : __AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
1503 : __AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
1504 :
1505 : #endif
1506 :
1507 0 : for(i = 2; i < nb; i++)
1508 : {
1509 0 : h1 = _AVX_BROADCAST(&hh[i-1]);
1510 0 : h2 = _AVX_BROADCAST(&hh[ldh+i]);
1511 : #ifdef __ELPA_USE_FMA__
1512 0 : q1 = _AVX_LOAD(&q[i*ldq]);
1513 0 : x1 = _AVX_FMA(q1, h1, x1);
1514 0 : y1 = _AVX_FMA(q1, h2, y1);
1515 0 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
1516 0 : x2 = _AVX_FMA(q2, h1, x2);
1517 0 : y2 = _AVX_FMA(q2, h2, y2);
1518 :
1519 : #else
1520 : q1 = _AVX_LOAD(&q[i*ldq]);
1521 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1522 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1523 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
1524 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
1525 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
1526 :
1527 : #endif
1528 : }
1529 :
1530 0 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1531 : #ifdef __ELPA_USE_FMA__
1532 0 : q1 = _AVX_LOAD(&q[nb*ldq]);
1533 0 : x1 = _AVX_FMA(q1, h1, x1);
1534 0 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
1535 0 : x2 = _AVX_FMA(q2, h1, x2);
1536 :
1537 : #else
1538 : q1 = _AVX_LOAD(&q[nb*ldq]);
1539 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1540 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
1541 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
1542 :
1543 : #endif
1544 :
1545 : /////////////////////////////////////////////////////
1546 : // Rank-2 update of Q [8 x nb+1]
1547 : /////////////////////////////////////////////////////
1548 :
1549 0 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
1550 0 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
1551 0 : __AVX_DATATYPE vs = _AVX_BROADCAST(&s);
1552 :
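// Scale: x <- -tau1*x, then y <- -tau2*(y + s*x) using the scaled x; the
// XOR with the sign mask negates tau1/tau2 by flipping the IEEE-754 sign bit.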
1553 0 : h1 = _AVX_XOR(tau1, sign);
1554 0 : x1 = _AVX_MUL(x1, h1);
1555 0 : x2 = _AVX_MUL(x2, h1);
1556 0 : h1 = _AVX_XOR(tau2, sign);
1557 0 : h2 = _AVX_MUL(h1, vs);
1558 : #ifdef __ELPA_USE_FMA__
1559 0 : y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
1560 0 : y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
1561 :
1562 : #else
1563 : y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
1564 : y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
1565 :
1566 : #endif
1567 :
1568 0 : q1 = _AVX_LOAD(q);
1569 0 : q1 = _AVX_ADD(q1, y1);
1570 : _AVX_STORE(q,q1);
1571 0 : q2 = _AVX_LOAD(&q[offset]);
1572 0 : q2 = _AVX_ADD(q2, y2);
1573 0 : _AVX_STORE(&q[offset],q2);
1574 :
1575 0 : h2 = _AVX_BROADCAST(&hh[ldh+1]);
1576 : #ifdef __ELPA_USE_FMA__
1577 0 : q1 = _AVX_LOAD(&q[ldq]);
1578 0 : q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
1579 0 : _AVX_STORE(&q[ldq],q1);
1580 0 : q2 = _AVX_LOAD(&q[ldq+offset]);
1581 0 : q2 = _AVX_ADD(q2, _AVX_FMA(y2, h2, x2));
1582 0 : _AVX_STORE(&q[ldq+offset],q2);
1583 :
1584 : #else
1585 : q1 = _AVX_LOAD(&q[ldq]);
1586 : q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
1587 : _AVX_STORE(&q[ldq],q1);
1588 : q2 = _AVX_LOAD(&q[ldq+offset]);
1589 : q2 = _AVX_ADD(q2, _AVX_ADD(x2, _AVX_MUL(y2, h2)));
1590 : _AVX_STORE(&q[ldq+offset],q2);
1591 :
1592 : #endif
1593 :
1594 0 : for (i = 2; i < nb; i++)
1595 : {
1596 0 : h1 = _AVX_BROADCAST(&hh[i-1]);
1597 0 : h2 = _AVX_BROADCAST(&hh[ldh+i]);
1598 : #ifdef __ELPA_USE_FMA__
1599 0 : q1 = _AVX_LOAD(&q[i*ldq]);
1600 0 : q1 = _AVX_FMA(x1, h1, q1);
1601 0 : q1 = _AVX_FMA(y1, h2, q1);
1602 0 : _AVX_STORE(&q[i*ldq],q1);
1603 0 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
1604 0 : q2 = _AVX_FMA(x2, h1, q2);
1605 0 : q2 = _AVX_FMA(y2, h2, q2);
1606 0 : _AVX_STORE(&q[(i*ldq)+offset],q2);
1607 :
1608 : #else
1609 : q1 = _AVX_LOAD(&q[i*ldq]);
1610 : q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
1611 : _AVX_STORE(&q[i*ldq],q1);
1612 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
1613 : q2 = _AVX_ADD(q2, _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2)));
1614 : _AVX_STORE(&q[(i*ldq)+offset],q2);
1615 :
1616 : #endif
1617 : }
1618 :
1619 0 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1620 : #ifdef __ELPA_USE_FMA__
1621 0 : q1 = _AVX_LOAD(&q[nb*ldq]);
1622 0 : q1 = _AVX_FMA(x1, h1, q1);
1623 0 : _AVX_STORE(&q[nb*ldq],q1);
1624 0 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
1625 0 : q2 = _AVX_FMA(x2, h1, q2);
1626 0 : _AVX_STORE(&q[(nb*ldq)+offset],q2);
1627 :
1628 : #else
1629 : q1 = _AVX_LOAD(&q[nb*ldq]);
1630 : q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
1631 : _AVX_STORE(&q[nb*ldq],q1);
1632 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
1633 : q2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
1634 : _AVX_STORE(&q[(nb*ldq)+offset],q2);
1635 :
1636 : #endif
1637 : }
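
/*
 * Side note with illustrative helpers (hypothetical names, not part of the
 * original kernel set): the __ELPA_USE_FMA__ and fallback branches above
 * compute the same update q <- q + x*h; the fused form rounds once, while
 * the mul/add form rounds the intermediate product first, so results can
 * differ in the last bit.
 */
#ifdef __ELPA_USE_FMA__
__forceinline __AVX_DATATYPE q_update_fused(__AVX_DATATYPE q, __AVX_DATATYPE x, __AVX_DATATYPE h)
{
	return _AVX_FMA(x, h, q);           // q + x*h with a single rounding
}
#endif
__forceinline __AVX_DATATYPE q_update_split(__AVX_DATATYPE q, __AVX_DATATYPE x, __AVX_DATATYPE h)
{
	return _AVX_ADD(q, _AVX_MUL(x, h)); // q + x*h, product rounded first
}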
1638 :
1639 : /**
1640 : * Unrolled kernel that computes
1641 : #ifdef DOUBLE_PRECISION_REAL
1642 : * 4 rows of Q simultaneously;
1643 : #endif
1644 : #ifdef SINGLE_PRECISION_REAL
1645 : * 8 rows of Q simultaneously;
1646 : #endif
1647 : * a matrix-vector product with two Householder
1648 : * vectors plus a rank-2 update is performed.
1649 : */
1650 : #ifdef DOUBLE_PRECISION_REAL
1651 : __forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
1652 : #endif
1653 : #ifdef SINGLE_PRECISION_REAL
1654 : __forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
1655 : #endif
1656 : {
1657 : /////////////////////////////////////////////////////
1658 : // Matrix-vector multiplication, Q [4 x nb+1] * hh
1659 : // hh contains two Householder vectors, applied with an offset of one row
1660 : /////////////////////////////////////////////////////
1661 : int i;
1662 : // Bit mask needed for the floating-point sign flip
1663 : #ifdef DOUBLE_PRECISION_REAL
1664 0 : __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
1665 : #endif
1666 : #ifdef SINGLE_PRECISION_REAL
1667 0 : __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set1_epi32(0x80000000);
1668 : #endif
1669 :
1670 0 : __AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
1671 0 : __AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
1672 : __AVX_DATATYPE h2;
1673 :
1674 : #ifdef __ELPA_USE_FMA__
1676 0 : __AVX_DATATYPE q1 = _AVX_LOAD(q);
1677 0 : __AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
1678 : #else
1680 : __AVX_DATATYPE q1 = _AVX_LOAD(q);
1681 : __AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
1682 : #endif
1683 :
1684 0 : for(i = 2; i < nb; i++)
1685 : {
1686 0 : h1 = _AVX_BROADCAST(&hh[i-1]);
1687 0 : h2 = _AVX_BROADCAST(&hh[ldh+i]);
1688 : #ifdef __ELPA_USE_FMA__
1690 0 : q1 = _AVX_LOAD(&q[i*ldq]);
1691 0 : x1 = _AVX_FMA(q1, h1, x1);
1692 0 : y1 = _AVX_FMA(q1, h2, y1);
1693 : #else
1695 : q1 = _AVX_LOAD(&q[i*ldq]);
1697 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1698 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1699 : #endif
1700 : }
1701 :
1702 0 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1703 : #ifdef __ELPA_USE_FMA__
1705 0 : q1 = _AVX_LOAD(&q[nb*ldq]);
1706 0 : x1 = _AVX_FMA(q1, h1, x1);
1707 : #else
1709 : q1 = _AVX_LOAD(&q[nb*ldq]);
1710 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1711 : #endif
1712 :
1713 : /////////////////////////////////////////////////////
1714 : // Rank-2 update of Q [4 x nb+1]
1715 : /////////////////////////////////////////////////////
1716 :
1717 0 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
1718 0 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
1719 0 : __AVX_DATATYPE vs = _AVX_BROADCAST(&s);
1720 :
1721 0 : h1 = _AVX_XOR(tau1, sign);
1722 0 : x1 = _AVX_MUL(x1, h1);
1723 0 : h1 = _AVX_XOR(tau2, sign);
1724 0 : h2 = _AVX_MUL(h1, vs);
1725 : #ifdef __ELPA_USE_FMA__
1726 0 : y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
1727 : #else
1728 : y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
1729 : #endif
1730 :
1731 0 : q1 = _AVX_LOAD(q);
1732 0 : q1 = _AVX_ADD(q1, y1);
1733 : _AVX_STORE(q,q1);
1734 0 : h2 = _AVX_BROADCAST(&hh[ldh+1]);
1735 : #ifdef __ELPA_USE_FMA__
1737 0 : q1 = _AVX_LOAD(&q[ldq]);
1738 0 : q1 = _AVX_ADD(q1, _AVX_FMA(y1, h2, x1));
1740 0 : _AVX_STORE(&q[ldq],q1);
1741 :
1742 : #else
1744 : q1 = _AVX_LOAD(&q[ldq]);
1745 : q1 = _AVX_ADD(q1, _AVX_ADD(x1, _AVX_MUL(y1, h2)));
1746 : _AVX_STORE(&q[ldq],q1);
1747 :
1748 : #endif
1749 :
1750 0 : for (i = 2; i < nb; i++)
1751 : {
1752 0 : h1 = _AVX_BROADCAST(&hh[i-1]);
1753 0 : h2 = _AVX_BROADCAST(&hh[ldh+i]);
1754 : #ifdef __ELPA_USE_FMA__
1756 0 : q1 = _AVX_LOAD(&q[i*ldq]);
1757 0 : q1 = _AVX_FMA(x1, h1, q1);
1758 0 : q1 = _AVX_FMA(y1, h2, q1);
1759 0 : _AVX_STORE(&q[i*ldq],q1);
1760 :
1761 : #else
1762 : q1 = _AVX_LOAD(&q[i*ldq]);
1764 : q1 = _AVX_ADD(q1, _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2)));
1765 : _AVX_STORE(&q[i*ldq],q1);
1766 :
1767 : #endif
1768 : }
1769 :
1770 0 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1771 : #ifdef __ELPA_USE_FMA__
1773 0 : q1 = _AVX_LOAD(&q[nb*ldq]);
1774 0 : q1 = _AVX_FMA(x1, h1, q1);
1775 0 : _AVX_STORE(&q[nb*ldq],q1);
1776 :
1777 : #else
1779 : q1 = _AVX_LOAD(&q[nb*ldq]);
1780 : q1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
1781 : _AVX_STORE(&q[nb*ldq],q1);
1782 : #endif
1783 : }
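
/*
 * Hypothetical usage sketch (this caller is not part of the original file):
 * q must provide nb+1 blocks of ldq elements each, hh the two Householder
 * vectors at hh[0..nb-1] and hh[ldh..ldh+nb-1], and s the coupling scalar
 * of the two vectors, precomputed by the caller.
 */
#ifdef DOUBLE_PRECISION_REAL
static void example_apply_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	// Transform 4 rows of Q with both Householder vectors in one call
	hh_trafo_kernel_4_AVX_2hv_double(q, hh, nb, ldq, ldh, s);
}
#endif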