Line data Source code
1 : // This file is part of ELPA.
2 : //
3 : // The ELPA library was originally created by the ELPA consortium,
4 : // consisting of the following organizations:
5 : //
6 : // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : // Informatik,
10 : // - Technische Universität München, Lehrstuhl für Informatik mit
11 : // Schwerpunkt Wissenschaftliches Rechnen ,
12 : // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : // - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 : //	 Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : // and
16 : // - IBM Deutschland GmbH
17 : //
18 : // This particular source code file contains additions, changes and
19 : // enhancements authored by Intel Corporation which is not part of
20 : // the ELPA consortium.
21 : //
22 : // More information can be found here:
23 : // http://elpa.mpcdf.mpg.de/
24 : //
25 : // ELPA is free software: you can redistribute it and/or modify
26 : // it under the terms of the version 3 of the license of the
27 : // GNU Lesser General Public License as published by the Free
28 : // Software Foundation.
29 : //
30 : // ELPA is distributed in the hope that it will be useful,
31 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : // GNU Lesser General Public License for more details.
34 : //
35 : // You should have received a copy of the GNU Lesser General Public License
36 : // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : //
38 : // ELPA reflects a substantial effort on the part of the original
39 : // ELPA consortium, and we ask you to respect the spirit of the
40 : // license that we chose: i.e., please contribute any changes you
41 : // may have back to the original ELPA library distribution, and keep
42 : // any derivatives of ELPA under the same license that we chose for
43 : // the original distribution, the GNU Lesser General Public License.
44 : //
45 : //
46 : // --------------------------------------------------------------------------------------------------
47 : //
48 : // This file contains the compute intensive kernels for the Householder transformations.
49 : // It should be compiled with the highest possible optimization level.
50 : //
51 : // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
52 : // On Intel Sandy Bridge use -O3 -mavx
53 : //
54 : // Copyright of the original code rests with the authors inside the ELPA
55 : // consortium. The copyright of any additional modifications shall rest
56 : // with their original authors, but shall adhere to the licensing terms
57 : // distributed along with the original code in the file "COPYING".
58 : //
59 : // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
60 : // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
61 : // --------------------------------------------------------------------------------------------------
62 : #include "config-f90.h"
63 :
64 : #include <x86intrin.h>
65 : #include <stdio.h>
66 : #include <stdlib.h>
67 :
68 : #define __forceinline __attribute__((always_inline)) static
69 :
70 : #ifdef DOUBLE_PRECISION_REAL
71 : #define offset 4
72 : #define __AVX_DATATYPE __m256d
73 : #define _AVX_LOAD _mm256_load_pd
74 : #define _AVX_STORE _mm256_store_pd
75 : #define _AVX_ADD _mm256_add_pd
76 : #define _AVX_SUB _mm256_sub_pd
77 : #define _AVX_MUL _mm256_mul_pd
78 : #define _AVX_BROADCAST _mm256_broadcast_sd
79 :
80 : #ifdef HAVE_AVX2
81 :
82 : #ifdef __FMA4__
83 : #define __ELPA_USE_FMA__
84 : #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
85 : #define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c)
86 : #error "This should be prop _mm256_msub_pd instead of _mm256_msub"
87 : #define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c)
88 : #endif
89 :
90 : #ifdef __AVX2__
91 : #define __ELPA_USE_FMA__
92 : #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
93 : #define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c)
94 : #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
95 : #endif
96 :
97 : #endif
98 :
99 : #define _AVX_FMA _mm256_FMA_pd
100 : #define _AVX_NFMA _mm256_NFMA_pd
101 : #define _AVX_FMSUB _mm256_FMSUB_pd
102 : #endif /* DOUBLE_PRECISION */
103 :
/* Single-precision mapping of the generic _AVX_* / _mm256_*_ps operation names.
 * "offset" is the number of floats per 256-bit AVX register (8). */
#ifdef SINGLE_PRECISION_REAL
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_SUB _mm256_sub_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_BROADCAST _mm256_broadcast_ss

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
/* Fixed: the FMA4 fused multiply-subtract intrinsic is _mm256_msub_ps;
 * the previous definition expanded to the non-existent _mm256_msub and
 * was guarded by an #error marker, which is now obsolete. */
#define _mm256_FMSUB_ps(a,b,c) _mm256_msub_ps(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
#endif

#endif
#define _AVX_FMA _mm256_FMA_ps
#define _AVX_NFMA _mm256_NFMA_ps
#define _AVX_FMSUB _mm256_FMSUB_ps
#endif /* SINGLE_PRECISION */
136 :
137 : #ifdef DOUBLE_PRECISION_REAL
138 : //Forward declaration
139 : __forceinline void hh_trafo_kernel_4_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
140 : __forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
141 : __forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
142 :
143 : void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
144 : #endif
145 : #ifdef SINGLE_PRECISION_REAL
146 : //Forward declaration
147 : __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
148 : __forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
149 : __forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
150 :
151 : void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
152 : #endif
153 :
154 : #ifdef DOUBLE_PRECISION_REAL
155 : /*
156 : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
157 : !f> interface
158 : !f> subroutine quad_hh_trafo_real_avx_avx2_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
159 : !f> bind(C, name="quad_hh_trafo_real_avx_avx2_4hv_double")
160 : !f> use, intrinsic :: iso_c_binding
161 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
162 : !f> type(c_ptr), value :: q
163 : !f> real(kind=c_double) :: hh(pnb,6)
164 : !f> end subroutine
165 : !f> end interface
166 : !f>#endif
167 : */
168 : #endif
169 : #ifdef SINGLE_PRECISION_REAL
170 : /*
171 : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
172 : !f> interface
173 : !f> subroutine quad_hh_trafo_real_avx_avx2_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
174 : !f> bind(C, name="quad_hh_trafo_real_avx_avx2_4hv_single")
175 : !f> use, intrinsic :: iso_c_binding
176 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
177 : !f> type(c_ptr), value :: q
178 : !f> real(kind=c_float) :: hh(pnb,6)
179 : !f> end subroutine
180 : !f> end interface
181 : !f>#endif
182 : */
183 : #endif
184 :
185 :
186 : #ifdef DOUBLE_PRECISION_REAL
187 536576 : void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
188 : #endif
189 : #ifdef SINGLE_PRECISION_REAL
190 100608 : void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
191 : #endif
192 : {
193 : int i;
194 637184 : int nb = *pnb;
195 637184 : int nq = *pldq;
196 637184 : int ldq = *pldq;
197 637184 : int ldh = *pldh;
198 : int worked_on;
199 :
200 637184 : worked_on = 0;
201 :
202 : // calculating scalar products to compute
203 : // 4 householder vectors simultaneously
204 : #ifdef DOUBLE_PRECISION_REAL
205 536576 : double s_1_2 = hh[(ldh)+1];
206 536576 : double s_1_3 = hh[(ldh*2)+2];
207 536576 : double s_2_3 = hh[(ldh*2)+1];
208 536576 : double s_1_4 = hh[(ldh*3)+3];
209 536576 : double s_2_4 = hh[(ldh*3)+2];
210 536576 : double s_3_4 = hh[(ldh*3)+1];
211 : #endif
212 : #ifdef SINGLE_PRECISION_REAL
213 100608 : float s_1_2 = hh[(ldh)+1];
214 100608 : float s_1_3 = hh[(ldh*2)+2];
215 100608 : float s_2_3 = hh[(ldh*2)+1];
216 100608 : float s_1_4 = hh[(ldh*3)+3];
217 100608 : float s_2_4 = hh[(ldh*3)+2];
218 100608 : float s_3_4 = hh[(ldh*3)+1];
219 : #endif
220 :
221 : // calculate scalar product of first and fourth householder Vector
222 : // loop counter = 2
223 637184 : s_1_2 += hh[2-1] * hh[(2+ldh)];
224 637184 : s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)];
225 637184 : s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)];
226 :
227 : // loop counter = 3
228 637184 : s_1_2 += hh[3-1] * hh[(3+ldh)];
229 637184 : s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)];
230 637184 : s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)];
231 :
232 637184 : s_1_3 += hh[3-2] * hh[3+(ldh*2)];
233 637184 : s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)];
234 :
235 : #pragma ivdep
236 38868224 : for (i = 4; i < nb; i++)
237 : {
238 38231040 : s_1_2 += hh[i-1] * hh[(i+ldh)];
239 38231040 : s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)];
240 38231040 : s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
241 :
242 38231040 : s_1_3 += hh[i-2] * hh[i+(ldh*2)];
243 38231040 : s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
244 :
245 38231040 : s_1_4 += hh[i-3] * hh[i+(ldh*3)];
246 : }
247 :
248 : // Production level kernel calls with padding
249 : #ifdef DOUBLE_PRECISION_REAL
250 2146304 : for (i = 0; i < nq-8; i+=12)
251 : {
252 1609728 : hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
253 1609728 : worked_on += 12;
254 : }
255 : #endif
256 : #ifdef SINGLE_PRECISION_REAL
257 402432 : for (i = 0; i < nq-16; i+=24)
258 : {
259 301824 : hh_trafo_kernel_24_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
260 301824 : worked_on += 24;
261 : }
262 : #endif
263 637184 : if (nq == i)
264 : {
265 0 : return;
266 : }
267 :
268 : #ifdef DOUBLE_PRECISION_REAL
269 536576 : if (nq-i == 8)
270 : {
271 0 : hh_trafo_kernel_8_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
272 0 : worked_on += 8;
273 : }
274 : #endif
275 :
276 : #ifdef SINGLE_PRECISION_REAL
277 100608 : if (nq-i == 16)
278 : {
279 0 : hh_trafo_kernel_16_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
280 0 : worked_on += 16;
281 : }
282 : #endif
283 :
284 : #ifdef DOUBLE_PRECISION_REAL
285 536576 : if (nq-i == 4)
286 : {
287 536576 : hh_trafo_kernel_4_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
288 536576 : worked_on += 4;
289 : }
290 : #endif
291 :
292 : #ifdef SINGLE_PRECISION_REAL
293 100608 : if (nq-i == 8)
294 : {
295 100608 : hh_trafo_kernel_8_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
296 100608 : worked_on += 8;
297 : }
298 : #endif
299 :
300 : #ifdef WITH_DEBUG
301 : if (worked_on != nq)
302 : {
303 : printf("Error in real AVX/AVX2 BLOCK4 kernel \n");
304 : abort();
305 : }
306 : #endif
307 : }
308 : /**
309 : * Unrolled kernel that computes
310 : #ifdef DOUBLE_PRECISION_REAL
311 : * 12 rows of Q simultaneously, a
312 : #endif
313 : #ifdef SINGLE_PRECISION_REAL
314 : * 24 rows of Q simultaneously, a
315 : #endif
316 : * matrix Vector product with two householder
317 : * vectors + a rank 1 update is performed
318 : */
319 : #ifdef DOUBLE_PRECISION_REAL
320 : __forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
321 : #endif
322 : #ifdef SINGLE_PRECISION_REAL
323 : __forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
324 : #endif
325 : {
326 : /////////////////////////////////////////////////////
327 : // Matrix Vector Multiplication, Q [12 x nb+3] * hh
328 : // hh contains four householder vectors
329 : /////////////////////////////////////////////////////
330 : int i;
331 :
332 3823104 : __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*3]);
333 3823104 : __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*2]);
334 3823104 : __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq]);
335 1911552 : __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[0]);
336 :
337 3823104 : __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
338 3823104 : __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
339 3823104 : __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
340 3823104 : __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
341 3823104 : __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
342 3823104 : __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
343 :
344 : #ifdef __ELPA_USE_FMA__
345 1911552 : register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
346 1911552 : w1 = _AVX_FMA(a2_1, h_4_2, w1);
347 1911552 : w1 = _AVX_FMA(a1_1, h_4_1, w1);
348 1911552 : register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
349 1911552 : z1 = _AVX_FMA(a1_1, h_3_1, z1);
350 1911552 : register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
351 1911552 : register __AVX_DATATYPE x1 = a1_1;
352 : #else
353 : register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
354 : w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
355 : w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
356 : register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
357 : z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
358 : register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
359 : register __AVX_DATATYPE x1 = a1_1;
360 : #endif
361 :
362 3823104 : __AVX_DATATYPE a1_2 = _AVX_LOAD(&q[(ldq*3)+offset]);
363 3823104 : __AVX_DATATYPE a2_2 = _AVX_LOAD(&q[(ldq*2)+offset]);
364 3823104 : __AVX_DATATYPE a3_2 = _AVX_LOAD(&q[ldq+offset]);
365 3823104 : __AVX_DATATYPE a4_2 = _AVX_LOAD(&q[0+offset]);
366 :
367 : #ifdef __ELPA_USE_FMA__
368 1911552 : register __AVX_DATATYPE w2 = _AVX_FMA(a3_2, h_4_3, a4_2);
369 1911552 : w2 = _AVX_FMA(a2_2, h_4_2, w2);
370 1911552 : w2 = _AVX_FMA(a1_2, h_4_1, w2);
371 1911552 : register __AVX_DATATYPE z2 = _AVX_FMA(a2_2, h_3_2, a3_2);
372 1911552 : z2 = _AVX_FMA(a1_2, h_3_1, z2);
373 1911552 : register __AVX_DATATYPE y2 = _AVX_FMA(a1_2, h_2_1, a2_2);
374 1911552 : register __AVX_DATATYPE x2 = a1_2;
375 : #else
376 : register __AVX_DATATYPE w2 = _AVX_ADD(a4_2, _AVX_MUL(a3_2, h_4_3));
377 : w2 = _AVX_ADD(w2, _AVX_MUL(a2_2, h_4_2));
378 : w2 = _AVX_ADD(w2, _AVX_MUL(a1_2, h_4_1));
379 : register __AVX_DATATYPE z2 = _AVX_ADD(a3_2, _AVX_MUL(a2_2, h_3_2));
380 : z2 = _AVX_ADD(z2, _AVX_MUL(a1_2, h_3_1));
381 : register __AVX_DATATYPE y2 = _AVX_ADD(a2_2, _AVX_MUL(a1_2, h_2_1));
382 : register __AVX_DATATYPE x2 = a1_2;
383 : #endif
384 :
385 3823104 : __AVX_DATATYPE a1_3 = _AVX_LOAD(&q[(ldq*3)+2*offset]);
386 3823104 : __AVX_DATATYPE a2_3 = _AVX_LOAD(&q[(ldq*2)+2*offset]);
387 3823104 : __AVX_DATATYPE a3_3 = _AVX_LOAD(&q[ldq+2*offset]);
388 3823104 : __AVX_DATATYPE a4_3 = _AVX_LOAD(&q[0+2*offset]);
389 :
390 : #ifdef __ELPA_USE_FMA__
391 1911552 : register __AVX_DATATYPE w3 = _AVX_FMA(a3_3, h_4_3, a4_3);
392 1911552 : w3 = _AVX_FMA(a2_3, h_4_2, w3);
393 1911552 : w3 = _AVX_FMA(a1_3, h_4_1, w3);
394 1911552 : register __AVX_DATATYPE z3 = _AVX_FMA(a2_3, h_3_2, a3_3);
395 1911552 : z3 = _AVX_FMA(a1_3, h_3_1, z3);
396 1911552 : register __AVX_DATATYPE y3 = _AVX_FMA(a1_3, h_2_1, a2_3);
397 1911552 : register __AVX_DATATYPE x3 = a1_3;
398 : #else
399 : register __AVX_DATATYPE w3 = _AVX_ADD(a4_3, _AVX_MUL(a3_3, h_4_3));
400 : w3 = _AVX_ADD(w3, _AVX_MUL(a2_3, h_4_2));
401 : w3 = _AVX_ADD(w3, _AVX_MUL(a1_3, h_4_1));
402 : register __AVX_DATATYPE z3 = _AVX_ADD(a3_3, _AVX_MUL(a2_3, h_3_2));
403 : z3 = _AVX_ADD(z3, _AVX_MUL(a1_3, h_3_1));
404 : register __AVX_DATATYPE y3 = _AVX_ADD(a2_3, _AVX_MUL(a1_3, h_2_1));
405 : register __AVX_DATATYPE x3 = a1_3;
406 : #endif
407 :
408 : __AVX_DATATYPE q1;
409 : __AVX_DATATYPE q2;
410 : __AVX_DATATYPE q3;
411 :
412 : __AVX_DATATYPE h1;
413 : __AVX_DATATYPE h2;
414 : __AVX_DATATYPE h3;
415 : __AVX_DATATYPE h4;
416 :
417 116604672 : for(i = 4; i < nb; i++)
418 : {
419 229386240 : h1 = _AVX_BROADCAST(&hh[i-3]);
420 229386240 : q1 = _AVX_LOAD(&q[i*ldq]);
421 229386240 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
422 229386240 : q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
423 : #ifdef __ELPA_USE_FMA__
424 114693120 : x1 = _AVX_FMA(q1, h1, x1);
425 114693120 : x2 = _AVX_FMA(q2, h1, x2);
426 114693120 : x3 = _AVX_FMA(q3, h1, x3);
427 : #else
428 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
429 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
430 : x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
431 : #endif
432 :
433 229386240 : h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
434 : #ifdef __ELPA_USE_FMA__
435 114693120 : y1 = _AVX_FMA(q1, h2, y1);
436 114693120 : y2 = _AVX_FMA(q2, h2, y2);
437 114693120 : y3 = _AVX_FMA(q3, h2, y3);
438 : #else
439 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
440 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
441 : y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
442 : #endif
443 :
444 229386240 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
445 : #ifdef __ELPA_USE_FMA__
446 114693120 : z1 = _AVX_FMA(q1, h3, z1);
447 114693120 : z2 = _AVX_FMA(q2, h3, z2);
448 114693120 : z3 = _AVX_FMA(q3, h3, z3);
449 : #else
450 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
451 : z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
452 : z3 = _AVX_ADD(z3, _AVX_MUL(q3,h3));
453 : #endif
454 :
455 229386240 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
456 : #ifdef __ELPA_USE_FMA__
457 114693120 : w1 = _AVX_FMA(q1, h4, w1);
458 114693120 : w2 = _AVX_FMA(q2, h4, w2);
459 114693120 : w3 = _AVX_FMA(q3, h4, w3);
460 : #else
461 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
462 : w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
463 : w3 = _AVX_ADD(w3, _AVX_MUL(q3,h4));
464 : #endif
465 : }
466 :
467 3823104 : h1 = _AVX_BROADCAST(&hh[nb-3]);
468 :
469 3823104 : q1 = _AVX_LOAD(&q[nb*ldq]);
470 3823104 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
471 3823104 : q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
472 :
473 : #ifdef __ELPA_USE_FMA__
474 1911552 : x1 = _AVX_FMA(q1, h1, x1);
475 1911552 : x2 = _AVX_FMA(q2, h1, x2);
476 1911552 : x3 = _AVX_FMA(q3, h1, x3);
477 : #else
478 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
479 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
480 : x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
481 : #endif
482 :
483 3823104 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
484 : #ifdef __FMA4_
485 : y1 = _AVX_FMA(q1, h2, y1);
486 : y2 = _AVX_FMA(q2, h2, y2);
487 : y3 = _AVX_FMA(q3, h2, y3);
488 : #else
489 3823104 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
490 3823104 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
491 3823104 : y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
492 : #endif
493 :
494 3823104 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
495 : #ifdef __ELPA_USE_FMA__
496 1911552 : z1 = _AVX_FMA(q1, h3, z1);
497 1911552 : z2 = _AVX_FMA(q2, h3, z2);
498 1911552 : z3 = _AVX_FMA(q3, h3, z3);
499 : #else
500 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
501 : z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
502 : z3 = _AVX_ADD(z3, _AVX_MUL(q3,h3));
503 : #endif
504 :
505 3823104 : h1 = _AVX_BROADCAST(&hh[nb-2]);
506 :
507 3823104 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
508 3823104 : q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
509 3823104 : q3 = _AVX_LOAD(&q[((nb+1)*ldq)+2*offset]);
510 :
511 : #ifdef __ELPA_USE_FMA__
512 1911552 : x1 = _AVX_FMA(q1, h1, x1);
513 1911552 : x2 = _AVX_FMA(q2, h1, x2);
514 1911552 : x3 = _AVX_FMA(q3, h1, x3);
515 : #else
516 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
517 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
518 : x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
519 : #endif
520 :
521 3823104 : h2 = _AVX_BROADCAST(&hh[(ldh*1)+nb-1]);
522 :
523 : #ifdef __ELPA_USE_FMA__
524 1911552 : y1 = _AVX_FMA(q1, h2, y1);
525 1911552 : y2 = _AVX_FMA(q2, h2, y2);
526 1911552 : y3 = _AVX_FMA(q3, h2, y3);
527 : #else
528 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
529 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
530 : y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
531 : #endif
532 :
533 3823104 : h1 = _AVX_BROADCAST(&hh[nb-1]);
534 :
535 3823104 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
536 3823104 : q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
537 3823104 : q3 = _AVX_LOAD(&q[((nb+2)*ldq)+2*offset]);
538 :
539 : #ifdef __ELPA_USE_FMA__
540 1911552 : x1 = _AVX_FMA(q1, h1, x1);
541 1911552 : x2 = _AVX_FMA(q2, h1, x2);
542 1911552 : x3 = _AVX_FMA(q3, h1, x3);
543 : #else
544 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
545 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
546 : x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
547 : #endif
548 :
549 : /////////////////////////////////////////////////////
550 : // Rank-1 update of Q [12 x nb+3]
551 : /////////////////////////////////////////////////////
552 :
553 1911552 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
554 :
555 1911552 : h1 = tau1;
556 1911552 : x1 = _AVX_MUL(x1, h1);
557 1911552 : x2 = _AVX_MUL(x2, h1);
558 1911552 : x3 = _AVX_MUL(x3, h1);
559 :
560 3823104 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
561 1911552 : __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&s_1_2);
562 :
563 1911552 : h1 = tau2;
564 1911552 : h2 = _AVX_MUL(h1, vs_1_2);
565 : #ifdef __ELPA_USE_FMA__
566 3823104 : y1 = _AVX_FMSUB(y1, h1, _AVX_MUL(x1,h2));
567 3823104 : y2 = _AVX_FMSUB(y2, h1, _AVX_MUL(x2,h2));
568 3823104 : y3 = _AVX_FMSUB(y3, h1, _AVX_MUL(x3,h2));
569 : #else
570 : y1 = _AVX_SUB(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
571 : y2 = _AVX_SUB(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
572 : y3 = _AVX_SUB(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
573 : #endif
574 :
575 3823104 : __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
576 1911552 : __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&s_1_3);
577 1911552 : __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&s_2_3);
578 :
579 1911552 : h1 = tau3;
580 1911552 : h2 = _AVX_MUL(h1, vs_1_3);
581 1911552 : h3 = _AVX_MUL(h1, vs_2_3);
582 : #ifdef __ELPA_USE_FMA__
583 5734656 : z1 = _AVX_FMSUB(z1, h1, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
584 5734656 : z2 = _AVX_FMSUB(z2, h1, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)));
585 5734656 : z3 = _AVX_FMSUB(z3, h1, _AVX_FMA(y3, h3, _AVX_MUL(x3,h2)));
586 : #else
587 : z1 = _AVX_SUB(_AVX_MUL(z1,h1), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
588 : z2 = _AVX_SUB(_AVX_MUL(z2,h1), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)));
589 : z3 = _AVX_SUB(_AVX_MUL(z3,h1), _AVX_ADD(_AVX_MUL(y3,h3), _AVX_MUL(x3,h2)));
590 : #endif
591 :
592 3823104 : __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
593 1911552 : __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&s_1_4);
594 1911552 : __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&s_2_4);
595 1911552 : __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&s_3_4);
596 :
597 1911552 : h1 = tau4;
598 1911552 : h2 = _AVX_MUL(h1, vs_1_4);
599 1911552 : h3 = _AVX_MUL(h1, vs_2_4);
600 1911552 : h4 = _AVX_MUL(h1, vs_3_4);
601 : #ifdef __ELPA_USE_FMA__
602 7646208 : w1 = _AVX_FMSUB(w1, h1, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
603 7646208 : w2 = _AVX_FMSUB(w2, h1, _AVX_FMA(z2, h4, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
604 7646208 : w3 = _AVX_FMSUB(w3, h1, _AVX_FMA(z3, h4, _AVX_FMA(y3, h3, _AVX_MUL(x3,h2))));
605 : #else
606 : w1 = _AVX_SUB(_AVX_MUL(w1,h1), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
607 : w2 = _AVX_SUB(_AVX_MUL(w2,h1), _AVX_ADD(_AVX_MUL(z2,h4), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
608 : w3 = _AVX_SUB(_AVX_MUL(w3,h1), _AVX_ADD(_AVX_MUL(z3,h4), _AVX_ADD(_AVX_MUL(y3,h3), _AVX_MUL(x3,h2))));
609 : #endif
610 :
611 1911552 : q1 = _AVX_LOAD(&q[0]);
612 3823104 : q2 = _AVX_LOAD(&q[offset]);
613 3823104 : q3 = _AVX_LOAD(&q[2*offset]);
614 1911552 : q1 = _AVX_SUB(q1, w1);
615 1911552 : q2 = _AVX_SUB(q2, w2);
616 1911552 : q3 = _AVX_SUB(q3, w3);
617 : _AVX_STORE(&q[0],q1);
618 1911552 : _AVX_STORE(&q[offset],q2);
619 1911552 : _AVX_STORE(&q[2*offset],q3);
620 :
621 3823104 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
622 3823104 : q1 = _AVX_LOAD(&q[ldq]);
623 3823104 : q2 = _AVX_LOAD(&q[ldq+offset]);
624 3823104 : q3 = _AVX_LOAD(&q[ldq+2*offset]);
625 : #ifdef __ELPA_USE_FMA__
626 3823104 : q1 = _AVX_SUB(q1, _AVX_FMA(w1, h4, z1));
627 3823104 : q2 = _AVX_SUB(q2, _AVX_FMA(w2, h4, z2));
628 3823104 : q3 = _AVX_SUB(q3, _AVX_FMA(w3, h4, z3));
629 : #else
630 : q1 = _AVX_SUB(q1, _AVX_ADD(z1, _AVX_MUL(w1, h4)));
631 : q2 = _AVX_SUB(q2, _AVX_ADD(z2, _AVX_MUL(w2, h4)));
632 : q3 = _AVX_SUB(q3, _AVX_ADD(z3, _AVX_MUL(w3, h4)));
633 : #endif
634 1911552 : _AVX_STORE(&q[ldq],q1);
635 1911552 : _AVX_STORE(&q[ldq+offset],q2);
636 1911552 : _AVX_STORE(&q[ldq+2*offset],q3);
637 :
638 3823104 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
639 3823104 : q1 = _AVX_LOAD(&q[ldq*2]);
640 3823104 : q2 = _AVX_LOAD(&q[(ldq*2)+offset]);
641 3823104 : q3 = _AVX_LOAD(&q[(ldq*2)+2*offset]);
642 1911552 : q1 = _AVX_SUB(q1, y1);
643 1911552 : q2 = _AVX_SUB(q2, y2);
644 1911552 : q3 = _AVX_SUB(q3, y3);
645 : #ifdef __ELPA_USE_FMA__
646 1911552 : q1 = _AVX_NFMA(w1, h4, q1);
647 1911552 : q2 = _AVX_NFMA(w2, h4, q2);
648 1911552 : q3 = _AVX_NFMA(w3, h4, q3);
649 : #else
650 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
651 : q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
652 : q3 = _AVX_SUB(q3, _AVX_MUL(w3, h4));
653 : #endif
654 3823104 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
655 : #ifdef __ELPA_USE_FMA__
656 1911552 : q1 = _AVX_NFMA(z1, h3, q1);
657 1911552 : q2 = _AVX_NFMA(z2, h3, q2);
658 1911552 : q3 = _AVX_NFMA(z3, h3, q3);
659 : #else
660 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
661 : q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
662 : q3 = _AVX_SUB(q3, _AVX_MUL(z3, h3));
663 : #endif
664 1911552 : _AVX_STORE(&q[ldq*2],q1);
665 1911552 : _AVX_STORE(&q[(ldq*2)+offset],q2);
666 1911552 : _AVX_STORE(&q[(ldq*2)+2*offset],q3);
667 :
668 3823104 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
669 3823104 : q1 = _AVX_LOAD(&q[ldq*3]);
670 3823104 : q2 = _AVX_LOAD(&q[(ldq*3)+offset]);
671 3823104 : q3 = _AVX_LOAD(&q[(ldq*3)+2*offset]);
672 1911552 : q1 = _AVX_SUB(q1, x1);
673 1911552 : q2 = _AVX_SUB(q2, x2);
674 1911552 : q3 = _AVX_SUB(q3, x3);
675 : #ifdef __ELPA_USE_FMA__
676 1911552 : q1 = _AVX_NFMA(w1, h4, q1);
677 1911552 : q2 = _AVX_NFMA(w2, h4, q2);
678 1911552 : q3 = _AVX_NFMA(w3, h4, q3);
679 : #else
680 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
681 : q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
682 : q3 = _AVX_SUB(q3, _AVX_MUL(w3, h4));
683 : #endif
684 3823104 : h2 = _AVX_BROADCAST(&hh[ldh+1]);
685 : #ifdef __ELPA_USE_FMA__
686 1911552 : q1 = _AVX_NFMA(y1, h2, q1);
687 1911552 : q2 = _AVX_NFMA(y2, h2, q2);
688 1911552 : q3 = _AVX_NFMA(y3, h2, q3);
689 : #else
690 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
691 : q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
692 : q3 = _AVX_SUB(q3, _AVX_MUL(y3, h2));
693 : #endif
694 3823104 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
695 : #ifdef __ELPA_USE_FMA__
696 1911552 : q1 = _AVX_NFMA(z1, h3, q1);
697 1911552 : q2 = _AVX_NFMA(z2, h3, q2);
698 1911552 : q3 = _AVX_NFMA(z3, h3, q3);
699 : #else
700 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
701 : q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
702 : q3 = _AVX_SUB(q3, _AVX_MUL(z3, h3));
703 : #endif
704 1911552 : _AVX_STORE(&q[ldq*3], q1);
705 1911552 : _AVX_STORE(&q[(ldq*3)+offset], q2);
706 1911552 : _AVX_STORE(&q[(ldq*3)+2*offset], q3);
707 :
708 116604672 : for (i = 4; i < nb; i++)
709 : {
710 229386240 : h1 = _AVX_BROADCAST(&hh[i-3]);
711 :
712 229386240 : q1 = _AVX_LOAD(&q[i*ldq]);
713 229386240 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
714 229386240 : q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
715 :
716 : #ifdef __ELPA_USE_FMA__
717 114693120 : q1 = _AVX_NFMA(x1, h1, q1);
718 114693120 : q2 = _AVX_NFMA(x2, h1, q2);
719 114693120 : q3 = _AVX_NFMA(x3, h1, q3);
720 : #else
721 : q1 = _AVX_SUB(q1, _AVX_MUL(x1,h1));
722 : q2 = _AVX_SUB(q2, _AVX_MUL(x2,h1));
723 : q3 = _AVX_SUB(q3, _AVX_MUL(x3,h1));
724 : #endif
725 :
726 229386240 : h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
727 : #ifdef __ELPA_USE_FMA__
728 114693120 : q1 = _AVX_NFMA(y1, h2, q1);
729 114693120 : q2 = _AVX_NFMA(y2, h2, q2);
730 114693120 : q3 = _AVX_NFMA(y3, h2, q3);
731 : #else
732 : q1 = _AVX_SUB(q1, _AVX_MUL(y1,h2));
733 : q2 = _AVX_SUB(q2, _AVX_MUL(y2,h2));
734 : q3 = _AVX_SUB(q3, _AVX_MUL(y3,h2));
735 : #endif
736 :
737 229386240 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
738 : #ifdef __ELPA_USE_FMA__
739 114693120 : q1 = _AVX_NFMA(z1, h3, q1);
740 114693120 : q2 = _AVX_NFMA(z2, h3, q2);
741 114693120 : q3 = _AVX_NFMA(z3, h3, q3);
742 : #else
743 : q1 = _AVX_SUB(q1, _AVX_MUL(z1,h3));
744 : q2 = _AVX_SUB(q2, _AVX_MUL(z2,h3));
745 : q3 = _AVX_SUB(q3, _AVX_MUL(z3,h3));
746 : #endif
747 :
748 229386240 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
749 : #ifdef __ELPA_USE_FMA__
750 114693120 : q1 = _AVX_NFMA(w1, h4, q1);
751 114693120 : q2 = _AVX_NFMA(w2, h4, q2);
752 114693120 : q3 = _AVX_NFMA(w3, h4, q3);
753 : #else
754 : q1 = _AVX_SUB(q1, _AVX_MUL(w1,h4));
755 : q2 = _AVX_SUB(q2, _AVX_MUL(w2,h4));
756 : q3 = _AVX_SUB(q3, _AVX_MUL(w3,h4));
757 : #endif
758 :
759 114693120 : _AVX_STORE(&q[i*ldq],q1);
760 114693120 : _AVX_STORE(&q[(i*ldq)+offset],q2);
761 114693120 : _AVX_STORE(&q[(i*ldq)+2*offset],q3);
762 : }
763 :
764 3823104 : h1 = _AVX_BROADCAST(&hh[nb-3]);
765 3823104 : q1 = _AVX_LOAD(&q[nb*ldq]);
766 3823104 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
767 3823104 : q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
768 : #ifdef __ELPA_USE_FMA__
769 1911552 : q1 = _AVX_NFMA(x1, h1, q1);
770 1911552 : q2 = _AVX_NFMA(x2, h1, q2);
771 1911552 : q3 = _AVX_NFMA(x3, h1, q3);
772 : #else
773 : q1 = _AVX_SUB(q1, _AVX_MUL(x1,h1));
774 : q2 = _AVX_SUB(q2, _AVX_MUL(x2,h1));
775 : q3 = _AVX_SUB(q3, _AVX_MUL(x3,h1));
776 : #endif
777 3823104 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
778 : #ifdef __ELPA_USE_FMA__
779 1911552 : q1 = _AVX_NFMA(y1, h2, q1);
780 1911552 : q2 = _AVX_NFMA(y2, h2, q2);
781 1911552 : q3 = _AVX_NFMA(y3, h2, q3);
782 : #else
783 : q1 = _AVX_SUB(q1, _AVX_MUL(y1,h2));
784 : q2 = _AVX_SUB(q2, _AVX_MUL(y2,h2));
785 : q3 = _AVX_SUB(q3, _AVX_MUL(y3,h2));
786 : #endif
787 3823104 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
788 : #ifdef __ELPA_USE_FMA__
789 1911552 : q1 = _AVX_NFMA(z1, h3, q1);
790 1911552 : q2 = _AVX_NFMA(z2, h3, q2);
791 1911552 : q3 = _AVX_NFMA(z3, h3, q3);
792 : #else
793 : q1 = _AVX_SUB(q1, _AVX_MUL(z1,h3));
794 : q2 = _AVX_SUB(q2, _AVX_MUL(z2,h3));
795 : q3 = _AVX_SUB(q3, _AVX_MUL(z3,h3));
796 : #endif
797 1911552 : _AVX_STORE(&q[nb*ldq],q1);
798 1911552 : _AVX_STORE(&q[(nb*ldq)+offset],q2);
799 1911552 : _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
800 :
801 3823104 : h1 = _AVX_BROADCAST(&hh[nb-2]);
802 3823104 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
803 3823104 : q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
804 3823104 : q3 = _AVX_LOAD(&q[((nb+1)*ldq)+2*offset]);
805 : #ifdef __ELPA_USE_FMA__
806 1911552 : q1 = _AVX_NFMA(x1, h1, q1);
807 1911552 : q2 = _AVX_NFMA(x2, h1, q2);
808 1911552 : q3 = _AVX_NFMA(x3, h1, q3);
809 : #else
810 : q1 = _AVX_SUB(q1, _AVX_MUL(x1,h1));
811 : q2 = _AVX_SUB(q2, _AVX_MUL(x2,h1));
812 : q3 = _AVX_SUB(q3, _AVX_MUL(x3,h1));
813 : #endif
814 3823104 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
815 : #ifdef __ELPA_USE_FMA__
816 1911552 : q1 = _AVX_NFMA(y1, h2, q1);
817 1911552 : q2 = _AVX_NFMA(y2, h2, q2);
818 1911552 : q3 = _AVX_NFMA(y3, h2, q3);
819 : #else
820 : q1 = _AVX_SUB(q1, _AVX_MUL(y1,h2));
821 : q2 = _AVX_SUB(q2, _AVX_MUL(y2,h2));
822 : q3 = _AVX_SUB(q3, _AVX_MUL(y3,h2));
823 : #endif
824 1911552 : _AVX_STORE(&q[(nb+1)*ldq],q1);
825 1911552 : _AVX_STORE(&q[((nb+1)*ldq)+offset],q2);
826 1911552 : _AVX_STORE(&q[((nb+1)*ldq)+2*offset],q3);
827 :
828 3823104 : h1 = _AVX_BROADCAST(&hh[nb-1]);
829 3823104 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
830 3823104 : q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
831 3823104 : q3 = _AVX_LOAD(&q[((nb+2)*ldq)+2*offset]);
832 : #ifdef __ELPA_USE_FMA__
833 1911552 : q1 = _AVX_NFMA(x1, h1, q1);
834 1911552 : q2 = _AVX_NFMA(x2, h1, q2);
835 1911552 : q3 = _AVX_NFMA(x3, h1, q3);
836 : #else
837 : q1 = _AVX_SUB(q1, _AVX_MUL(x1,h1));
838 : q2 = _AVX_SUB(q2, _AVX_MUL(x2,h1));
839 : q3 = _AVX_SUB(q3, _AVX_MUL(x3,h1));
840 : #endif
841 1911552 : _AVX_STORE(&q[(nb+2)*ldq],q1);
842 301824 : _AVX_STORE(&q[((nb+2)*ldq)+offset],q2);
843 1911552 : _AVX_STORE(&q[((nb+2)*ldq)+2*offset],q3);
844 : }
845 :
846 : /**
847 : * Unrolled kernel that computes
848 : #ifdef DOUBLE_PRECISION_REAL
849 : * 8 rows of Q simultaneously, a
850 : #endif
851 : #ifdef SINGLE_PRECISION_REAL
852 : * 16 rows of Q simultaneously, a
853 : #endif
854 :
855 : * matrix Vector product with two householder
856 : * vectors + a rank 1 update is performed
857 : */
858 : #ifdef DOUBLE_PRECISION_REAL
859 : __forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
860 : #endif
861 : #ifdef SINGLE_PRECISION_REAL
862 : __forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
863 : #endif
864 : {
865 : /////////////////////////////////////////////////////
866 : // Matrix Vector Multiplication, Q [4 x nb+3] * hh
867 : // hh contains four householder vectors
868 : /////////////////////////////////////////////////////
869 : int i;
870 :
871 0 : __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*3]);
872 0 : __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*2]);
873 0 : __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq]);
874 0 : __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[0]);
875 :
876 0 : __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
877 0 : __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
878 0 : __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
879 0 : __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
880 0 : __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
881 0 : __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
882 :
883 : #ifdef __ELPA_USE_FMA__
884 0 : __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
885 0 : w1 = _AVX_FMA(a2_1, h_4_2, w1);
886 0 : w1 = _AVX_FMA(a1_1, h_4_1, w1);
887 0 : __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
888 0 : z1 = _AVX_FMA(a1_1, h_3_1, z1);
889 0 : __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
890 0 : __AVX_DATATYPE x1 = a1_1;
891 : #else
892 : __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
893 : w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
894 : w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
895 : __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
896 : z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
897 : __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
898 : __AVX_DATATYPE x1 = a1_1;
899 : #endif
900 :
901 0 : __AVX_DATATYPE a1_2 = _AVX_LOAD(&q[(ldq*3)+offset]);
902 0 : __AVX_DATATYPE a2_2 = _AVX_LOAD(&q[(ldq*2)+offset]);
903 0 : __AVX_DATATYPE a3_2 = _AVX_LOAD(&q[ldq+offset]);
904 0 : __AVX_DATATYPE a4_2 = _AVX_LOAD(&q[0+offset]);
905 :
906 : #ifdef __ELPA_USE_FMA__
907 0 : __AVX_DATATYPE w2 = _AVX_FMA(a3_2, h_4_3, a4_2);
908 0 : w2 = _AVX_FMA(a2_2, h_4_2, w2);
909 0 : w2 = _AVX_FMA(a1_2, h_4_1, w2);
910 0 : __AVX_DATATYPE z2 = _AVX_FMA(a2_2, h_3_2, a3_2);
911 0 : z2 = _AVX_FMA(a1_2, h_3_1, z2);
912 0 : __AVX_DATATYPE y2 = _AVX_FMA(a1_2, h_2_1, a2_2);
913 0 : __AVX_DATATYPE x2 = a1_2;
914 : #else
915 : __AVX_DATATYPE w2 = _AVX_ADD(a4_2, _AVX_MUL(a3_2, h_4_3));
916 : w2 = _AVX_ADD(w2, _AVX_MUL(a2_2, h_4_2));
917 : w2 = _AVX_ADD(w2, _AVX_MUL(a1_2, h_4_1));
918 : __AVX_DATATYPE z2 = _AVX_ADD(a3_2, _AVX_MUL(a2_2, h_3_2));
919 : z2 = _AVX_ADD(z2, _AVX_MUL(a1_2, h_3_1));
920 : __AVX_DATATYPE y2 = _AVX_ADD(a2_2, _AVX_MUL(a1_2, h_2_1));
921 : __AVX_DATATYPE x2 = a1_2;
922 : #endif
923 :
924 : __AVX_DATATYPE q1;
925 : __AVX_DATATYPE q2;
926 :
927 : __AVX_DATATYPE h1;
928 : __AVX_DATATYPE h2;
929 : __AVX_DATATYPE h3;
930 : __AVX_DATATYPE h4;
931 :
932 0 : for(i = 4; i < nb; i++)
933 : {
934 0 : h1 = _AVX_BROADCAST(&hh[i-3]);
935 0 : h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
936 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
937 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
938 :
939 0 : q1 = _AVX_LOAD(&q[i*ldq]);
940 : #ifdef __ELPA_USE_FMA__
941 0 : x1 = _AVX_FMA(q1, h1, x1);
942 0 : y1 = _AVX_FMA(q1, h2, y1);
943 0 : z1 = _AVX_FMA(q1, h3, z1);
944 0 : w1 = _AVX_FMA(q1, h4, w1);
945 : #else
946 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
947 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
948 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
949 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
950 : #endif
951 :
952 0 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
953 : #ifdef __ELPA_USE_FMA__
954 0 : x2 = _AVX_FMA(q2, h1, x2);
955 0 : y2 = _AVX_FMA(q2, h2, y2);
956 0 : z2 = _AVX_FMA(q2, h3, z2);
957 0 : w2 = _AVX_FMA(q2, h4, w2);
958 : #else
959 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
960 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
961 : z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
962 : w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
963 : #endif
964 : }
965 :
966 0 : h1 = _AVX_BROADCAST(&hh[nb-3]);
967 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
968 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
969 :
970 0 : q1 = _AVX_LOAD(&q[nb*ldq]);
971 0 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
972 :
973 : #ifdef __ELPA_USE_FMA__
974 0 : x1 = _AVX_FMA(q1, h1, x1);
975 0 : x2 = _AVX_FMA(q2, h1, x2);
976 0 : y1 = _AVX_FMA(q1, h2, y1);
977 0 : y2 = _AVX_FMA(q2, h2, y2);
978 0 : z1 = _AVX_FMA(q1, h3, z1);
979 0 : z2 = _AVX_FMA(q2, h3, z2);
980 : #else
981 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
982 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
983 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
984 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
985 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
986 : z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
987 : #endif
988 :
989 0 : h1 = _AVX_BROADCAST(&hh[nb-2]);
990 0 : h2 = _AVX_BROADCAST(&hh[(ldh*1)+nb-1]);
991 :
992 0 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
993 0 : q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
994 :
995 : #ifdef __ELPA_USE_FMA__
996 0 : x1 = _AVX_FMA(q1, h1, x1);
997 0 : x2 = _AVX_FMA(q2, h1, x2);
998 0 : y1 = _AVX_FMA(q1, h2, y1);
999 0 : y2 = _AVX_FMA(q2, h2, y2);
1000 : #else
1001 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1002 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
1003 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1004 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
1005 : #endif
1006 :
1007 0 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1008 :
1009 0 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1010 0 : q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
1011 :
1012 : #ifdef __ELPA_USE_FMA__
1013 0 : x1 = _AVX_FMA(q1, h1, x1);
1014 0 : x2 = _AVX_FMA(q2, h1, x2);
1015 : #else
1016 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1017 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
1018 : #endif
1019 :
1020 : /////////////////////////////////////////////////////
1021 : // Rank-1 update of Q [8 x nb+3]
1022 : /////////////////////////////////////////////////////
1023 :
1024 0 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
1025 0 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
1026 0 : __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
1027 0 : __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
1028 :
1029 0 : __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&s_1_2);
1030 0 : __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&s_1_3);
1031 0 : __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&s_2_3);
1032 0 : __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&s_1_4);
1033 0 : __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&s_2_4);
1034 0 : __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&s_3_4);
1035 :
1036 0 : h1 = tau1;
1037 0 : x1 = _AVX_MUL(x1, h1);
1038 0 : x2 = _AVX_MUL(x2, h1);
1039 :
1040 0 : h1 = tau2;
1041 0 : h2 = _AVX_MUL(h1, vs_1_2);
1042 : #ifdef __ELPA_USE_FMA__
1043 0 : y1 = _AVX_FMSUB(y1, h1, _AVX_MUL(x1,h2));
1044 0 : y2 = _AVX_FMSUB(y2, h1, _AVX_MUL(x2,h2));
1045 : #else
1046 : y1 = _AVX_SUB(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
1047 : y2 = _AVX_SUB(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
1048 : #endif
1049 :
1050 0 : h1 = tau3;
1051 0 : h2 = _AVX_MUL(h1, vs_1_3);
1052 0 : h3 = _AVX_MUL(h1, vs_2_3);
1053 : #ifdef __ELPA_USE_FMA__
1054 0 : z1 = _AVX_FMSUB(z1, h1, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
1055 0 : z2 = _AVX_FMSUB(z2, h1, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)));
1056 : #else
1057 : z1 = _AVX_SUB(_AVX_MUL(z1,h1), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
1058 : z2 = _AVX_SUB(_AVX_MUL(z2,h1), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)));
1059 : #endif
1060 :
1061 0 : h1 = tau4;
1062 0 : h2 = _AVX_MUL(h1, vs_1_4);
1063 0 : h3 = _AVX_MUL(h1, vs_2_4);
1064 0 : h4 = _AVX_MUL(h1, vs_3_4);
1065 : #ifdef __ELPA_USE_FMA__
1066 0 : w1 = _AVX_FMSUB(w1, h1, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
1067 0 : w2 = _AVX_FMSUB(w2, h1, _AVX_FMA(z2, h4, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
1068 : #else
1069 : w1 = _AVX_SUB(_AVX_MUL(w1,h1), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
1070 : w2 = _AVX_SUB(_AVX_MUL(w2,h1), _AVX_ADD(_AVX_MUL(z2,h4), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
1071 : #endif
1072 :
1073 0 : q1 = _AVX_LOAD(&q[0]);
1074 0 : q2 = _AVX_LOAD(&q[offset]);
1075 0 : q1 = _AVX_SUB(q1, w1);
1076 0 : q2 = _AVX_SUB(q2, w2);
1077 : _AVX_STORE(&q[0],q1);
1078 0 : _AVX_STORE(&q[offset],q2);
1079 :
1080 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
1081 0 : q1 = _AVX_LOAD(&q[ldq]);
1082 0 : q2 = _AVX_LOAD(&q[ldq+offset]);
1083 : #ifdef __ELPA_USE_FMA__
1084 0 : q1 = _AVX_SUB(q1, _AVX_FMA(w1, h4, z1));
1085 0 : q2 = _AVX_SUB(q2, _AVX_FMA(w2, h4, z2));
1086 : #else
1087 : q1 = _AVX_SUB(q1, _AVX_ADD(z1, _AVX_MUL(w1, h4)));
1088 : q2 = _AVX_SUB(q2, _AVX_ADD(z2, _AVX_MUL(w2, h4)));
1089 : #endif
1090 0 : _AVX_STORE(&q[ldq],q1);
1091 0 : _AVX_STORE(&q[ldq+offset],q2);
1092 :
1093 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
1094 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
1095 0 : q1 = _AVX_LOAD(&q[ldq*2]);
1096 0 : q2 = _AVX_LOAD(&q[(ldq*2)+offset]);
1097 : #ifdef __ELPA_USE_FMA__
1098 0 : q1 = _AVX_SUB(q1, y1);
1099 0 : q1 = _AVX_NFMA(z1, h3, q1);
1100 0 : q1 = _AVX_NFMA(w1, h4, q1);
1101 0 : q2 = _AVX_SUB(q2, y2);
1102 0 : q2 = _AVX_NFMA(z2, h3, q2);
1103 0 : q2 = _AVX_NFMA(w2, h4, q2);
1104 : #else
1105 : q1 = _AVX_SUB(q1, _AVX_ADD(y1, _AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(w1, h4))));
1106 : q2 = _AVX_SUB(q2, _AVX_ADD(y2, _AVX_ADD(_AVX_MUL(z2, h3), _AVX_MUL(w2, h4))));
1107 : #endif
1108 0 : _AVX_STORE(&q[ldq*2],q1);
1109 0 : _AVX_STORE(&q[(ldq*2)+offset],q2);
1110 :
1111 0 : h2 = _AVX_BROADCAST(&hh[ldh+1]);
1112 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
1113 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
1114 0 : q1 = _AVX_LOAD(&q[ldq*3]);
1115 0 : q2 = _AVX_LOAD(&q[(ldq*3)+offset]);
1116 : #ifdef __ELPA_USE_FMA__
1117 0 : q1 = _AVX_SUB(q1, x1);
1118 0 : q1 = _AVX_NFMA(y1, h2, q1);
1119 0 : q1 = _AVX_NFMA(z1, h3, q1);
1120 0 : q1 = _AVX_NFMA(w1, h4, q1);
1121 0 : q2 = _AVX_SUB(q2, x2);
1122 0 : q2 = _AVX_NFMA(y2, h2, q2);
1123 0 : q2 = _AVX_NFMA(z2, h3, q2);
1124 0 : q2 = _AVX_NFMA(w2, h4, q2);
1125 : #else
1126 : q1 = _AVX_SUB(q1, _AVX_ADD(x1, _AVX_ADD(_AVX_MUL(y1, h2), _AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(w1, h4)))));
1127 : q2 = _AVX_SUB(q2, _AVX_ADD(x2, _AVX_ADD(_AVX_MUL(y2, h2), _AVX_ADD(_AVX_MUL(z2, h3), _AVX_MUL(w2, h4)))));
1128 : #endif
1129 0 : _AVX_STORE(&q[ldq*3], q1);
1130 0 : _AVX_STORE(&q[(ldq*3)+offset], q2);
1131 :
1132 0 : for (i = 4; i < nb; i++)
1133 : {
1134 0 : h1 = _AVX_BROADCAST(&hh[i-3]);
1135 0 : h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
1136 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
1137 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
1138 :
1139 : #ifdef __ELPA_USE_FMA__
1140 0 : q1 = _AVX_LOAD(&q[i*ldq]);
1141 0 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
1142 0 : q1 = _AVX_NFMA(x1, h1, q1);
1143 0 : q1 = _AVX_NFMA(y1, h2, q1);
1144 0 : q1 = _AVX_NFMA(z1, h3, q1);
1145 0 : q1 = _AVX_NFMA(w1, h4, q1);
1146 0 : q2 = _AVX_NFMA(x2, h1, q2);
1147 0 : q2 = _AVX_NFMA(y2, h2, q2);
1148 0 : q2 = _AVX_NFMA(z2, h3, q2);
1149 0 : q2 = _AVX_NFMA(w2, h4, q2);
1150 0 : _AVX_STORE(&q[i*ldq],q1);
1151 0 : _AVX_STORE(&q[(i*ldq)+offset],q2);
1152 : #else
1153 : q1 = _AVX_LOAD(&q[i*ldq]);
1154 : q1 = _AVX_SUB(q1, _AVX_ADD(_AVX_ADD(_AVX_MUL(w1, h4), _AVX_MUL(z1, h3)), _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2))));
1155 : _AVX_STORE(&q[i*ldq],q1);
1156 :
1157 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
1158 : q2 = _AVX_SUB(q2, _AVX_ADD(_AVX_ADD(_AVX_MUL(w2, h4), _AVX_MUL(z2, h3)), _AVX_ADD(_AVX_MUL(x2,h1), _AVX_MUL(y2, h2))));
1159 : _AVX_STORE(&q[(i*ldq)+offset],q2);
1160 : #endif
1161 : }
1162 :
1163 0 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1164 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1165 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1166 0 : q1 = _AVX_LOAD(&q[nb*ldq]);
1167 0 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
1168 : #ifdef __ELPA_USE_FMA__
1169 0 : q1 = _AVX_NFMA(x1, h1, q1);
1170 0 : q1 = _AVX_NFMA(y1, h2, q1);
1171 0 : q1 = _AVX_NFMA(z1, h3, q1);
1172 0 : q2 = _AVX_NFMA(x2, h1, q2);
1173 0 : q2 = _AVX_NFMA(y2, h2, q2);
1174 0 : q2 = _AVX_NFMA(z2, h3, q2);
1175 : #else
1176 : q1 = _AVX_SUB(q1, _AVX_ADD(_AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(y1, h2)) , _AVX_MUL(x1, h1)));
1177 : q2 = _AVX_SUB(q2, _AVX_ADD(_AVX_ADD(_AVX_MUL(z2, h3), _AVX_MUL(y2, h2)) , _AVX_MUL(x2, h1)));
1178 : #endif
1179 0 : _AVX_STORE(&q[nb*ldq],q1);
1180 0 : _AVX_STORE(&q[(nb*ldq)+offset],q2);
1181 :
1182 0 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1183 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
1184 0 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
1185 0 : q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
1186 : #ifdef __ELPA_USE_FMA__
1187 0 : q1 = _AVX_NFMA(x1, h1, q1);
1188 0 : q1 = _AVX_NFMA(y1, h2, q1);
1189 0 : q2 = _AVX_NFMA(x2, h1, q2);
1190 0 : q2 = _AVX_NFMA(y2, h2, q2);
1191 : #else
1192 : q1 = _AVX_SUB(q1, _AVX_ADD( _AVX_MUL(y1, h2) , _AVX_MUL(x1, h1)));
1193 : q2 = _AVX_SUB(q2, _AVX_ADD( _AVX_MUL(y2, h2) , _AVX_MUL(x2, h1)));
1194 : #endif
1195 0 : _AVX_STORE(&q[(nb+1)*ldq],q1);
1196 0 : _AVX_STORE(&q[((nb+1)*ldq)+offset],q2);
1197 :
1198 0 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1199 0 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1200 0 : q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
1201 : #ifdef __ELPA_USE_FMA__
1202 0 : q1 = _AVX_NFMA(x1, h1, q1);
1203 0 : q2 = _AVX_NFMA(x2, h1, q2);
1204 : #else
1205 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1206 : q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
1207 : #endif
1208 0 : _AVX_STORE(&q[(nb+2)*ldq],q1);
1209 0 : _AVX_STORE(&q[((nb+2)*ldq)+offset],q2);
1210 : }
1211 :
1212 : /**
1213 : * Unrolled kernel that computes
1214 : #ifdef DOUBLE_PRECISION_REAL
1215 : * 4 rows of Q simultaneously, a
1216 : #endif
1217 : #ifdef SINGLE_PRECISION_REAL
1218 : * 8 rows of Q simultaneously, a
1219 : #endif
1220 : * matrix Vector product with two householder
1221 : * vectors + a rank 1 update is performed
1222 : */
1223 : #ifdef DOUBLE_PRECISION_REAL
1224 : __forceinline void hh_trafo_kernel_4_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
1225 : #endif
1226 : #ifdef SINGLE_PRECISION_REAL
1227 : __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
1228 : #endif
1229 : {
1230 : /////////////////////////////////////////////////////
1231 : // Matrix Vector Multiplication, Q [4 x nb+3] * hh
1232 : // hh contains four householder vectors
1233 : /////////////////////////////////////////////////////
1234 : int i;
1235 :
1236 1274368 : __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*3]);
1237 1274368 : __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*2]);
1238 1274368 : __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq]);
1239 637184 : __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[0]);
1240 :
1241 1274368 : __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
1242 1274368 : __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
1243 1274368 : __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
1244 1274368 : __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
1245 1274368 : __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
1246 1274368 : __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
1247 :
1248 : #ifdef __ELPA_USE_FMA__
1249 637184 : __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
1250 637184 : w1 = _AVX_FMA(a2_1, h_4_2, w1);
1251 637184 : w1 = _AVX_FMA(a1_1, h_4_1, w1);
1252 637184 : __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
1253 637184 : z1 = _AVX_FMA(a1_1, h_3_1, z1);
1254 637184 : __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
1255 637184 : __AVX_DATATYPE x1 = a1_1;
1256 : #else
1257 : __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
1258 : w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
1259 : w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
1260 : __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
1261 : z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
1262 : __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
1263 : __AVX_DATATYPE x1 = a1_1;
1264 : #endif
1265 :
1266 : __AVX_DATATYPE q1;
1267 :
1268 : __AVX_DATATYPE h1;
1269 : __AVX_DATATYPE h2;
1270 : __AVX_DATATYPE h3;
1271 : __AVX_DATATYPE h4;
1272 :
1273 38868224 : for(i = 4; i < nb; i++)
1274 : {
1275 76462080 : h1 = _AVX_BROADCAST(&hh[i-3]);
1276 76462080 : h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
1277 76462080 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
1278 76462080 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
1279 :
1280 76462080 : q1 = _AVX_LOAD(&q[i*ldq]);
1281 : #ifdef __ELPA_USE_FMA__
1282 38231040 : x1 = _AVX_FMA(q1, h1, x1);
1283 38231040 : y1 = _AVX_FMA(q1, h2, y1);
1284 38231040 : z1 = _AVX_FMA(q1, h3, z1);
1285 38231040 : w1 = _AVX_FMA(q1, h4, w1);
1286 : #else
1287 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1288 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1289 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1290 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
1291 : #endif
1292 : }
1293 :
1294 1274368 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1295 1274368 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1296 1274368 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1297 1274368 : q1 = _AVX_LOAD(&q[nb*ldq]);
1298 : #ifdef _FMA4__
1299 : x1 = _AVX_FMA(q1, h1, x1);
1300 : y1 = _AVX_FMA(q1, h2, y1);
1301 : z1 = _AVX_FMA(q1, h3, z1);
1302 : #else
1303 1274368 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1304 1274368 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1305 1274368 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1306 : #endif
1307 :
1308 1274368 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1309 1274368 : h2 = _AVX_BROADCAST(&hh[(ldh*1)+nb-1]);
1310 1274368 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
1311 : #ifdef __ELPA_USE_FMA__
1312 637184 : x1 = _AVX_FMA(q1, h1, x1);
1313 637184 : y1 = _AVX_FMA(q1, h2, y1);
1314 : #else
1315 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1316 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1317 : #endif
1318 :
1319 1274368 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1320 1274368 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1321 : #ifdef __ELPA_USE_FMA__
1322 637184 : x1 = _AVX_FMA(q1, h1, x1);
1323 : #else
1324 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1325 : #endif
1326 :
1327 : /////////////////////////////////////////////////////
1328 : // Rank-1 update of Q [4 x nb+3]
1329 : /////////////////////////////////////////////////////
1330 :
1331 637184 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
1332 1274368 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
1333 1274368 : __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
1334 1274368 : __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
1335 :
1336 637184 : __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&s_1_2);
1337 637184 : __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&s_1_3);
1338 637184 : __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&s_2_3);
1339 637184 : __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&s_1_4);
1340 637184 : __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&s_2_4);
1341 637184 : __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&s_3_4);
1342 :
1343 637184 : h1 = tau1;
1344 637184 : x1 = _AVX_MUL(x1, h1);
1345 :
1346 637184 : h1 = tau2;
1347 637184 : h2 = _AVX_MUL(h1, vs_1_2);
1348 : #ifdef __ELPA_USE_FMA__
1349 1274368 : y1 = _AVX_FMSUB(y1, h1, _AVX_MUL(x1,h2));
1350 : #else
1351 : y1 = _AVX_SUB(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
1352 : #endif
1353 :
1354 637184 : h1 = tau3;
1355 637184 : h2 = _AVX_MUL(h1, vs_1_3);
1356 637184 : h3 = _AVX_MUL(h1, vs_2_3);
1357 : #ifdef __ELPA_USE_FMA__
1358 1911552 : z1 = _AVX_FMSUB(z1, h1, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
1359 : #else
1360 : z1 = _AVX_SUB(_AVX_MUL(z1,h1), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
1361 : #endif
1362 :
1363 637184 : h1 = tau4;
1364 637184 : h2 = _AVX_MUL(h1, vs_1_4);
1365 637184 : h3 = _AVX_MUL(h1, vs_2_4);
1366 637184 : h4 = _AVX_MUL(h1, vs_3_4);
1367 : #ifdef __ELPA_USE_FMA__
1368 2548736 : w1 = _AVX_FMSUB(w1, h1, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
1369 : #else
1370 : w1 = _AVX_SUB(_AVX_MUL(w1,h1), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
1371 : #endif
1372 :
1373 637184 : q1 = _AVX_LOAD(&q[0]);
1374 637184 : q1 = _AVX_SUB(q1, w1);
1375 : _AVX_STORE(&q[0],q1);
1376 :
1377 1274368 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
1378 1274368 : q1 = _AVX_LOAD(&q[ldq]);
1379 : #ifdef __ELPA_USE_FMA__
1380 1274368 : q1 = _AVX_SUB(q1, _AVX_FMA(w1, h4, z1));
1381 : #else
1382 : q1 = _AVX_SUB(q1, _AVX_ADD(z1, _AVX_MUL(w1, h4)));
1383 : #endif
1384 637184 : _AVX_STORE(&q[ldq],q1);
1385 :
1386 1274368 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
1387 1274368 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
1388 1274368 : q1 = _AVX_LOAD(&q[ldq*2]);
1389 : #ifdef __ELPA_USE_FMA__
1390 637184 : q1 = _AVX_SUB(q1, y1);
1391 637184 : q1 = _AVX_NFMA(z1, h3, q1);
1392 637184 : q1 = _AVX_NFMA(w1, h4, q1);
1393 : #else
1394 : q1 = _AVX_SUB(q1, _AVX_ADD(y1, _AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(w1, h4))));
1395 : #endif
1396 637184 : _AVX_STORE(&q[ldq*2],q1);
1397 :
1398 1274368 : h2 = _AVX_BROADCAST(&hh[ldh+1]);
1399 1274368 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
1400 1274368 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
1401 1274368 : q1 = _AVX_LOAD(&q[ldq*3]);
1402 : #ifdef __ELPA_USE_FMA__
1403 637184 : q1 = _AVX_SUB(q1, x1);
1404 637184 : q1 = _AVX_NFMA(y1, h2, q1);
1405 637184 : q1 = _AVX_NFMA(z1, h3, q1);
1406 637184 : q1 = _AVX_NFMA(w1, h4, q1);
1407 : #else
1408 : q1 = _AVX_SUB(q1, _AVX_ADD(x1, _AVX_ADD(_AVX_MUL(y1, h2), _AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(w1, h4)))));
1409 : #endif
1410 637184 : _AVX_STORE(&q[ldq*3], q1);
1411 :
1412 38868224 : for (i = 4; i < nb; i++)
1413 : {
1414 76462080 : h1 = _AVX_BROADCAST(&hh[i-3]);
1415 76462080 : h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
1416 76462080 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
1417 76462080 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
1418 :
1419 76462080 : q1 = _AVX_LOAD(&q[i*ldq]);
1420 : #ifdef __ELPA_USE_FMA__
1421 38231040 : q1 = _AVX_NFMA(x1, h1, q1);
1422 38231040 : q1 = _AVX_NFMA(y1, h2, q1);
1423 38231040 : q1 = _AVX_NFMA(z1, h3, q1);
1424 38231040 : q1 = _AVX_NFMA(w1, h4, q1);
1425 : #else
1426 : q1 = _AVX_SUB(q1, _AVX_ADD(_AVX_ADD(_AVX_MUL(w1, h4), _AVX_MUL(z1, h3)), _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2))));
1427 : #endif
1428 38231040 : _AVX_STORE(&q[i*ldq],q1);
1429 : }
1430 :
1431 1274368 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1432 1274368 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1433 1274368 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1434 1274368 : q1 = _AVX_LOAD(&q[nb*ldq]);
1435 : #ifdef __ELPA_USE_FMA__
1436 637184 : q1 = _AVX_NFMA(x1, h1, q1);
1437 637184 : q1 = _AVX_NFMA(y1, h2, q1);
1438 637184 : q1 = _AVX_NFMA(z1, h3, q1);
1439 : #else
1440 : q1 = _AVX_SUB(q1, _AVX_ADD(_AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(y1, h2)) , _AVX_MUL(x1, h1)));
1441 : #endif
1442 637184 : _AVX_STORE(&q[nb*ldq],q1);
1443 :
1444 1274368 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1445 1274368 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
1446 1274368 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
1447 : #ifdef __ELPA_USE_FMA__
1448 637184 : q1 = _AVX_NFMA(x1, h1, q1);
1449 637184 : q1 = _AVX_NFMA(y1, h2, q1);
1450 : #else
1451 : q1 = _AVX_SUB(q1, _AVX_ADD( _AVX_MUL(y1, h2) , _AVX_MUL(x1, h1)));
1452 : #endif
1453 637184 : _AVX_STORE(&q[(nb+1)*ldq],q1);
1454 :
1455 1274368 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1456 1274368 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1457 : #ifdef __ELPA_USE_FMA__
1458 637184 : q1 = _AVX_NFMA(x1, h1, q1);
1459 : #else
1460 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1461 : #endif
1462 637184 : _AVX_STORE(&q[(nb+2)*ldq],q1);
1463 : }
1464 :
1465 :
1466 : #if 0
1467 : #ifdef SINGLE_PRECISION_REAL
1468 : /**
1469 : * Unrolled kernel that computes
1470 : * 4 rows of Q simultaneously, a
1471 : * matrix Vector product with two householder
1472 : * vectors + a rank 1 update is performed
1473 : */
1474 : __forceinline void hh_trafo_kernel_4_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
1475 : {
1476 : /////////////////////////////////////////////////////
1477 : // Matrix Vector Multiplication, Q [4 x nb+3] * hh
1478 : // hh contains four householder vectors
1479 : /////////////////////////////////////////////////////
1480 : int i;
1481 :
1482 : __AVX_DATATYPE a1_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*3]));
1483 : __AVX_DATATYPE a2_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*2]));
1484 : __AVX_DATATYPE a3_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq]));
1485 : __AVX_DATATYPE a4_1 = _mm256_castps128_ps256(_mm_load_ps(&q[0])); // q(1,1) | q(2,1) | q(3,1) | q(4,1) |
1486 :
1487 : __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
1488 : __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
1489 : __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
1490 : __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
1491 : __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
1492 : __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
1493 :
1494 : #ifdef __ELPA_USE_FMA__
1495 : __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
1496 : w1 = _AVX_FMA(a2_1, h_4_2, w1);
1497 : w1 = _AVX_FMA(a1_1, h_4_1, w1);
1498 : __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
1499 : z1 = _AVX_FMA(a1_1, h_3_1, z1);
1500 : __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
1501 : __AVX_DATATYPE x1 = a1_1;
1502 : #else
1503 : __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
1504 : w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
1505 : w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
1506 : __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
1507 : z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
1508 : __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
1509 : __AVX_DATATYPE x1 = a1_1;
1510 : #endif
1511 :
1512 : __AVX_DATATYPE q1;
1513 :
1514 : __AVX_DATATYPE h1;
1515 : __AVX_DATATYPE h2;
1516 : __AVX_DATATYPE h3;
1517 : __AVX_DATATYPE h4;
1518 :
1519 : for(i = 4; i < nb; i++)
1520 : {
1521 : h1 = _AVX_BROADCAST(&hh[i-3]);
1522 : h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
1523 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
1524 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
1525 :
1526 : q1 = _AVX_LOAD(&q[i*ldq]);
1527 : #ifdef __ELPA_USE_FMA__
1528 : x1 = _AVX_FMA(q1, h1, x1);
1529 : y1 = _AVX_FMA(q1, h2, y1);
1530 : z1 = _AVX_FMA(q1, h3, z1);
1531 : w1 = _AVX_FMA(q1, h4, w1);
1532 : #else
1533 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1534 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1535 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1536 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
1537 : #endif
1538 : }
1539 :
1540 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1541 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1542 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1543 : q1 = _AVX_LOAD(&q[nb*ldq]);
1544 : #ifdef _FMA4__
1545 : x1 = _AVX_FMA(q1, h1, x1);
1546 : y1 = _AVX_FMA(q1, h2, y1);
1547 : z1 = _AVX_FMA(q1, h3, z1);
1548 : #else
1549 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1550 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1551 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1552 : #endif
1553 :
1554 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1555 : h2 = _AVX_BROADCAST(&hh[(ldh*1)+nb-1]);
1556 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
1557 : #ifdef __ELPA_USE_FMA__
1558 : x1 = _AVX_FMA(q1, h1, x1);
1559 : y1 = _AVX_FMA(q1, h2, y1);
1560 : #else
1561 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1562 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1563 : #endif
1564 :
1565 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1566 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1567 : #ifdef __ELPA_USE_FMA__
1568 : x1 = _AVX_FMA(q1, h1, x1);
1569 : #else
1570 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1571 : #endif
1572 :
1573 : /////////////////////////////////////////////////////
1574 : // Rank-1 update of Q [4 x nb+3]
1575 : /////////////////////////////////////////////////////
1576 :
1577 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
1578 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
1579 : __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
1580 : __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
1581 :
1582 : __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&s_1_2);
1583 : __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&s_1_3);
1584 : __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&s_2_3);
1585 : __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&s_1_4);
1586 : __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&s_2_4);
1587 : __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&s_3_4);
1588 :
1589 : h1 = tau1;
1590 : x1 = _AVX_MUL(x1, h1);
1591 :
1592 : h1 = tau2;
1593 : h2 = _AVX_MUL(h1, vs_1_2);
1594 : #ifdef __ELPA_USE_FMA__
1595 : y1 = _AVX_FMSUB(y1, h1, _AVX_MUL(x1,h2));
1596 : #else
1597 : y1 = _AVX_SUB(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
1598 : #endif
1599 :
1600 : h1 = tau3;
1601 : h2 = _AVX_MUL(h1, vs_1_3);
1602 : h3 = _AVX_MUL(h1, vs_2_3);
1603 : #ifdef __ELPA_USE_FMA__
1604 : z1 = _AVX_FMSUB(z1, h1, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
1605 : #else
1606 : z1 = _AVX_SUB(_AVX_MUL(z1,h1), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
1607 : #endif
1608 :
1609 : h1 = tau4;
1610 : h2 = _AVX_MUL(h1, vs_1_4);
1611 : h3 = _AVX_MUL(h1, vs_2_4);
1612 : h4 = _AVX_MUL(h1, vs_3_4);
1613 : #ifdef __ELPA_USE_FMA__
1614 : w1 = _AVX_FMSUB(w1, h1, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
1615 : #else
1616 : w1 = _AVX_SUB(_AVX_MUL(w1,h1), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
1617 : #endif
1618 :
1619 : q1 = _AVX_LOAD(&q[0]);
1620 : q1 = _AVX_SUB(q1, w1);
1621 : _mm_store_ps(&q[0], _mm256_castps256_ps128(q1));
1622 : // _AVX_STORE(&q[0],q1);
1623 :
1624 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
1625 : q1 = _AVX_LOAD(&q[ldq]);
1626 : #ifdef __ELPA_USE_FMA__
1627 : q1 = _AVX_SUB(q1, _AVX_FMA(w1, h4, z1));
1628 : #else
1629 : q1 = _AVX_SUB(q1, _AVX_ADD(z1, _AVX_MUL(w1, h4)));
1630 : #endif
1631 : _mm_store_ps(&q[ldq], _mm256_castps256_ps128(q1));
1632 : // _AVX_STORE(&q[ldq],q1);
1633 :
1634 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
1635 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
1636 : q1 = _AVX_LOAD(&q[ldq*2]);
1637 : #ifdef __ELPA_USE_FMA__
1638 : q1 = _AVX_SUB(q1, y1);
1639 : q1 = _AVX_NFMA(z1, h3, q1);
1640 : q1 = _AVX_NFMA(w1, h4, q1);
1641 : #else
1642 : q1 = _AVX_SUB(q1, _AVX_ADD(y1, _AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(w1, h4))));
1643 : #endif
1644 : _mm_store_ps(&q[ldq*2], _mm256_castps256_ps128(q1));
1645 : // _AVX_STORE(&q[ldq*2],q1);
1646 :
1647 : h2 = _AVX_BROADCAST(&hh[ldh+1]);
1648 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
1649 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
1650 : q1 = _AVX_LOAD(&q[ldq*3]);
1651 : #ifdef __ELPA_USE_FMA__
1652 : q1 = _AVX_SUB(q1, x1);
1653 : q1 = _AVX_NFMA(y1, h2, q1);
1654 : q1 = _AVX_NFMA(z1, h3, q1);
1655 : q1 = _AVX_NFMA(w1, h4, q1);
1656 : #else
1657 : q1 = _AVX_SUB(q1, _AVX_ADD(x1, _AVX_ADD(_AVX_MUL(y1, h2), _AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(w1, h4)))));
1658 : #endif
1659 : _mm_store_ps(&q[ldq*3], _mm256_castps256_ps128(q1));
1660 : // _AVX_STORE(&q[ldq*3], q1);
1661 :
1662 : for (i = 4; i < nb; i++)
1663 : {
1664 : h1 = _AVX_BROADCAST(&hh[i-3]);
1665 : h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
1666 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
1667 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
1668 :
1669 : q1 = _AVX_LOAD(&q[i*ldq]);
1670 : #ifdef __ELPA_USE_FMA__
1671 : q1 = _AVX_NFMA(x1, h1, q1);
1672 : q1 = _AVX_NFMA(y1, h2, q1);
1673 : q1 = _AVX_NFMA(z1, h3, q1);
1674 : q1 = _AVX_NFMA(w1, h4, q1);
1675 : #else
1676 : q1 = _AVX_SUB(q1, _AVX_ADD(_AVX_ADD(_AVX_MUL(w1, h4), _AVX_MUL(z1, h3)), _AVX_ADD(_AVX_MUL(x1,h1), _AVX_MUL(y1, h2))));
1677 : #endif
1678 : _mm_store_ps(&q[ldq*i], _mm256_castps256_ps128(q1));
1679 : // _AVX_STORE(&q[i*ldq],q1);
1680 : }
1681 :
1682 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1683 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1684 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1685 : q1 = _AVX_LOAD(&q[nb*ldq]);
1686 : #ifdef __ELPA_USE_FMA__
1687 : q1 = _AVX_NFMA(x1, h1, q1);
1688 : q1 = _AVX_NFMA(y1, h2, q1);
1689 : q1 = _AVX_NFMA(z1, h3, q1);
1690 : #else
1691 : q1 = _AVX_SUB(q1, _AVX_ADD(_AVX_ADD(_AVX_MUL(z1, h3), _AVX_MUL(y1, h2)) , _AVX_MUL(x1, h1)));
1692 : #endif
1693 : _mm_store_ps(&q[ldq*nb], _mm256_castps256_ps128(q1));
1694 : // _AVX_STORE(&q[nb*ldq],q1);
1695 :
1696 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1697 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
1698 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
1699 : #ifdef __ELPA_USE_FMA__
1700 : q1 = _AVX_NFMA(x1, h1, q1);
1701 : q1 = _AVX_NFMA(y1, h2, q1);
1702 : #else
1703 : q1 = _AVX_SUB(q1, _AVX_ADD( _AVX_MUL(y1, h2) , _AVX_MUL(x1, h1)));
1704 : #endif
1705 : _mm_store_ps(&q[ldq*(nb+1)], _mm256_castps256_ps128(q1));
1706 : // _AVX_STORE(&q[(nb+1)*ldq],q1);
1707 :
1708 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1709 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1710 : #ifdef __ELPA_USE_FMA__
1711 : q1 = _AVX_NFMA(x1, h1, q1);
1712 : #else
1713 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1714 : #endif
1715 : _mm_store_ps(&q[ldq*(nb+2)], _mm256_castps256_ps128(q1));
1716 : // _AVX_STORE(&q[(nb+2)*ldq],q1);
1717 : }
1718 : #endif /* SINGLE_PRECISION_REAL */
1719 : #endif
|