Line data Source code
1 : // This file is part of ELPA.
2 : //
3 : // The ELPA library was originally created by the ELPA consortium,
4 : // consisting of the following organizations:
5 : //
6 : // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : // Informatik,
10 : // - Technische Universität München, Lehrstuhl für Informatik mit
11 : // Schwerpunkt Wissenschaftliches Rechnen ,
12 : // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : // - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 : // Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : // and
16 : // - IBM Deutschland GmbH
17 : //
18 : // This particular source code file contains additions, changes and
19 : // enhancements authored by Intel Corporation which is not part of
20 : // the ELPA consortium.
21 : //
22 : // More information can be found here:
23 : // http://elpa.mpcdf.mpg.de/
24 : //
25 : // ELPA is free software: you can redistribute it and/or modify
26 : // it under the terms of the version 3 of the license of the
27 : // GNU Lesser General Public License as published by the Free
28 : // Software Foundation.
29 : //
30 : // ELPA is distributed in the hope that it will be useful,
31 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : // GNU Lesser General Public License for more details.
34 : //
35 : // You should have received a copy of the GNU Lesser General Public License
36 : // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : //
38 : // ELPA reflects a substantial effort on the part of the original
39 : // ELPA consortium, and we ask you to respect the spirit of the
40 : // license that we chose: i.e., please contribute any changes you
41 : // may have back to the original ELPA library distribution, and keep
42 : // any derivatives of ELPA under the same license that we chose for
43 : // the original distribution, the GNU Lesser General Public License.
44 : //
45 : //
46 : // --------------------------------------------------------------------------------------------------
47 : //
48 : // This file contains the compute intensive kernels for the Householder transformations.
49 : // It should be compiled with the highest possible optimization level.
50 : //
51 : // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
52 : // On Intel Sandy Bridge use -O3 -mavx
53 : //
54 : // Copyright of the original code rests with the authors inside the ELPA
55 : // consortium. The copyright of any additional modifications shall rest
56 : // with their original authors, but shall adhere to the licensing terms
57 : // distributed along with the original code in the file "COPYING".
58 : //
59 : // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
60 : // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
61 : // --------------------------------------------------------------------------------------------------
62 :
63 : #include "config-f90.h"
64 :
65 : #include <x86intrin.h>
66 : #include <stdio.h>
67 : #include <stdlib.h>
68 :
69 : #define __forceinline __attribute__((always_inline)) static
70 :
 : // Precision-dependent mapping of generic _AVX_* names onto AVX intrinsics.
 : // "offset" is the number of elements held by one 256-bit register
 : // (4 doubles / 8 floats); it is used to address the second register of q.
71 : #ifdef DOUBLE_PRECISION_REAL
72 : #define offset 4
73 : #define __AVX_DATATYPE __m256d
74 : #define _AVX_LOAD _mm256_load_pd
75 : #define _AVX_STORE _mm256_store_pd
76 : #define _AVX_ADD _mm256_add_pd
77 : #define _AVX_SUB _mm256_sub_pd
78 : #define _AVX_MUL _mm256_mul_pd
79 : #define _AVX_BROADCAST _mm256_broadcast_sd
80 :
81 : #ifdef HAVE_AVX2
82 :
83 : #ifdef __FMA4__
84 : #define __ELPA_USE_FMA__
85 : #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
86 : #define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c)
 : // FIX: the AMD FMA4 fused-multiply-subtract intrinsic is _mm256_msub_pd;
 : // a bare "_mm256_msub" does not exist and broke FMA4-only builds.
87 : #define _mm256_FMSUB_pd(a,b,c) _mm256_msub_pd(a,b,c)
88 : #endif
89 :
90 : #ifdef __AVX2__
91 : #define __ELPA_USE_FMA__
92 : #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
93 : #define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c)
94 : #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
95 : #endif
96 :
97 : #endif
98 :
99 : #define _AVX_FMA _mm256_FMA_pd
100 : #define _AVX_NFMA _mm256_NFMA_pd
101 : #define _AVX_FMSUB _mm256_FMSUB_pd
102 : #endif /* DOUBLE_PRECISION_REAL */
103 :
104 : #ifdef SINGLE_PRECISION_REAL
105 : #define offset 8
106 : #define __AVX_DATATYPE __m256
107 : #define _AVX_LOAD _mm256_load_ps
108 : #define _AVX_STORE _mm256_store_ps
109 : #define _AVX_ADD _mm256_add_ps
110 : #define _AVX_SUB _mm256_sub_ps
111 : #define _AVX_MUL _mm256_mul_ps
112 : #define _AVX_BROADCAST _mm256_broadcast_ss
113 :
114 : #ifdef HAVE_AVX2
115 :
116 : #ifdef __FMA4__
117 : #define __ELPA_USE_FMA__
118 : #define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
119 : #define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
 : // FIX: single-precision FMA4 fused-multiply-subtract is _mm256_msub_ps
 : // (was a bare "_mm256_msub", which is not a real intrinsic).
120 : #define _mm256_FMSUB_ps(a,b,c) _mm256_msub_ps(a,b,c)
121 : #endif
122 :
123 : #ifdef __AVX2__
124 : #define __ELPA_USE_FMA__
125 : #define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
126 : #define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
127 : #define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
128 : #endif
129 :
130 : #endif
131 :
132 : #define _AVX_FMA _mm256_FMA_ps
133 : #define _AVX_NFMA _mm256_NFMA_ps
134 : #define _AVX_FMSUB _mm256_FMSUB_ps
135 : #endif /* SINGLE_PRECISION_REAL */
136 :
137 : #ifdef DOUBLE_PRECISION_REAL
138 : //Forward declaration
139 : static void hh_trafo_kernel_4_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
140 : static void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
141 :
142 : void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
143 : #endif
144 :
145 : #ifdef SINGLE_PRECISION_REAL
146 : //Forward declaration
147 : static void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
148 : static void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
149 :
 : // FIX: dropped stray trailing underscore so the prototype matches the
 : // definition below and the Fortran bind(C) name in the interface comment.
150 : void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
151 :
152 : #endif
153 :
154 : #ifdef DOUBLE_PRECISION_REAL
155 : /*
156 : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
157 : !f> interface
158 : !f> subroutine hexa_hh_trafo_real_avx_avx2_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
159 : !f> bind(C, name="hexa_hh_trafo_real_avx_avx2_6hv_double")
160 : !f> use, intrinsic :: iso_c_binding
161 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
162 : !f> type(c_ptr), value :: q
163 : !f> real(kind=c_double) :: hh(pnb,6)
164 : !f> end subroutine
165 : !f> end interface
166 : !f>#endif
167 : */
168 : #endif
169 : #ifdef SINGLE_PRECISION_REAL
170 : /*
171 : !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
172 : !f> interface
173 : !f> subroutine hexa_hh_trafo_real_avx_avx2_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
174 : !f> bind(C, name="hexa_hh_trafo_real_avx_avx2_6hv_single")
175 : !f> use, intrinsic :: iso_c_binding
176 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
177 : !f> type(c_ptr), value :: q
178 : !f> real(kind=c_float) :: hh(pnb,6)
179 : !f> end subroutine
180 : !f> end interface
181 : !f>#endif
182 : */
183 : #endif
184 :
185 : #ifdef DOUBLE_PRECISION_REAL
186 327680 : void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
187 : #endif
188 : #ifdef SINGLE_PRECISION_REAL
189 61440 : void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
190 : #endif
191 : {
 : // Driver for the 6-Householder-vector (hexa) transformation of Q:
 : // precompute the 15 pairwise scalar products of the six reflectors stored
 : // in the columns of hh, then apply them to the rows of q in chunks of
 : // 8 (double) / 16 (single) columns via the unrolled AVX kernels below,
 : // with a half-width kernel for the remainder.
192 : int i;
193 389120 : int nb = *pnb;
194 389120 : int nq = *pldq;
 : // NOTE(review): nq is read from *pldq, not *pnq, and *pnq is never used
 : // in this function -- presumably callers guarantee nq == ldq (padded Q);
 : // confirm against the Fortran caller.
195 389120 : int ldq = *pldq;
196 389120 : int ldh = *pldh;
197 : int worked_on;
198 :
199 389120 : worked_on = 0;
200 :
201 : // calculating scalar products to compute
202 : // 6 householder vectors simultaneously
203 : #ifdef DOUBLE_PRECISION_REAL
204 : double scalarprods[15];
205 : #endif
206 : #ifdef SINGLE_PRECISION_REAL
207 : float scalarprods[15];
208 : #endif
209 :
 : // Initialize each pairwise product with the single hh entry that
 : // multiplies the implicit leading 1 of the shorter reflector.
210 389120 : scalarprods[0] = hh[(ldh+1)];
211 389120 : scalarprods[1] = hh[(ldh*2)+2];
212 389120 : scalarprods[2] = hh[(ldh*2)+1];
213 389120 : scalarprods[3] = hh[(ldh*3)+3];
214 389120 : scalarprods[4] = hh[(ldh*3)+2];
215 389120 : scalarprods[5] = hh[(ldh*3)+1];
216 389120 : scalarprods[6] = hh[(ldh*4)+4];
217 389120 : scalarprods[7] = hh[(ldh*4)+3];
218 389120 : scalarprods[8] = hh[(ldh*4)+2];
219 389120 : scalarprods[9] = hh[(ldh*4)+1];
220 389120 : scalarprods[10] = hh[(ldh*5)+5];
221 389120 : scalarprods[11] = hh[(ldh*5)+4];
222 389120 : scalarprods[12] = hh[(ldh*5)+3];
223 389120 : scalarprods[13] = hh[(ldh*5)+2];
224 389120 : scalarprods[14] = hh[(ldh*5)+1];
225 :
226 : // peeled first iterations of the accumulation below (loop counters 2..5);
 : // a new pair of reflectors starts contributing at each counter value
227 : // loop counter = 2
228 389120 : scalarprods[0] += hh[1] * hh[(2+ldh)];
229 389120 : scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
230 389120 : scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
231 389120 : scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
232 389120 : scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];
233 :
234 : // loop counter = 3
235 389120 : scalarprods[0] += hh[2] * hh[(3+ldh)];
236 389120 : scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
237 389120 : scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
238 389120 : scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
239 389120 : scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
240 :
241 389120 : scalarprods[1] += hh[1] * hh[3+(ldh*2)];
242 389120 : scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
243 389120 : scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
244 389120 : scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];
245 :
246 : // loop counter = 4
247 389120 : scalarprods[0] += hh[3] * hh[(4+ldh)];
248 389120 : scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
249 389120 : scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
250 389120 : scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
251 389120 : scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];
252 :
253 389120 : scalarprods[1] += hh[2] * hh[4+(ldh*2)];
254 389120 : scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
255 389120 : scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
256 389120 : scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];
257 :
258 389120 : scalarprods[3] += hh[1] * hh[4+(ldh*3)];
259 389120 : scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
260 389120 : scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];
261 :
262 : // loop counter = 5
263 389120 : scalarprods[0] += hh[4] * hh[(5+ldh)];
264 389120 : scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
265 389120 : scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
266 389120 : scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
267 389120 : scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];
268 :
269 389120 : scalarprods[1] += hh[3] * hh[5+(ldh*2)];
270 389120 : scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
271 389120 : scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
272 389120 : scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];
273 :
274 389120 : scalarprods[3] += hh[2] * hh[5+(ldh*3)];
275 389120 : scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
276 389120 : scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];
277 :
278 389120 : scalarprods[6] += hh[1] * hh[5+(ldh*4)];
279 389120 : scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];
280 :
 : // Steady-state tail of the accumulation: reflector j is shifted (j-k)
 : // rows against reflector k, hence the i-1 .. i-5 index offsets below.
281 : #pragma ivdep
282 22958080 : for (i = 6; i < nb; i++)
283 : {
284 22568960 : scalarprods[0] += hh[i-1] * hh[(i+ldh)];
285 22568960 : scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
286 22568960 : scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
287 22568960 : scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
288 22568960 : scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];
289 :
290 22568960 : scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
291 22568960 : scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
292 22568960 : scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
293 22568960 : scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];
294 :
295 22568960 : scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
296 22568960 : scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
297 22568960 : scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];
298 :
299 22568960 : scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
300 22568960 : scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];
301 :
302 22568960 : scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
303 : }
304 :
305 : // Production level kernel calls with padding
 : // Wide kernel: 8 (double) / 16 (single) consecutive columns of q per call.
306 : #ifdef DOUBLE_PRECISION_REAL
307 1966080 : for (i = 0; i < nq-4; i+=8)
308 : {
309 1638400 : hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
310 1638400 : worked_on += 8;
311 : }
312 : #endif
313 : #ifdef SINGLE_PRECISION_REAL
314 368640 : for (i = 0; i < nq-8; i+=16)
315 : {
316 307200 : hh_trafo_kernel_16_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
317 307200 : worked_on += 16;
318 : }
319 : #endif
 : // All rows consumed by the wide kernel -- no remainder to process.
320 389120 : if (nq == i)
321 : {
322 389120 : return;
323 : }
 : // Otherwise handle the half-width remainder with the narrow kernel.
324 : #ifdef DOUBLE_PRECISION_REAL
325 0 : if (nq-i == 4)
326 : {
327 0 : hh_trafo_kernel_4_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
328 0 : worked_on += 4;
329 : }
330 : #endif
331 : #ifdef SINGLE_PRECISION_REAL
332 0 : if (nq-i == 8)
333 : {
334 0 : hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
335 0 : worked_on += 8;
336 : }
337 : #endif
 : // Debug-only sanity check: every one of the nq rows must be accounted for.
338 : #ifdef WITH_DEBUG
339 : if (worked_on != nq)
340 : {
341 : printf("Error in real AVX/AVX2 BLOCK6 kernel \n");
342 : abort();
343 : }
344 : #endif
345 : }
346 :
347 : /**
348 : * Unrolled kernel that computes
349 : #ifdef DOUBLE_PRECISION_REAL
350 : * 8 rows of Q simultaneously, a
351 : #endif
352 : #ifdef SINGLE_PRECISION_REAL
353 : * 16 rows of Q simultaneously, a
354 : #endif
355 : * matrix Vector product with six householder
356 : * vectors + a rank 1 update is performed
357 : */
358 : #ifdef DOUBLE_PRECISION_REAL
359 : __forceinline void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
360 : #endif
361 : #ifdef SINGLE_PRECISION_REAL
362 : __forceinline void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
363 : #endif
364 : {
365 : /////////////////////////////////////////////////////
366 : // Matrix Vector Multiplication, Q [8 x nb+3] * hh
367 : // hh contains four householder vectors
368 : /////////////////////////////////////////////////////
369 : int i;
370 :
371 3891200 : __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*5]);
372 3891200 : __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*4]);
373 3891200 : __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq*3]);
374 3891200 : __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[ldq*2]);
375 3891200 : __AVX_DATATYPE a5_1 = _AVX_LOAD(&q[ldq]);
376 1945600 : __AVX_DATATYPE a6_1 = _AVX_LOAD(&q[0]);
377 :
378 3891200 : __AVX_DATATYPE h_6_5 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
379 3891200 : __AVX_DATATYPE h_6_4 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
380 3891200 : __AVX_DATATYPE h_6_3 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
381 3891200 : __AVX_DATATYPE h_6_2 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
382 3891200 : __AVX_DATATYPE h_6_1 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
383 : #ifdef __ELPA_USE_FMA__
384 1945600 : register __AVX_DATATYPE t1 = _AVX_FMA(a5_1, h_6_5, a6_1);
385 1945600 : t1 = _AVX_FMA(a4_1, h_6_4, t1);
386 1945600 : t1 = _AVX_FMA(a3_1, h_6_3, t1);
387 1945600 : t1 = _AVX_FMA(a2_1, h_6_2, t1);
388 1945600 : t1 = _AVX_FMA(a1_1, h_6_1, t1);
389 : #else
390 : register __AVX_DATATYPE t1 = _AVX_ADD(a6_1, _AVX_MUL(a5_1, h_6_5));
391 : t1 = _AVX_ADD(t1, _AVX_MUL(a4_1, h_6_4));
392 : t1 = _AVX_ADD(t1, _AVX_MUL(a3_1, h_6_3));
393 : t1 = _AVX_ADD(t1, _AVX_MUL(a2_1, h_6_2));
394 : t1 = _AVX_ADD(t1, _AVX_MUL(a1_1, h_6_1));
395 : #endif
396 3891200 : __AVX_DATATYPE h_5_4 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
397 3891200 : __AVX_DATATYPE h_5_3 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
398 3891200 : __AVX_DATATYPE h_5_2 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
399 3891200 : __AVX_DATATYPE h_5_1 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
400 : #ifdef __ELPA_USE_FMA__
401 1945600 : register __AVX_DATATYPE v1 = _AVX_FMA(a4_1, h_5_4, a5_1);
402 1945600 : v1 = _AVX_FMA(a3_1, h_5_3, v1);
403 1945600 : v1 = _AVX_FMA(a2_1, h_5_2, v1);
404 1945600 : v1 = _AVX_FMA(a1_1, h_5_1, v1);
405 : #else
406 : register __AVX_DATATYPE v1 = _AVX_ADD(a5_1, _AVX_MUL(a4_1, h_5_4));
407 : v1 = _AVX_ADD(v1, _AVX_MUL(a3_1, h_5_3));
408 : v1 = _AVX_ADD(v1, _AVX_MUL(a2_1, h_5_2));
409 : v1 = _AVX_ADD(v1, _AVX_MUL(a1_1, h_5_1));
410 : #endif
411 3891200 : __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
412 3891200 : __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
413 3891200 : __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
414 : #ifdef __ELPA_USE_FMA__
415 1945600 : register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
416 1945600 : w1 = _AVX_FMA(a2_1, h_4_2, w1);
417 1945600 : w1 = _AVX_FMA(a1_1, h_4_1, w1);
418 : #else
419 : register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
420 : w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
421 : w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
422 : #endif
423 3891200 : __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
424 3891200 : __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
425 3891200 : __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
426 : #ifdef __ELPA_USE_FMA__
427 1945600 : register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
428 1945600 : z1 = _AVX_FMA(a1_1, h_3_1, z1);
429 1945600 : register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
430 : #else
431 : register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
432 : z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
433 : register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
434 : #endif
435 1945600 : register __AVX_DATATYPE x1 = a1_1;
436 :
437 3891200 : __AVX_DATATYPE a1_2 = _AVX_LOAD(&q[(ldq*5)+offset]);
438 3891200 : __AVX_DATATYPE a2_2 = _AVX_LOAD(&q[(ldq*4)+offset]);
439 3891200 : __AVX_DATATYPE a3_2 = _AVX_LOAD(&q[(ldq*3)+offset]);
440 3891200 : __AVX_DATATYPE a4_2 = _AVX_LOAD(&q[(ldq*2)+offset]);
441 3891200 : __AVX_DATATYPE a5_2 = _AVX_LOAD(&q[(ldq)+offset]);
442 3891200 : __AVX_DATATYPE a6_2 = _AVX_LOAD(&q[offset]);
443 :
444 : #ifdef __ELPA_USE_FMA__
445 1945600 : register __AVX_DATATYPE t2 = _AVX_FMA(a5_2, h_6_5, a6_2);
446 1945600 : t2 = _AVX_FMA(a4_2, h_6_4, t2);
447 1945600 : t2 = _AVX_FMA(a3_2, h_6_3, t2);
448 1945600 : t2 = _AVX_FMA(a2_2, h_6_2, t2);
449 1945600 : t2 = _AVX_FMA(a1_2, h_6_1, t2);
450 1945600 : register __AVX_DATATYPE v2 = _AVX_FMA(a4_2, h_5_4, a5_2);
451 1945600 : v2 = _AVX_FMA(a3_2, h_5_3, v2);
452 1945600 : v2 = _AVX_FMA(a2_2, h_5_2, v2);
453 1945600 : v2 = _AVX_FMA(a1_2, h_5_1, v2);
454 1945600 : register __AVX_DATATYPE w2 = _AVX_FMA(a3_2, h_4_3, a4_2);
455 1945600 : w2 = _AVX_FMA(a2_2, h_4_2, w2);
456 1945600 : w2 = _AVX_FMA(a1_2, h_4_1, w2);
457 1945600 : register __AVX_DATATYPE z2 = _AVX_FMA(a2_2, h_3_2, a3_2);
458 1945600 : z2 = _AVX_FMA(a1_2, h_3_1, z2);
459 1945600 : register __AVX_DATATYPE y2 = _AVX_FMA(a1_2, h_2_1, a2_2);
460 : #else
461 : register __AVX_DATATYPE t2 = _AVX_ADD(a6_2, _AVX_MUL(a5_2, h_6_5));
462 : t2 = _AVX_ADD(t2, _AVX_MUL(a4_2, h_6_4));
463 : t2 = _AVX_ADD(t2, _AVX_MUL(a3_2, h_6_3));
464 : t2 = _AVX_ADD(t2, _AVX_MUL(a2_2, h_6_2));
465 : t2 = _AVX_ADD(t2, _AVX_MUL(a1_2, h_6_1));
466 : register __AVX_DATATYPE v2 = _AVX_ADD(a5_2, _AVX_MUL(a4_2, h_5_4));
467 : v2 = _AVX_ADD(v2, _AVX_MUL(a3_2, h_5_3));
468 : v2 = _AVX_ADD(v2, _AVX_MUL(a2_2, h_5_2));
469 : v2 = _AVX_ADD(v2, _AVX_MUL(a1_2, h_5_1));
470 : register __AVX_DATATYPE w2 = _AVX_ADD(a4_2, _AVX_MUL(a3_2, h_4_3));
471 : w2 = _AVX_ADD(w2, _AVX_MUL(a2_2, h_4_2));
472 : w2 = _AVX_ADD(w2, _AVX_MUL(a1_2, h_4_1));
473 : register __AVX_DATATYPE z2 = _AVX_ADD(a3_2, _AVX_MUL(a2_2, h_3_2));
474 : z2 = _AVX_ADD(z2, _AVX_MUL(a1_2, h_3_1));
475 : register __AVX_DATATYPE y2 = _AVX_ADD(a2_2, _AVX_MUL(a1_2, h_2_1));
476 : #endif
477 1945600 : register __AVX_DATATYPE x2 = a1_2;
478 :
479 : __AVX_DATATYPE q1;
480 : __AVX_DATATYPE q2;
481 :
482 : __AVX_DATATYPE h1;
483 : __AVX_DATATYPE h2;
484 : __AVX_DATATYPE h3;
485 : __AVX_DATATYPE h4;
486 : __AVX_DATATYPE h5;
487 : __AVX_DATATYPE h6;
488 :
489 114790400 : for(i = 6; i < nb; i++)
490 : {
491 225689600 : h1 = _AVX_BROADCAST(&hh[i-5]);
492 225689600 : q1 = _AVX_LOAD(&q[i*ldq]);
493 225689600 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
494 : #ifdef __ELPA_USE_FMA__
495 112844800 : x1 = _AVX_FMA(q1, h1, x1);
496 112844800 : x2 = _AVX_FMA(q2, h1, x2);
497 : #else
498 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
499 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
500 : #endif
501 225689600 : h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
502 : #ifdef __ELPA_USE_FMA__
503 112844800 : y1 = _AVX_FMA(q1, h2, y1);
504 112844800 : y2 = _AVX_FMA(q2, h2, y2);
505 : #else
506 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
507 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
508 : #endif
509 225689600 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
510 : #ifdef __ELPA_USE_FMA__
511 112844800 : z1 = _AVX_FMA(q1, h3, z1);
512 112844800 : z2 = _AVX_FMA(q2, h3, z2);
513 : #else
514 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
515 : z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
516 : #endif
517 225689600 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
518 : #ifdef __ELPA_USE_FMA__
519 112844800 : w1 = _AVX_FMA(q1, h4, w1);
520 112844800 : w2 = _AVX_FMA(q2, h4, w2);
521 : #else
522 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
523 : w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
524 : #endif
525 225689600 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
526 : #ifdef __ELPA_USE_FMA__
527 112844800 : v1 = _AVX_FMA(q1, h5, v1);
528 112844800 : v2 = _AVX_FMA(q2, h5, v2);
529 : #else
530 : v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
531 : v2 = _AVX_ADD(v2, _AVX_MUL(q2,h5));
532 : #endif
533 225689600 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
534 : #ifdef __ELPA_USE_FMA__
535 112844800 : t1 = _AVX_FMA(q1, h6, t1);
536 112844800 : t2 = _AVX_FMA(q2, h6, t2);
537 : #else
538 : t1 = _AVX_ADD(t1, _AVX_MUL(q1,h6));
539 : t2 = _AVX_ADD(t2, _AVX_MUL(q2,h6));
540 : #endif
541 : }
542 :
543 3891200 : h1 = _AVX_BROADCAST(&hh[nb-5]);
544 3891200 : q1 = _AVX_LOAD(&q[nb*ldq]);
545 3891200 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
546 : #ifdef __ELPA_USE_FMA__
547 1945600 : x1 = _AVX_FMA(q1, h1, x1);
548 1945600 : x2 = _AVX_FMA(q2, h1, x2);
549 : #else
550 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
551 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
552 : #endif
553 3891200 : h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
554 : #ifdef __ELPA_USE_FMA__
555 1945600 : y1 = _AVX_FMA(q1, h2, y1);
556 1945600 : y2 = _AVX_FMA(q2, h2, y2);
557 : #else
558 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
559 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
560 : #endif
561 3891200 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
562 : #ifdef __ELPA_USE_FMA__
563 1945600 : z1 = _AVX_FMA(q1, h3, z1);
564 1945600 : z2 = _AVX_FMA(q2, h3, z2);
565 : #else
566 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
567 : z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
568 : #endif
569 3891200 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
570 : #ifdef __ELPA_USE_FMA__
571 1945600 : w1 = _AVX_FMA(q1, h4, w1);
572 1945600 : w2 = _AVX_FMA(q2, h4, w2);
573 : #else
574 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
575 : w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
576 : #endif
577 3891200 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
578 : #ifdef __ELPA_USE_FMA__
579 1945600 : v1 = _AVX_FMA(q1, h5, v1);
580 1945600 : v2 = _AVX_FMA(q2, h5, v2);
581 : #else
582 : v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
583 : v2 = _AVX_ADD(v2, _AVX_MUL(q2,h5));
584 : #endif
585 :
586 3891200 : h1 = _AVX_BROADCAST(&hh[nb-4]);
587 3891200 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
588 3891200 : q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
589 : #ifdef __ELPA_USE_FMA__
590 1945600 : x1 = _AVX_FMA(q1, h1, x1);
591 1945600 : x2 = _AVX_FMA(q2, h1, x2);
592 : #else
593 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
594 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
595 : #endif
596 3891200 : h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
597 : #ifdef __ELPA_USE_FMA__
598 1945600 : y1 = _AVX_FMA(q1, h2, y1);
599 1945600 : y2 = _AVX_FMA(q2, h2, y2);
600 : #else
601 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
602 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
603 : #endif
604 3891200 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
605 : #ifdef __ELPA_USE_FMA__
606 1945600 : z1 = _AVX_FMA(q1, h3, z1);
607 1945600 : z2 = _AVX_FMA(q2, h3, z2);
608 : #else
609 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
610 : z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
611 : #endif
612 3891200 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
613 : #ifdef __ELPA_USE_FMA__
614 1945600 : w1 = _AVX_FMA(q1, h4, w1);
615 1945600 : w2 = _AVX_FMA(q2, h4, w2);
616 : #else
617 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
618 : w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
619 : #endif
620 :
621 3891200 : h1 = _AVX_BROADCAST(&hh[nb-3]);
622 3891200 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
623 3891200 : q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
624 : #ifdef __ELPA_USE_FMA__
625 1945600 : x1 = _AVX_FMA(q1, h1, x1);
626 1945600 : x2 = _AVX_FMA(q2, h1, x2);
627 : #else
628 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
629 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
630 : #endif
631 3891200 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
632 : #ifdef __ELPA_USE_FMA__
633 1945600 : y1 = _AVX_FMA(q1, h2, y1);
634 1945600 : y2 = _AVX_FMA(q2, h2, y2);
635 : #else
636 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
637 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
638 : #endif
639 3891200 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
640 : #ifdef __ELPA_USE_FMA__
641 1945600 : z1 = _AVX_FMA(q1, h3, z1);
642 1945600 : z2 = _AVX_FMA(q2, h3, z2);
643 : #else
644 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
645 : z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
646 : #endif
647 :
648 3891200 : h1 = _AVX_BROADCAST(&hh[nb-2]);
649 3891200 : q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
650 3891200 : q2 = _AVX_LOAD(&q[((nb+3)*ldq)+offset]);
651 : #ifdef __ELPA_USE_FMA__
652 1945600 : x1 = _AVX_FMA(q1, h1, x1);
653 1945600 : x2 = _AVX_FMA(q2, h1, x2);
654 : #else
655 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
656 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
657 : #endif
658 3891200 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
659 : #ifdef __ELPA_USE_FMA__
660 1945600 : y1 = _AVX_FMA(q1, h2, y1);
661 1945600 : y2 = _AVX_FMA(q2, h2, y2);
662 : #else
663 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
664 : y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
665 : #endif
666 :
667 3891200 : h1 = _AVX_BROADCAST(&hh[nb-1]);
668 3891200 : q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
669 3891200 : q2 = _AVX_LOAD(&q[((nb+4)*ldq)+offset]);
670 : #ifdef __ELPA_USE_FMA__
671 1945600 : x1 = _AVX_FMA(q1, h1, x1);
672 1945600 : x2 = _AVX_FMA(q2, h1, x2);
673 : #else
674 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
675 : x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
676 : #endif
677 :
678 : /////////////////////////////////////////////////////
679 : // Apply tau, correct wrong calculation using pre-calculated scalar products
680 : /////////////////////////////////////////////////////
681 :
682 1945600 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
683 1945600 : x1 = _AVX_MUL(x1, tau1);
684 1945600 : x2 = _AVX_MUL(x2, tau1);
685 :
686 3891200 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
687 1945600 : __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&scalarprods[0]);
688 1945600 : h2 = _AVX_MUL(tau2, vs_1_2);
689 : #ifdef __ELPA_USE_FMA__
690 3891200 : y1 = _AVX_FMSUB(y1, tau2, _AVX_MUL(x1,h2));
691 3891200 : y2 = _AVX_FMSUB(y2, tau2, _AVX_MUL(x2,h2));
692 : #else
693 : y1 = _AVX_SUB(_AVX_MUL(y1,tau2), _AVX_MUL(x1,h2));
694 : y2 = _AVX_SUB(_AVX_MUL(y2,tau2), _AVX_MUL(x2,h2));
695 : #endif
696 :
697 3891200 : __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
698 3891200 : __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&scalarprods[1]);
699 3891200 : __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&scalarprods[2]);
700 1945600 : h2 = _AVX_MUL(tau3, vs_1_3);
701 1945600 : h3 = _AVX_MUL(tau3, vs_2_3);
702 : #ifdef __ELPA_USE_FMA__
703 5836800 : z1 = _AVX_FMSUB(z1, tau3, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
704 5836800 : z2 = _AVX_FMSUB(z2, tau3, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)));
705 : #else
706 : z1 = _AVX_SUB(_AVX_MUL(z1,tau3), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
707 : z2 = _AVX_SUB(_AVX_MUL(z2,tau3), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)));
708 : #endif
709 :
710 3891200 : __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
711 3891200 : __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&scalarprods[3]);
712 3891200 : __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&scalarprods[4]);
713 1945600 : h2 = _AVX_MUL(tau4, vs_1_4);
714 1945600 : h3 = _AVX_MUL(tau4, vs_2_4);
715 3891200 : __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&scalarprods[5]);
716 1945600 : h4 = _AVX_MUL(tau4, vs_3_4);
717 : #ifdef __ELPA_USE_FMA__
718 7782400 : w1 = _AVX_FMSUB(w1, tau4, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
719 7782400 : w2 = _AVX_FMSUB(w2, tau4, _AVX_FMA(z2, h4, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
720 : #else
721 : w1 = _AVX_SUB(_AVX_MUL(w1,tau4), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
722 : w2 = _AVX_SUB(_AVX_MUL(w2,tau4), _AVX_ADD(_AVX_MUL(z2,h4), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
723 : #endif
724 :
725 3891200 : __AVX_DATATYPE tau5 = _AVX_BROADCAST(&hh[ldh*4]);
726 3891200 : __AVX_DATATYPE vs_1_5 = _AVX_BROADCAST(&scalarprods[6]);
727 3891200 : __AVX_DATATYPE vs_2_5 = _AVX_BROADCAST(&scalarprods[7]);
728 1945600 : h2 = _AVX_MUL(tau5, vs_1_5);
729 1945600 : h3 = _AVX_MUL(tau5, vs_2_5);
730 3891200 : __AVX_DATATYPE vs_3_5 = _AVX_BROADCAST(&scalarprods[8]);
731 3891200 : __AVX_DATATYPE vs_4_5 = _AVX_BROADCAST(&scalarprods[9]);
732 1945600 : h4 = _AVX_MUL(tau5, vs_3_5);
733 1945600 : h5 = _AVX_MUL(tau5, vs_4_5);
734 : #ifdef __ELPA_USE_FMA__
735 11673600 : v1 = _AVX_FMSUB(v1, tau5, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
736 11673600 : v2 = _AVX_FMSUB(v2, tau5, _AVX_ADD(_AVX_FMA(w2, h5, _AVX_MUL(z2,h4)), _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
737 : #else
738 : v1 = _AVX_SUB(_AVX_MUL(v1,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
739 : v2 = _AVX_SUB(_AVX_MUL(v2,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w2,h5), _AVX_MUL(z2,h4)), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
740 : #endif
741 :
742 3891200 : __AVX_DATATYPE tau6 = _AVX_BROADCAST(&hh[ldh*5]);
743 3891200 : __AVX_DATATYPE vs_1_6 = _AVX_BROADCAST(&scalarprods[10]);
744 3891200 : __AVX_DATATYPE vs_2_6 = _AVX_BROADCAST(&scalarprods[11]);
745 1945600 : h2 = _AVX_MUL(tau6, vs_1_6);
746 1945600 : h3 = _AVX_MUL(tau6, vs_2_6);
747 3891200 : __AVX_DATATYPE vs_3_6 = _AVX_BROADCAST(&scalarprods[12]);
748 3891200 : __AVX_DATATYPE vs_4_6 = _AVX_BROADCAST(&scalarprods[13]);
749 3891200 : __AVX_DATATYPE vs_5_6 = _AVX_BROADCAST(&scalarprods[14]);
750 1945600 : h4 = _AVX_MUL(tau6, vs_3_6);
751 1945600 : h5 = _AVX_MUL(tau6, vs_4_6);
752 1945600 : h6 = _AVX_MUL(tau6, vs_5_6);
753 : #ifdef __ELPA_USE_FMA__
754 13619200 : t1 = _AVX_FMSUB(t1, tau6, _AVX_FMA(v1, h6, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)))));
755 13619200 : t2 = _AVX_FMSUB(t2, tau6, _AVX_FMA(v2, h6, _AVX_ADD(_AVX_FMA(w2, h5, _AVX_MUL(z2,h4)), _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)))));
756 : #else
757 : t1 = _AVX_SUB(_AVX_MUL(t1,tau6), _AVX_ADD( _AVX_MUL(v1,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)))));
758 : t2 = _AVX_SUB(_AVX_MUL(t2,tau6), _AVX_ADD( _AVX_MUL(v2,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w2,h5), _AVX_MUL(z2,h4)), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)))));
759 : #endif
760 :
761 : /////////////////////////////////////////////////////
762 : // Rank-1 update of Q [8 x nb+3]
763 : /////////////////////////////////////////////////////
764 :
765 1945600 : q1 = _AVX_LOAD(&q[0]);
766 3891200 : q2 = _AVX_LOAD(&q[offset]);
767 1945600 : q1 = _AVX_SUB(q1, t1);
768 1945600 : q2 = _AVX_SUB(q2, t2);
769 : _AVX_STORE(&q[0],q1);
770 1945600 : _AVX_STORE(&q[offset],q2);
771 :
772 3891200 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
773 3891200 : q1 = _AVX_LOAD(&q[ldq]);
774 3891200 : q2 = _AVX_LOAD(&q[(ldq+offset)]);
775 1945600 : q1 = _AVX_SUB(q1, v1);
776 1945600 : q2 = _AVX_SUB(q2, v2);
777 : #ifdef __ELPA_USE_FMA__
778 1945600 : q1 = _AVX_NFMA(t1, h6, q1);
779 1945600 : q2 = _AVX_NFMA(t2, h6, q2);
780 : #else
781 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
782 : q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
783 : #endif
784 1945600 : _AVX_STORE(&q[ldq],q1);
785 1945600 : _AVX_STORE(&q[(ldq+offset)],q2);
786 :
787 3891200 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
788 3891200 : q1 = _AVX_LOAD(&q[ldq*2]);
789 3891200 : q2 = _AVX_LOAD(&q[(ldq*2)+offset]);
790 1945600 : q1 = _AVX_SUB(q1, w1);
791 1945600 : q2 = _AVX_SUB(q2, w2);
792 : #ifdef __ELPA_USE_FMA__
793 1945600 : q1 = _AVX_NFMA(v1, h5, q1);
794 1945600 : q2 = _AVX_NFMA(v2, h5, q2);
795 : #else
796 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
797 : q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
798 : #endif
799 3891200 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
800 : #ifdef __ELPA_USE_FMA__
801 1945600 : q1 = _AVX_NFMA(t1, h6, q1);
802 1945600 : q2 = _AVX_NFMA(t2, h6, q2);
803 : #else
804 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
805 : q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
806 : #endif
807 1945600 : _AVX_STORE(&q[ldq*2],q1);
808 1945600 : _AVX_STORE(&q[(ldq*2)+offset],q2);
809 :
810 3891200 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
811 3891200 : q1 = _AVX_LOAD(&q[ldq*3]);
812 3891200 : q2 = _AVX_LOAD(&q[(ldq*3)+offset]);
813 1945600 : q1 = _AVX_SUB(q1, z1);
814 1945600 : q2 = _AVX_SUB(q2, z2);
815 : #ifdef __ELPA_USE_FMA__
816 1945600 : q1 = _AVX_NFMA(w1, h4, q1);
817 1945600 : q2 = _AVX_NFMA(w2, h4, q2);
818 : #else
819 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
820 : q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
821 : #endif
822 3891200 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
823 : #ifdef __ELPA_USE_FMA__
824 1945600 : q1 = _AVX_NFMA(v1, h5, q1);
825 1945600 : q2 = _AVX_NFMA(v2, h5, q2);
826 : #else
827 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
828 : q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
829 : #endif
830 3891200 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
831 : #ifdef __ELPA_USE_FMA__
832 1945600 : q1 = _AVX_NFMA(t1, h6, q1);
833 1945600 : q2 = _AVX_NFMA(t2, h6, q2);
834 : #else
835 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
836 : q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
837 : #endif
838 1945600 : _AVX_STORE(&q[ldq*3],q1);
839 1945600 : _AVX_STORE(&q[(ldq*3)+offset],q2);
840 :
841 3891200 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
842 3891200 : q1 = _AVX_LOAD(&q[ldq*4]);
843 3891200 : q2 = _AVX_LOAD(&q[(ldq*4)+offset]);
844 1945600 : q1 = _AVX_SUB(q1, y1);
845 1945600 : q2 = _AVX_SUB(q2, y2);
846 : #ifdef __ELPA_USE_FMA__
847 1945600 : q1 = _AVX_NFMA(z1, h3, q1);
848 1945600 : q2 = _AVX_NFMA(z2, h3, q2);
849 : #else
850 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
851 : q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
852 : #endif
853 3891200 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
854 : #ifdef __ELPA_USE_FMA__
855 1945600 : q1 = _AVX_NFMA(w1, h4, q1);
856 1945600 : q2 = _AVX_NFMA(w2, h4, q2);
857 : #else
858 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
859 : q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
860 : #endif
861 3891200 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
862 : #ifdef __ELPA_USE_FMA__
863 1945600 : q1 = _AVX_NFMA(v1, h5, q1);
864 1945600 : q2 = _AVX_NFMA(v2, h5, q2);
865 : #else
866 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
867 : q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
868 : #endif
869 3891200 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
870 : #ifdef __ELPA_USE_FMA__
871 1945600 : q1 = _AVX_NFMA(t1, h6, q1);
872 1945600 : q2 = _AVX_NFMA(t2, h6, q2);
873 : #else
874 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
875 : q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
876 : #endif
877 1945600 : _AVX_STORE(&q[ldq*4],q1);
878 1945600 : _AVX_STORE(&q[(ldq*4)+offset],q2);
879 :
880 3891200 : h2 = _AVX_BROADCAST(&hh[(ldh)+1]);
881 3891200 : q1 = _AVX_LOAD(&q[ldq*5]);
882 3891200 : q2 = _AVX_LOAD(&q[(ldq*5)+offset]);
883 1945600 : q1 = _AVX_SUB(q1, x1);
884 1945600 : q2 = _AVX_SUB(q2, x2);
885 : #ifdef __ELPA_USE_FMA__
886 1945600 : q1 = _AVX_NFMA(y1, h2, q1);
887 1945600 : q2 = _AVX_NFMA(y2, h2, q2);
888 : #else
889 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
890 : q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
891 : #endif
892 3891200 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
893 : #ifdef __ELPA_USE_FMA__
894 1945600 : q1 = _AVX_NFMA(z1, h3, q1);
895 1945600 : q2 = _AVX_NFMA(z2, h3, q2);
896 : #else
897 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
898 : q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
899 : #endif
900 3891200 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
901 : #ifdef __ELPA_USE_FMA__
902 1945600 : q1 = _AVX_NFMA(w1, h4, q1);
903 1945600 : q2 = _AVX_NFMA(w2, h4, q2);
904 : #else
905 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
906 : q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
907 : #endif
908 3891200 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
909 : #ifdef __ELPA_USE_FMA__
910 1945600 : q1 = _AVX_NFMA(v1, h5, q1);
911 1945600 : q2 = _AVX_NFMA(v2, h5, q2);
912 : #else
913 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
914 : q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
915 : #endif
916 3891200 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
917 : #ifdef __ELPA_USE_FMA__
918 1945600 : q1 = _AVX_NFMA(t1, h6, q1);
919 1945600 : q2 = _AVX_NFMA(t2, h6, q2);
920 : #else
921 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
922 : q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
923 : #endif
924 1945600 : _AVX_STORE(&q[ldq*5],q1);
925 1945600 : _AVX_STORE(&q[(ldq*5)+offset],q2);
926 :
927 114790400 : for (i = 6; i < nb; i++)
928 : {
929 225689600 : q1 = _AVX_LOAD(&q[i*ldq]);
930 225689600 : q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
931 225689600 : h1 = _AVX_BROADCAST(&hh[i-5]);
932 : #ifdef __ELPA_USE_FMA__
933 112844800 : q1 = _AVX_NFMA(x1, h1, q1);
934 112844800 : q2 = _AVX_NFMA(x2, h1, q2);
935 : #else
936 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
937 : q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
938 : #endif
939 225689600 : h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
940 : #ifdef __ELPA_USE_FMA__
941 112844800 : q1 = _AVX_NFMA(y1, h2, q1);
942 112844800 : q2 = _AVX_NFMA(y2, h2, q2);
943 : #else
944 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
945 : q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
946 : #endif
947 225689600 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
948 : #ifdef __ELPA_USE_FMA__
949 112844800 : q1 = _AVX_NFMA(z1, h3, q1);
950 112844800 : q2 = _AVX_NFMA(z2, h3, q2);
951 : #else
952 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
953 : q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
954 : #endif
955 225689600 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
956 : #ifdef __ELPA_USE_FMA__
957 112844800 : q1 = _AVX_NFMA(w1, h4, q1);
958 112844800 : q2 = _AVX_NFMA(w2, h4, q2);
959 : #else
960 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
961 : q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
962 : #endif
963 225689600 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
964 : #ifdef __ELPA_USE_FMA__
965 112844800 : q1 = _AVX_NFMA(v1, h5, q1);
966 112844800 : q2 = _AVX_NFMA(v2, h5, q2);
967 : #else
968 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
969 : q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
970 : #endif
971 225689600 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
972 : #ifdef __ELPA_USE_FMA__
973 112844800 : q1 = _AVX_NFMA(t1, h6, q1);
974 112844800 : q2 = _AVX_NFMA(t2, h6, q2);
975 : #else
976 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
977 : q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
978 : #endif
979 112844800 : _AVX_STORE(&q[i*ldq],q1);
980 112844800 : _AVX_STORE(&q[(i*ldq)+offset],q2);
981 : }
982 :
983 3891200 : h1 = _AVX_BROADCAST(&hh[nb-5]);
984 3891200 : q1 = _AVX_LOAD(&q[nb*ldq]);
985 3891200 : q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
986 : #ifdef __ELPA_USE_FMA__
987 1945600 : q1 = _AVX_NFMA(x1, h1, q1);
988 1945600 : q2 = _AVX_NFMA(x2, h1, q2);
989 : #else
990 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
991 : q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
992 : #endif
993 3891200 : h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
994 : #ifdef __ELPA_USE_FMA__
995 1945600 : q1 = _AVX_NFMA(y1, h2, q1);
996 1945600 : q2 = _AVX_NFMA(y2, h2, q2);
997 : #else
998 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
999 : q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
1000 : #endif
1001 3891200 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
1002 : #ifdef __ELPA_USE_FMA__
1003 1945600 : q1 = _AVX_NFMA(z1, h3, q1);
1004 1945600 : q2 = _AVX_NFMA(z2, h3, q2);
1005 : #else
1006 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1007 : q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
1008 : #endif
1009 3891200 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
1010 : #ifdef __ELPA_USE_FMA__
1011 1945600 : q1 = _AVX_NFMA(w1, h4, q1);
1012 1945600 : q2 = _AVX_NFMA(w2, h4, q2);
1013 : #else
1014 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
1015 : q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
1016 : #endif
1017 3891200 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
1018 : #ifdef __ELPA_USE_FMA__
1019 1945600 : q1 = _AVX_NFMA(v1, h5, q1);
1020 1945600 : q2 = _AVX_NFMA(v2, h5, q2);
1021 : #else
1022 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
1023 : q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
1024 : #endif
1025 1945600 : _AVX_STORE(&q[nb*ldq],q1);
1026 1945600 : _AVX_STORE(&q[(nb*ldq)+offset],q2);
1027 :
1028 3891200 : h1 = _AVX_BROADCAST(&hh[nb-4]);
1029 3891200 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
1030 3891200 : q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
1031 : #ifdef __ELPA_USE_FMA__
1032 1945600 : q1 = _AVX_NFMA(x1, h1, q1);
1033 1945600 : q2 = _AVX_NFMA(x2, h1, q2);
1034 : #else
1035 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1036 : q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
1037 : #endif
1038 3891200 : h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
1039 : #ifdef __ELPA_USE_FMA__
1040 1945600 : q1 = _AVX_NFMA(y1, h2, q1);
1041 1945600 : q2 = _AVX_NFMA(y2, h2, q2);
1042 : #else
1043 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1044 : q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
1045 : #endif
1046 3891200 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
1047 : #ifdef __ELPA_USE_FMA__
1048 1945600 : q1 = _AVX_NFMA(z1, h3, q1);
1049 1945600 : q2 = _AVX_NFMA(z2, h3, q2);
1050 : #else
1051 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1052 : q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
1053 : #endif
1054 3891200 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
1055 : #ifdef __ELPA_USE_FMA__
1056 1945600 : q1 = _AVX_NFMA(w1, h4, q1);
1057 1945600 : q2 = _AVX_NFMA(w2, h4, q2);
1058 : #else
1059 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
1060 : q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
1061 : #endif
1062 1945600 : _AVX_STORE(&q[(nb+1)*ldq],q1);
1063 1945600 : _AVX_STORE(&q[((nb+1)*ldq)+offset],q2);
1064 :
1065 3891200 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1066 3891200 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1067 3891200 : q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
1068 : #ifdef __ELPA_USE_FMA__
1069 1945600 : q1 = _AVX_NFMA(x1, h1, q1);
1070 1945600 : q2 = _AVX_NFMA(x2, h1, q2);
1071 : #else
1072 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1073 : q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
1074 : #endif
1075 3891200 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1076 : #ifdef __ELPA_USE_FMA__
1077 1945600 : q1 = _AVX_NFMA(y1, h2, q1);
1078 1945600 : q2 = _AVX_NFMA(y2, h2, q2);
1079 : #else
1080 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1081 : q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
1082 : #endif
1083 3891200 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1084 : #ifdef __ELPA_USE_FMA__
1085 1945600 : q1 = _AVX_NFMA(z1, h3, q1);
1086 1945600 : q2 = _AVX_NFMA(z2, h3, q2);
1087 : #else
1088 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1089 : q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
1090 : #endif
1091 1945600 : _AVX_STORE(&q[(nb+2)*ldq],q1);
1092 1945600 : _AVX_STORE(&q[((nb+2)*ldq)+offset],q2);
1093 :
1094 3891200 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1095 3891200 : q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
1096 3891200 : q2 = _AVX_LOAD(&q[((nb+3)*ldq)+offset]);
1097 : #ifdef __ELPA_USE_FMA__
1098 1945600 : q1 = _AVX_NFMA(x1, h1, q1);
1099 1945600 : q2 = _AVX_NFMA(x2, h1, q2);
1100 : #else
1101 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1102 : q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
1103 : #endif
1104 3891200 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
1105 : #ifdef __ELPA_USE_FMA__
1106 1945600 : q1 = _AVX_NFMA(y1, h2, q1);
1107 1945600 : q2 = _AVX_NFMA(y2, h2, q2);
1108 : #else
1109 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1110 : q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
1111 : #endif
1112 1945600 : _AVX_STORE(&q[(nb+3)*ldq],q1);
1113 1945600 : _AVX_STORE(&q[((nb+3)*ldq)+offset],q2);
1114 :
1115 3891200 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1116 3891200 : q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
1117 3891200 : q2 = _AVX_LOAD(&q[((nb+4)*ldq)+offset]);
1118 : #ifdef __ELPA_USE_FMA__
1119 1945600 : q1 = _AVX_NFMA(x1, h1, q1);
1120 1945600 : q2 = _AVX_NFMA(x2, h1, q2);
1121 : #else
1122 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1123 : q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
1124 : #endif
1125 1945600 : _AVX_STORE(&q[(nb+4)*ldq],q1);
1126 1945600 : _AVX_STORE(&q[((nb+4)*ldq)+offset],q2);
1127 : }
1128 :
1129 : /**
1130 : * Unrolled kernel that computes
1131 : #ifdef DOUBLE_PRECISION_REAL
1132 : * 4 rows of Q simultaneously, a
1133 : #endif
1134 : #ifdef SINGLE_PRECISION_REAL
1135 : * 8 rows of Q simultaneously, a
1136 : #endif
1137 :
1138 : * matrix Vector product with six householder
1139 : * vectors + a rank 1 update is performed
1140 : */
1141 : #ifdef DOUBLE_PRECISION_REAL
1142 : __forceinline void hh_trafo_kernel_4_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
1143 : #endif
1144 : #ifdef SINGLE_PRECISION_REAL
1145 : __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
1146 : #endif
1147 : {
1148 : /////////////////////////////////////////////////////
1149 : // Matrix Vector Multiplication, Q [8 x nb+5] * hh
1150 : // hh contains six householder vectors
1151 : /////////////////////////////////////////////////////
1152 : int i;
1153 :
1154 0 : __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*5]);
1155 0 : __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*4]);
1156 0 : __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq*3]);
1157 0 : __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[ldq*2]);
1158 0 : __AVX_DATATYPE a5_1 = _AVX_LOAD(&q[ldq]);
1159 0 : __AVX_DATATYPE a6_1 = _AVX_LOAD(&q[0]);
1160 :
1161 0 : __AVX_DATATYPE h_6_5 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
1162 0 : __AVX_DATATYPE h_6_4 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
1163 0 : __AVX_DATATYPE h_6_3 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
1164 0 : __AVX_DATATYPE h_6_2 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
1165 0 : __AVX_DATATYPE h_6_1 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
1166 : #ifdef __ELPA_USE_FMA__
1167 0 : register __AVX_DATATYPE t1 = _AVX_FMA(a5_1, h_6_5, a6_1);
1168 0 : t1 = _AVX_FMA(a4_1, h_6_4, t1);
1169 0 : t1 = _AVX_FMA(a3_1, h_6_3, t1);
1170 0 : t1 = _AVX_FMA(a2_1, h_6_2, t1);
1171 0 : t1 = _AVX_FMA(a1_1, h_6_1, t1);
1172 : #else
1173 : register __AVX_DATATYPE t1 = _AVX_ADD(a6_1, _AVX_MUL(a5_1, h_6_5));
1174 : t1 = _AVX_ADD(t1, _AVX_MUL(a4_1, h_6_4));
1175 : t1 = _AVX_ADD(t1, _AVX_MUL(a3_1, h_6_3));
1176 : t1 = _AVX_ADD(t1, _AVX_MUL(a2_1, h_6_2));
1177 : t1 = _AVX_ADD(t1, _AVX_MUL(a1_1, h_6_1));
1178 : #endif
1179 0 : __AVX_DATATYPE h_5_4 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
1180 0 : __AVX_DATATYPE h_5_3 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
1181 0 : __AVX_DATATYPE h_5_2 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
1182 0 : __AVX_DATATYPE h_5_1 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
1183 : #ifdef __ELPA_USE_FMA__
1184 0 : register __AVX_DATATYPE v1 = _AVX_FMA(a4_1, h_5_4, a5_1);
1185 0 : v1 = _AVX_FMA(a3_1, h_5_3, v1);
1186 0 : v1 = _AVX_FMA(a2_1, h_5_2, v1);
1187 0 : v1 = _AVX_FMA(a1_1, h_5_1, v1);
1188 : #else
1189 : register __AVX_DATATYPE v1 = _AVX_ADD(a5_1, _AVX_MUL(a4_1, h_5_4));
1190 : v1 = _AVX_ADD(v1, _AVX_MUL(a3_1, h_5_3));
1191 : v1 = _AVX_ADD(v1, _AVX_MUL(a2_1, h_5_2));
1192 : v1 = _AVX_ADD(v1, _AVX_MUL(a1_1, h_5_1));
1193 : #endif
1194 0 : __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
1195 0 : __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
1196 0 : __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
1197 : #ifdef __ELPA_USE_FMA__
1198 0 : register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
1199 0 : w1 = _AVX_FMA(a2_1, h_4_2, w1);
1200 0 : w1 = _AVX_FMA(a1_1, h_4_1, w1);
1201 : #else
1202 : register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
1203 : w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
1204 : w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
1205 : #endif
1206 0 : __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
1207 0 : __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
1208 0 : __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
1209 : #ifdef __ELPA_USE_FMA__
1210 0 : register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
1211 0 : z1 = _AVX_FMA(a1_1, h_3_1, z1);
1212 0 : register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
1213 : #else
1214 : register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
1215 : z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
1216 : register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
1217 : #endif
1218 0 : register __AVX_DATATYPE x1 = a1_1;
1219 :
1220 : __AVX_DATATYPE q1;
1221 :
1222 : __AVX_DATATYPE h1;
1223 : __AVX_DATATYPE h2;
1224 : __AVX_DATATYPE h3;
1225 : __AVX_DATATYPE h4;
1226 : __AVX_DATATYPE h5;
1227 : __AVX_DATATYPE h6;
1228 :
1229 0 : for(i = 6; i < nb; i++)
1230 : {
1231 0 : h1 = _AVX_BROADCAST(&hh[i-5]);
1232 0 : q1 = _AVX_LOAD(&q[i*ldq]);
1233 : #ifdef __ELPA_USE_FMA__
1234 0 : x1 = _AVX_FMA(q1, h1, x1);
1235 : #else
1236 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1237 : #endif
1238 0 : h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
1239 : #ifdef __ELPA_USE_FMA__
1240 0 : y1 = _AVX_FMA(q1, h2, y1);
1241 : #else
1242 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1243 : #endif
1244 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
1245 : #ifdef __ELPA_USE_FMA__
1246 0 : z1 = _AVX_FMA(q1, h3, z1);
1247 : #else
1248 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1249 : #endif
1250 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
1251 : #ifdef __ELPA_USE_FMA__
1252 0 : w1 = _AVX_FMA(q1, h4, w1);
1253 : #else
1254 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
1255 : #endif
1256 0 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
1257 : #ifdef __ELPA_USE_FMA__
1258 0 : v1 = _AVX_FMA(q1, h5, v1);
1259 : #else
1260 : v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
1261 : #endif
1262 0 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
1263 : #ifdef __ELPA_USE_FMA__
1264 0 : t1 = _AVX_FMA(q1, h6, t1);
1265 : #else
1266 : t1 = _AVX_ADD(t1, _AVX_MUL(q1,h6));
1267 : #endif
1268 : }
1269 :
1270 0 : h1 = _AVX_BROADCAST(&hh[nb-5]);
1271 0 : q1 = _AVX_LOAD(&q[nb*ldq]);
1272 : #ifdef __ELPA_USE_FMA__
1273 0 : x1 = _AVX_FMA(q1, h1, x1);
1274 : #else
1275 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1276 : #endif
1277 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
1278 : #ifdef __ELPA_USE_FMA__
1279 0 : y1 = _AVX_FMA(q1, h2, y1);
1280 : #else
1281 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1282 : #endif
1283 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
1284 : #ifdef __ELPA_USE_FMA__
1285 0 : z1 = _AVX_FMA(q1, h3, z1);
1286 : #else
1287 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1288 : #endif
1289 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
1290 : #ifdef __ELPA_USE_FMA__
1291 0 : w1 = _AVX_FMA(q1, h4, w1);
1292 : #else
1293 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
1294 : #endif
1295 0 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
1296 : #ifdef __ELPA_USE_FMA__
1297 0 : v1 = _AVX_FMA(q1, h5, v1);
1298 : #else
1299 : v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
1300 : #endif
1301 :
1302 0 : h1 = _AVX_BROADCAST(&hh[nb-4]);
1303 0 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
1304 : #ifdef __ELPA_USE_FMA__
1305 0 : x1 = _AVX_FMA(q1, h1, x1);
1306 : #else
1307 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1308 : #endif
1309 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
1310 : #ifdef __ELPA_USE_FMA__
1311 0 : y1 = _AVX_FMA(q1, h2, y1);
1312 : #else
1313 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1314 : #endif
1315 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
1316 : #ifdef __ELPA_USE_FMA__
1317 0 : z1 = _AVX_FMA(q1, h3, z1);
1318 : #else
1319 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1320 : #endif
1321 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
1322 : #ifdef __ELPA_USE_FMA__
1323 0 : w1 = _AVX_FMA(q1, h4, w1);
1324 : #else
1325 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
1326 : #endif
1327 :
1328 0 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1329 0 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1330 : #ifdef __ELPA_USE_FMA__
1331 0 : x1 = _AVX_FMA(q1, h1, x1);
1332 : #else
1333 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1334 : #endif
1335 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1336 : #ifdef __ELPA_USE_FMA__
1337 0 : y1 = _AVX_FMA(q1, h2, y1);
1338 : #else
1339 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1340 : #endif
1341 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1342 : #ifdef __ELPA_USE_FMA__
1343 0 : z1 = _AVX_FMA(q1, h3, z1);
1344 : #else
1345 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1346 : #endif
1347 :
1348 0 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1349 0 : q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
1350 : #ifdef __ELPA_USE_FMA__
1351 0 : x1 = _AVX_FMA(q1, h1, x1);
1352 : #else
1353 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1354 : #endif
1355 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
1356 : #ifdef __ELPA_USE_FMA__
1357 0 : y1 = _AVX_FMA(q1, h2, y1);
1358 : #else
1359 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1360 : #endif
1361 :
1362 0 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1363 0 : q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
1364 : #ifdef __ELPA_USE_FMA__
1365 0 : x1 = _AVX_FMA(q1, h1, x1);
1366 : #else
1367 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1368 : #endif
1369 :
1370 : /////////////////////////////////////////////////////
1371 : // Apply tau, correct wrong calculation using pre-calculated scalar products
1372 : /////////////////////////////////////////////////////
1373 :
1374 0 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
1375 0 : x1 = _AVX_MUL(x1, tau1);
1376 :
1377 0 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
1378 0 : __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&scalarprods[0]);
1379 0 : h2 = _AVX_MUL(tau2, vs_1_2);
1380 : #ifdef __ELPA_USE_FMA__
1381 0 : y1 = _AVX_FMSUB(y1, tau2, _AVX_MUL(x1,h2));
1382 : #else
1383 : y1 = _AVX_SUB(_AVX_MUL(y1,tau2), _AVX_MUL(x1,h2));
1384 : #endif
1385 :
1386 0 : __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
1387 0 : __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&scalarprods[1]);
1388 0 : __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&scalarprods[2]);
1389 0 : h2 = _AVX_MUL(tau3, vs_1_3);
1390 0 : h3 = _AVX_MUL(tau3, vs_2_3);
1391 : #ifdef __ELPA_USE_FMA__
1392 0 : z1 = _AVX_FMSUB(z1, tau3, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
1393 : #else
1394 : z1 = _AVX_SUB(_AVX_MUL(z1,tau3), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
1395 : #endif
1396 :
1397 0 : __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
1398 0 : __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&scalarprods[3]);
1399 0 : __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&scalarprods[4]);
1400 0 : h2 = _AVX_MUL(tau4, vs_1_4);
1401 0 : h3 = _AVX_MUL(tau4, vs_2_4);
1402 0 : __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&scalarprods[5]);
1403 0 : h4 = _AVX_MUL(tau4, vs_3_4);
1404 : #ifdef __ELPA_USE_FMA__
1405 0 : w1 = _AVX_FMSUB(w1, tau4, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
1406 : #else
1407 : w1 = _AVX_SUB(_AVX_MUL(w1,tau4), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
1408 : #endif
1409 :
1410 0 : __AVX_DATATYPE tau5 = _AVX_BROADCAST(&hh[ldh*4]);
1411 0 : __AVX_DATATYPE vs_1_5 = _AVX_BROADCAST(&scalarprods[6]);
1412 0 : __AVX_DATATYPE vs_2_5 = _AVX_BROADCAST(&scalarprods[7]);
1413 0 : h2 = _AVX_MUL(tau5, vs_1_5);
1414 0 : h3 = _AVX_MUL(tau5, vs_2_5);
1415 0 : __AVX_DATATYPE vs_3_5 = _AVX_BROADCAST(&scalarprods[8]);
1416 0 : __AVX_DATATYPE vs_4_5 = _AVX_BROADCAST(&scalarprods[9]);
1417 0 : h4 = _AVX_MUL(tau5, vs_3_5);
1418 0 : h5 = _AVX_MUL(tau5, vs_4_5);
1419 : #ifdef __ELPA_USE_FMA__
1420 0 : v1 = _AVX_FMSUB(v1, tau5, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
1421 : #else
1422 : v1 = _AVX_SUB(_AVX_MUL(v1,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
1423 : #endif
1424 :
1425 0 : __AVX_DATATYPE tau6 = _AVX_BROADCAST(&hh[ldh*5]);
1426 0 : __AVX_DATATYPE vs_1_6 = _AVX_BROADCAST(&scalarprods[10]);
1427 0 : __AVX_DATATYPE vs_2_6 = _AVX_BROADCAST(&scalarprods[11]);
1428 0 : h2 = _AVX_MUL(tau6, vs_1_6);
1429 0 : h3 = _AVX_MUL(tau6, vs_2_6);
1430 0 : __AVX_DATATYPE vs_3_6 = _AVX_BROADCAST(&scalarprods[12]);
1431 0 : __AVX_DATATYPE vs_4_6 = _AVX_BROADCAST(&scalarprods[13]);
1432 0 : __AVX_DATATYPE vs_5_6 = _AVX_BROADCAST(&scalarprods[14]);
1433 0 : h4 = _AVX_MUL(tau6, vs_3_6);
1434 0 : h5 = _AVX_MUL(tau6, vs_4_6);
1435 0 : h6 = _AVX_MUL(tau6, vs_5_6);
1436 : #ifdef __ELPA_USE_FMA__
1437 0 : t1 = _AVX_FMSUB(t1, tau6, _AVX_FMA(v1, h6, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)))));
1438 : #else
1439 : t1 = _AVX_SUB(_AVX_MUL(t1,tau6), _AVX_ADD( _AVX_MUL(v1,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)))));
1440 : #endif
1441 :
1442 : /////////////////////////////////////////////////////
1443 : // Rank-1 update of Q [4 x nb+5]
1444 : /////////////////////////////////////////////////////
1445 :
1446 0 : q1 = _AVX_LOAD(&q[0]);
1447 0 : q1 = _AVX_SUB(q1, t1);
1448 : _AVX_STORE(&q[0],q1);
1449 :
1450 0 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
1451 0 : q1 = _AVX_LOAD(&q[ldq]);
1452 0 : q1 = _AVX_SUB(q1, v1);
1453 : #ifdef __ELPA_USE_FMA__
1454 0 : q1 = _AVX_NFMA(t1, h6, q1);
1455 : #else
1456 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
1457 : #endif
1458 0 : _AVX_STORE(&q[ldq],q1);
1459 :
1460 0 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
1461 0 : q1 = _AVX_LOAD(&q[ldq*2]);
1462 0 : q1 = _AVX_SUB(q1, w1);
1463 : #ifdef __ELPA_USE_FMA__
1464 0 : q1 = _AVX_NFMA(v1, h5, q1);
1465 : #else
1466 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
1467 : #endif
1468 0 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
1469 : #ifdef __ELPA_USE_FMA__
1470 0 : q1 = _AVX_NFMA(t1, h6, q1);
1471 : #else
1472 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
1473 : #endif
1474 0 : _AVX_STORE(&q[ldq*2],q1);
1475 :
1476 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
1477 0 : q1 = _AVX_LOAD(&q[ldq*3]);
1478 0 : q1 = _AVX_SUB(q1, z1);
1479 : #ifdef __ELPA_USE_FMA__
1480 0 : q1 = _AVX_NFMA(w1, h4, q1);
1481 : #else
1482 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
1483 : #endif
1484 0 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
1485 : #ifdef __ELPA_USE_FMA__
1486 0 : q1 = _AVX_NFMA(v1, h5, q1);
1487 : #else
1488 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
1489 : #endif
1490 0 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
1491 : #ifdef __ELPA_USE_FMA__
1492 0 : q1 = _AVX_NFMA(t1, h6, q1);
1493 : #else
1494 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
1495 : #endif
1496 0 : _AVX_STORE(&q[ldq*3],q1);
1497 :
1498 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
1499 0 : q1 = _AVX_LOAD(&q[ldq*4]);
1500 0 : q1 = _AVX_SUB(q1, y1);
1501 : #ifdef __ELPA_USE_FMA__
1502 0 : q1 = _AVX_NFMA(z1, h3, q1);
1503 : #else
1504 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1505 : #endif
1506 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
1507 : #ifdef __ELPA_USE_FMA__
1508 0 : q1 = _AVX_NFMA(w1, h4, q1);
1509 : #else
1510 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
1511 : #endif
1512 0 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
1513 : #ifdef __ELPA_USE_FMA__
1514 0 : q1 = _AVX_NFMA(v1, h5, q1);
1515 : #else
1516 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
1517 : #endif
1518 0 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
1519 : #ifdef __ELPA_USE_FMA__
1520 0 : q1 = _AVX_NFMA(t1, h6, q1);
1521 : #else
1522 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
1523 : #endif
1524 0 : _AVX_STORE(&q[ldq*4],q1);
1525 :
1526 0 : h2 = _AVX_BROADCAST(&hh[(ldh)+1]);
1527 0 : q1 = _AVX_LOAD(&q[ldq*5]);
1528 0 : q1 = _AVX_SUB(q1, x1);
1529 : #ifdef __ELPA_USE_FMA__
1530 0 : q1 = _AVX_NFMA(y1, h2, q1);
1531 : #else
1532 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1533 : #endif
1534 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
1535 : #ifdef __ELPA_USE_FMA__
1536 0 : q1 = _AVX_NFMA(z1, h3, q1);
1537 : #else
1538 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1539 : #endif
1540 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
1541 : #ifdef __ELPA_USE_FMA__
1542 0 : q1 = _AVX_NFMA(w1, h4, q1);
1543 : #else
1544 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
1545 : #endif
1546 0 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
1547 : #ifdef __ELPA_USE_FMA__
1548 0 : q1 = _AVX_NFMA(v1, h5, q1);
1549 : #else
1550 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
1551 : #endif
1552 0 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
1553 : #ifdef __ELPA_USE_FMA__
1554 0 : q1 = _AVX_NFMA(t1, h6, q1);
1555 : #else
1556 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
1557 : #endif
1558 0 : _AVX_STORE(&q[ldq*5],q1);
1559 :
1560 0 : for (i = 6; i < nb; i++)
1561 : {
1562 0 : q1 = _AVX_LOAD(&q[i*ldq]);
1563 0 : h1 = _AVX_BROADCAST(&hh[i-5]);
1564 : #ifdef __ELPA_USE_FMA__
1565 0 : q1 = _AVX_NFMA(x1, h1, q1);
1566 : #else
1567 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1568 : #endif
1569 0 : h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
1570 : #ifdef __ELPA_USE_FMA__
1571 0 : q1 = _AVX_NFMA(y1, h2, q1);
1572 : #else
1573 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1574 : #endif
1575 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
1576 : #ifdef __ELPA_USE_FMA__
1577 0 : q1 = _AVX_NFMA(z1, h3, q1);
1578 : #else
1579 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1580 : #endif
1581 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
1582 : #ifdef __ELPA_USE_FMA__
1583 0 : q1 = _AVX_NFMA(w1, h4, q1);
1584 : #else
1585 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
1586 : #endif
1587 0 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
1588 : #ifdef __ELPA_USE_FMA__
1589 0 : q1 = _AVX_NFMA(v1, h5, q1);
1590 : #else
1591 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
1592 : #endif
1593 0 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
1594 : #ifdef __ELPA_USE_FMA__
1595 0 : q1 = _AVX_NFMA(t1, h6, q1);
1596 : #else
1597 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
1598 : #endif
1599 0 : _AVX_STORE(&q[i*ldq],q1);
1600 : }
1601 :
1602 0 : h1 = _AVX_BROADCAST(&hh[nb-5]);
1603 0 : q1 = _AVX_LOAD(&q[nb*ldq]);
1604 : #ifdef __ELPA_USE_FMA__
1605 0 : q1 = _AVX_NFMA(x1, h1, q1);
1606 : #else
1607 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1608 : #endif
1609 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
1610 : #ifdef __ELPA_USE_FMA__
1611 0 : q1 = _AVX_NFMA(y1, h2, q1);
1612 : #else
1613 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1614 : #endif
1615 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
1616 : #ifdef __ELPA_USE_FMA__
1617 0 : q1 = _AVX_NFMA(z1, h3, q1);
1618 : #else
1619 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1620 : #endif
1621 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
1622 : #ifdef __ELPA_USE_FMA__
1623 0 : q1 = _AVX_NFMA(w1, h4, q1);
1624 : #else
1625 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
1626 : #endif
1627 0 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
1628 : #ifdef __ELPA_USE_FMA__
1629 0 : q1 = _AVX_NFMA(v1, h5, q1);
1630 : #else
1631 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
1632 : #endif
1633 0 : _AVX_STORE(&q[nb*ldq],q1);
1634 :
1635 0 : h1 = _AVX_BROADCAST(&hh[nb-4]);
1636 0 : q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
1637 : #ifdef __ELPA_USE_FMA__
1638 0 : q1 = _AVX_NFMA(x1, h1, q1);
1639 : #else
1640 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1641 : #endif
1642 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
1643 : #ifdef __ELPA_USE_FMA__
1644 0 : q1 = _AVX_NFMA(y1, h2, q1);
1645 : #else
1646 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1647 : #endif
1648 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
1649 : #ifdef __ELPA_USE_FMA__
1650 0 : q1 = _AVX_NFMA(z1, h3, q1);
1651 : #else
1652 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1653 : #endif
1654 0 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
1655 : #ifdef __ELPA_USE_FMA__
1656 0 : q1 = _AVX_NFMA(w1, h4, q1);
1657 : #else
1658 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
1659 : #endif
1660 0 : _AVX_STORE(&q[(nb+1)*ldq],q1);
1661 :
1662 0 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1663 0 : q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
1664 : #ifdef __ELPA_USE_FMA__
1665 0 : q1 = _AVX_NFMA(x1, h1, q1);
1666 : #else
1667 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1668 : #endif
1669 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1670 : #ifdef __ELPA_USE_FMA__
1671 0 : q1 = _AVX_NFMA(y1, h2, q1);
1672 : #else
1673 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1674 : #endif
1675 0 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1676 : #ifdef __ELPA_USE_FMA__
1677 0 : q1 = _AVX_NFMA(z1, h3, q1);
1678 : #else
1679 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
1680 : #endif
1681 0 : _AVX_STORE(&q[(nb+2)*ldq],q1);
1682 :
1683 0 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1684 0 : q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
1685 : #ifdef __ELPA_USE_FMA__
1686 0 : q1 = _AVX_NFMA(x1, h1, q1);
1687 : #else
1688 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1689 : #endif
1690 0 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
1691 : #ifdef __ELPA_USE_FMA__
1692 0 : q1 = _AVX_NFMA(y1, h2, q1);
1693 : #else
1694 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
1695 : #endif
1696 0 : _AVX_STORE(&q[(nb+3)*ldq],q1);
1697 :
1698 0 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1699 0 : q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
1700 : #ifdef __ELPA_USE_FMA__
1701 0 : q1 = _AVX_NFMA(x1, h1, q1);
1702 : #else
1703 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
1704 : #endif
1705 0 : _AVX_STORE(&q[(nb+4)*ldq],q1);
1706 : }
1707 :
1708 : #if 0
1709 : #ifdef SINGLE_PRECISION_REAL
1710 : /**
1711 : * Unrolled kernel that computes
1712 : * 4 rows of Q simultaneously, a
1713 : * matrix Vector product with six householder
1714 : * vectors + a rank 1 update is performed
1715 : */
1716 : __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods) // applies 6 Householder vectors to 4 rows of q; scalarprods holds their 15 pairwise dot products
1717 : {
1718 : /////////////////////////////////////////////////////
1719 : // Matrix Vector Multiplication, Q [4 x nb+5] * hh
1720 : // hh contains six householder vectors
1721 : /////////////////////////////////////////////////////
1722 : int i;
1723 :
1724 : __AVX_DATATYPE a1_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*5]));
1725 : __AVX_DATATYPE a2_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*4]));
1726 : __AVX_DATATYPE a3_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*3]));
1727 : __AVX_DATATYPE a4_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*2]));
1728 : __AVX_DATATYPE a5_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq]));
1729 : // we just want to load 4 floats not 8
1730 : __AVX_DATATYPE a6_1 = _mm256_castps128_ps256(_mm_load_ps(&q[0])); // q(1,1) | q(2,1) | q(3,1) | q(4,1)
1731 :
1732 : __AVX_DATATYPE h_6_5 = _AVX_BROADCAST(&hh[(ldh*5)+1]); // row ldh*5 of hh: coefficients of the 6th Householder vector
1733 : __AVX_DATATYPE h_6_4 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
1734 : __AVX_DATATYPE h_6_3 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
1735 : __AVX_DATATYPE h_6_2 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
1736 : __AVX_DATATYPE h_6_1 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
1737 : #ifdef __ELPA_USE_FMA__
1738 : register __AVX_DATATYPE t1 = _AVX_FMA(a5_1, h_6_5, a6_1);
1739 : t1 = _AVX_FMA(a4_1, h_6_4, t1);
1740 : t1 = _AVX_FMA(a3_1, h_6_3, t1);
1741 : t1 = _AVX_FMA(a2_1, h_6_2, t1);
1742 : t1 = _AVX_FMA(a1_1, h_6_1, t1);
1743 : #else
1744 : register __AVX_DATATYPE t1 = _AVX_ADD(a6_1, _AVX_MUL(a5_1, h_6_5));
1745 : t1 = _AVX_ADD(t1, _AVX_MUL(a4_1, h_6_4));
1746 : t1 = _AVX_ADD(t1, _AVX_MUL(a3_1, h_6_3));
1747 : t1 = _AVX_ADD(t1, _AVX_MUL(a2_1, h_6_2));
1748 : t1 = _AVX_ADD(t1, _AVX_MUL(a1_1, h_6_1));
1749 : #endif
1750 : __AVX_DATATYPE h_5_4 = _AVX_BROADCAST(&hh[(ldh*4)+1]); // row ldh*4: 5th Householder vector
1751 : __AVX_DATATYPE h_5_3 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
1752 : __AVX_DATATYPE h_5_2 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
1753 : __AVX_DATATYPE h_5_1 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
1754 : #ifdef __ELPA_USE_FMA__
1755 : register __AVX_DATATYPE v1 = _AVX_FMA(a4_1, h_5_4, a5_1);
1756 : v1 = _AVX_FMA(a3_1, h_5_3, v1);
1757 : v1 = _AVX_FMA(a2_1, h_5_2, v1);
1758 : v1 = _AVX_FMA(a1_1, h_5_1, v1);
1759 : #else
1760 : register __AVX_DATATYPE v1 = _AVX_ADD(a5_1, _AVX_MUL(a4_1, h_5_4));
1761 : v1 = _AVX_ADD(v1, _AVX_MUL(a3_1, h_5_3));
1762 : v1 = _AVX_ADD(v1, _AVX_MUL(a2_1, h_5_2));
1763 : v1 = _AVX_ADD(v1, _AVX_MUL(a1_1, h_5_1));
1764 : #endif
1765 : __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]); // row ldh*3: 4th Householder vector
1766 : __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
1767 : __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
1768 : #ifdef __ELPA_USE_FMA__
1769 : register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
1770 : w1 = _AVX_FMA(a2_1, h_4_2, w1);
1771 : w1 = _AVX_FMA(a1_1, h_4_1, w1);
1772 : #else
1773 : register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
1774 : w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
1775 : w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
1776 : #endif
1777 : __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]); // rows ldh and ldh*2: 2nd and 3rd Householder vectors
1778 : __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
1779 : __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
1780 : #ifdef __ELPA_USE_FMA__
1781 : register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
1782 : z1 = _AVX_FMA(a1_1, h_3_1, z1);
1783 : register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
1784 : #else
1785 : register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
1786 : z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
1787 : register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
1788 : #endif
1789 : register __AVX_DATATYPE x1 = a1_1;
1790 :
1791 : __AVX_DATATYPE q1;
1792 :
1793 : __AVX_DATATYPE h1;
1794 : __AVX_DATATYPE h2;
1795 : __AVX_DATATYPE h3;
1796 : __AVX_DATATYPE h4;
1797 : __AVX_DATATYPE h5;
1798 : __AVX_DATATYPE h6;
1799 :
1800 : for(i = 6; i < nb; i++) // accumulate x1..t1 += q(:,i) * coefficient of each of the six vectors
1801 : {
1802 : h1 = _AVX_BROADCAST(&hh[i-5]);
1803 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq])); // q(1,i) | q(2,i) ... | q(4,i)
1804 : #ifdef __ELPA_USE_FMA__
1805 : x1 = _AVX_FMA(q1, h1, x1);
1806 : #else
1807 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1808 : #endif
1809 : h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
1810 : #ifdef __ELPA_USE_FMA__
1811 : y1 = _AVX_FMA(q1, h2, y1);
1812 : #else
1813 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1814 : #endif
1815 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
1816 : #ifdef __ELPA_USE_FMA__
1817 : z1 = _AVX_FMA(q1, h3, z1);
1818 : #else
1819 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1820 : #endif
1821 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
1822 : #ifdef __ELPA_USE_FMA__
1823 : w1 = _AVX_FMA(q1, h4, w1);
1824 : #else
1825 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
1826 : #endif
1827 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
1828 : #ifdef __ELPA_USE_FMA__
1829 : v1 = _AVX_FMA(q1, h5, v1);
1830 : #else
1831 : v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
1832 : #endif
1833 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
1834 : #ifdef __ELPA_USE_FMA__
1835 : t1 = _AVX_FMA(q1, h6, t1);
1836 : #else
1837 : t1 = _AVX_ADD(t1, _AVX_MUL(q1,h6));
1838 : #endif
1839 : }
1840 :
1841 : h1 = _AVX_BROADCAST(&hh[nb-5]); // tail columns nb..nb+4: each vector is one element shorter than the previous
1842 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq]));
1843 : #ifdef __ELPA_USE_FMA__
1844 : x1 = _AVX_FMA(q1, h1, x1);
1845 : #else
1846 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1847 : #endif
1848 : h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
1849 : #ifdef __ELPA_USE_FMA__
1850 : y1 = _AVX_FMA(q1, h2, y1);
1851 : #else
1852 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1853 : #endif
1854 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
1855 : #ifdef __ELPA_USE_FMA__
1856 : z1 = _AVX_FMA(q1, h3, z1);
1857 : #else
1858 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1859 : #endif
1860 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
1861 : #ifdef __ELPA_USE_FMA__
1862 : w1 = _AVX_FMA(q1, h4, w1);
1863 : #else
1864 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
1865 : #endif
1866 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
1867 : #ifdef __ELPA_USE_FMA__
1868 : v1 = _AVX_FMA(q1, h5, v1);
1869 : #else
1870 : v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
1871 : #endif
1872 :
1873 : h1 = _AVX_BROADCAST(&hh[nb-4]);
1874 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+1)*ldq]));
1875 : #ifdef __ELPA_USE_FMA__
1876 : x1 = _AVX_FMA(q1, h1, x1);
1877 : #else
1878 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1879 : #endif
1880 : h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
1881 : #ifdef __ELPA_USE_FMA__
1882 : y1 = _AVX_FMA(q1, h2, y1);
1883 : #else
1884 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1885 : #endif
1886 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
1887 : #ifdef __ELPA_USE_FMA__
1888 : z1 = _AVX_FMA(q1, h3, z1);
1889 : #else
1890 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1891 : #endif
1892 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
1893 : #ifdef __ELPA_USE_FMA__
1894 : w1 = _AVX_FMA(q1, h4, w1);
1895 : #else
1896 : w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
1897 : #endif
1898 :
1899 : h1 = _AVX_BROADCAST(&hh[nb-3]);
1900 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+2)*ldq]));
1901 : #ifdef __ELPA_USE_FMA__
1902 : x1 = _AVX_FMA(q1, h1, x1);
1903 : #else
1904 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1905 : #endif
1906 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
1907 : #ifdef __ELPA_USE_FMA__
1908 : y1 = _AVX_FMA(q1, h2, y1);
1909 : #else
1910 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1911 : #endif
1912 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
1913 : #ifdef __ELPA_USE_FMA__
1914 : z1 = _AVX_FMA(q1, h3, z1);
1915 : #else
1916 : z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
1917 : #endif
1918 :
1919 : h1 = _AVX_BROADCAST(&hh[nb-2]);
1920 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+3)*ldq]));
1921 : #ifdef __ELPA_USE_FMA__
1922 : x1 = _AVX_FMA(q1, h1, x1);
1923 : #else
1924 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1925 : #endif
1926 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
1927 : #ifdef __ELPA_USE_FMA__
1928 : y1 = _AVX_FMA(q1, h2, y1);
1929 : #else
1930 : y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
1931 : #endif
1932 :
1933 : h1 = _AVX_BROADCAST(&hh[nb-1]);
1934 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+4)*ldq]));
1935 : #ifdef __ELPA_USE_FMA__
1936 : x1 = _AVX_FMA(q1, h1, x1);
1937 : #else
1938 : x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
1939 : #endif
1940 :
1941 : /////////////////////////////////////////////////////
1942 : // Apply tau, correct wrong calculation using pre-calculated scalar products
1943 : /////////////////////////////////////////////////////
1944 :
1945 : __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]); // tau of vector k is stored at hh[ldh*(k-1)]
1946 : x1 = _AVX_MUL(x1, tau1);
1947 :
1948 : __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
1949 : __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&scalarprods[0]); // scalarprods[j]: dot products vs_i_k in the order used below
1950 : h2 = _AVX_MUL(tau2, vs_1_2);
1951 : #ifdef __ELPA_USE_FMA__
1952 : y1 = _AVX_FMSUB(y1, tau2, _AVX_MUL(x1,h2));
1953 : #else
1954 : y1 = _AVX_SUB(_AVX_MUL(y1,tau2), _AVX_MUL(x1,h2));
1955 : #endif
1956 :
1957 : __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
1958 : __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&scalarprods[1]);
1959 : __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&scalarprods[2]);
1960 : h2 = _AVX_MUL(tau3, vs_1_3);
1961 : h3 = _AVX_MUL(tau3, vs_2_3);
1962 : #ifdef __ELPA_USE_FMA__
1963 : z1 = _AVX_FMSUB(z1, tau3, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
1964 : #else
1965 : z1 = _AVX_SUB(_AVX_MUL(z1,tau3), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
1966 : #endif
1967 :
1968 : __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
1969 : __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&scalarprods[3]);
1970 : __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&scalarprods[4]);
1971 : h2 = _AVX_MUL(tau4, vs_1_4);
1972 : h3 = _AVX_MUL(tau4, vs_2_4);
1973 : __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&scalarprods[5]);
1974 : h4 = _AVX_MUL(tau4, vs_3_4);
1975 : #ifdef __ELPA_USE_FMA__
1976 : w1 = _AVX_FMSUB(w1, tau4, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
1977 : #else
1978 : w1 = _AVX_SUB(_AVX_MUL(w1,tau4), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
1979 : #endif
1980 :
1981 : __AVX_DATATYPE tau5 = _AVX_BROADCAST(&hh[ldh*4]);
1982 : __AVX_DATATYPE vs_1_5 = _AVX_BROADCAST(&scalarprods[6]);
1983 : __AVX_DATATYPE vs_2_5 = _AVX_BROADCAST(&scalarprods[7]);
1984 : h2 = _AVX_MUL(tau5, vs_1_5);
1985 : h3 = _AVX_MUL(tau5, vs_2_5);
1986 : __AVX_DATATYPE vs_3_5 = _AVX_BROADCAST(&scalarprods[8]);
1987 : __AVX_DATATYPE vs_4_5 = _AVX_BROADCAST(&scalarprods[9]);
1988 : h4 = _AVX_MUL(tau5, vs_3_5);
1989 : h5 = _AVX_MUL(tau5, vs_4_5);
1990 : #ifdef __ELPA_USE_FMA__
1991 : v1 = _AVX_FMSUB(v1, tau5, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
1992 : #else
1993 : v1 = _AVX_SUB(_AVX_MUL(v1,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
1994 : #endif
1995 :
1996 : __AVX_DATATYPE tau6 = _AVX_BROADCAST(&hh[ldh*5]);
1997 : __AVX_DATATYPE vs_1_6 = _AVX_BROADCAST(&scalarprods[10]);
1998 : __AVX_DATATYPE vs_2_6 = _AVX_BROADCAST(&scalarprods[11]);
1999 : h2 = _AVX_MUL(tau6, vs_1_6);
2000 : h3 = _AVX_MUL(tau6, vs_2_6);
2001 : __AVX_DATATYPE vs_3_6 = _AVX_BROADCAST(&scalarprods[12]);
2002 : __AVX_DATATYPE vs_4_6 = _AVX_BROADCAST(&scalarprods[13]);
2003 : __AVX_DATATYPE vs_5_6 = _AVX_BROADCAST(&scalarprods[14]);
2004 : h4 = _AVX_MUL(tau6, vs_3_6);
2005 : h5 = _AVX_MUL(tau6, vs_4_6);
2006 : h6 = _AVX_MUL(tau6, vs_5_6);
2007 : #ifdef __ELPA_USE_FMA__
2008 : t1 = _AVX_FMSUB(t1, tau6, _AVX_FMA(v1, h6, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)))));
2009 : #else
2010 : t1 = _AVX_SUB(_AVX_MUL(t1,tau6), _AVX_ADD( _AVX_MUL(v1,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)))));
2011 : #endif
2012 :
2013 : /////////////////////////////////////////////////////
2014 : // Rank-1 update of Q [4 x nb+5]
2015 : /////////////////////////////////////////////////////
2016 :
2017 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[0])); // only the low 4 floats of each row are read and written back
2018 : q1 = _AVX_SUB(q1, t1);
2019 : _mm_store_ps(&q[0], _mm256_castps256_ps128(q1));
2020 :
2021 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
2022 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq]));
2023 : q1 = _AVX_SUB(q1, v1);
2024 : #ifdef __ELPA_USE_FMA__
2025 : q1 = _AVX_NFMA(t1, h6, q1);
2026 : #else
2027 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
2028 : #endif
2029 : _mm_store_ps(&q[ldq], _mm256_castps256_ps128(q1));
2030 :
2031 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
2032 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*2]));
2033 : q1 = _AVX_SUB(q1, w1);
2034 : #ifdef __ELPA_USE_FMA__
2035 : q1 = _AVX_NFMA(v1, h5, q1);
2036 : #else
2037 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
2038 : #endif
2039 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
2040 : #ifdef __ELPA_USE_FMA__
2041 : q1 = _AVX_NFMA(t1, h6, q1);
2042 : #else
2043 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
2044 : #endif
2045 : _mm_store_ps(&q[ldq*2], _mm256_castps256_ps128(q1));
2046 :
2047 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
2048 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*3]));
2049 : q1 = _AVX_SUB(q1, z1);
2050 : #ifdef __ELPA_USE_FMA__
2051 : q1 = _AVX_NFMA(w1, h4, q1);
2052 : #else
2053 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
2054 : #endif
2055 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
2056 : #ifdef __ELPA_USE_FMA__
2057 : q1 = _AVX_NFMA(v1, h5, q1);
2058 : #else
2059 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
2060 : #endif
2061 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
2062 : #ifdef __ELPA_USE_FMA__
2063 : q1 = _AVX_NFMA(t1, h6, q1);
2064 : #else
2065 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
2066 : #endif
2067 : _mm_store_ps(&q[ldq*3], _mm256_castps256_ps128(q1));
2068 :
2069 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
2070 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*4]));
2071 : q1 = _AVX_SUB(q1, y1);
2072 : #ifdef __ELPA_USE_FMA__
2073 : q1 = _AVX_NFMA(z1, h3, q1);
2074 : #else
2075 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
2076 : #endif
2077 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
2078 : #ifdef __ELPA_USE_FMA__
2079 : q1 = _AVX_NFMA(w1, h4, q1);
2080 : #else
2081 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
2082 : #endif
2083 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
2084 : #ifdef __ELPA_USE_FMA__
2085 : q1 = _AVX_NFMA(v1, h5, q1);
2086 : #else
2087 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
2088 : #endif
2089 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
2090 : #ifdef __ELPA_USE_FMA__
2091 : q1 = _AVX_NFMA(t1, h6, q1);
2092 : #else
2093 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
2094 : #endif
2095 : _mm_store_ps(&q[ldq*4], _mm256_castps256_ps128(q1));
2096 :
2097 : h2 = _AVX_BROADCAST(&hh[(ldh)+1]);
2098 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*5]));
2099 : q1 = _AVX_SUB(q1, x1);
2100 : #ifdef __ELPA_USE_FMA__
2101 : q1 = _AVX_NFMA(y1, h2, q1);
2102 : #else
2103 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
2104 : #endif
2105 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
2106 : #ifdef __ELPA_USE_FMA__
2107 : q1 = _AVX_NFMA(z1, h3, q1);
2108 : #else
2109 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
2110 : #endif
2111 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
2112 : #ifdef __ELPA_USE_FMA__
2113 : q1 = _AVX_NFMA(w1, h4, q1);
2114 : #else
2115 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
2116 : #endif
2117 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
2118 : #ifdef __ELPA_USE_FMA__
2119 : q1 = _AVX_NFMA(v1, h5, q1);
2120 : #else
2121 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
2122 : #endif
2123 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
2124 : #ifdef __ELPA_USE_FMA__
2125 : q1 = _AVX_NFMA(t1, h6, q1);
2126 : #else
2127 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
2128 : #endif
2129 : _mm_store_ps(&q[ldq*5], _mm256_castps256_ps128(q1));
2130 :
2131 : for (i = 6; i < nb; i++) // interior columns: all six rank-1 updates apply
2132 : {
2133 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq]));
2134 : h1 = _AVX_BROADCAST(&hh[i-5]);
2135 : #ifdef __ELPA_USE_FMA__
2136 : q1 = _AVX_NFMA(x1, h1, q1);
2137 : #else
2138 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
2139 : #endif
2140 : h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
2141 : #ifdef __ELPA_USE_FMA__
2142 : q1 = _AVX_NFMA(y1, h2, q1);
2143 : #else
2144 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
2145 : #endif
2146 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
2147 : #ifdef __ELPA_USE_FMA__
2148 : q1 = _AVX_NFMA(z1, h3, q1);
2149 : #else
2150 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
2151 : #endif
2152 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
2153 : #ifdef __ELPA_USE_FMA__
2154 : q1 = _AVX_NFMA(w1, h4, q1);
2155 : #else
2156 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
2157 : #endif
2158 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
2159 : #ifdef __ELPA_USE_FMA__
2160 : q1 = _AVX_NFMA(v1, h5, q1);
2161 : #else
2162 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
2163 : #endif
2164 : h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
2165 : #ifdef __ELPA_USE_FMA__
2166 : q1 = _AVX_NFMA(t1, h6, q1);
2167 : #else
2168 : q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
2169 : #endif
2170 : _mm_store_ps(&q[i*ldq], _mm256_castps256_ps128(q1));
2171 : }
2172 :
2173 : h1 = _AVX_BROADCAST(&hh[nb-5]); // tail columns nb..nb+4: progressively fewer vectors reach this far
2174 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq]));
2175 : #ifdef __ELPA_USE_FMA__
2176 : q1 = _AVX_NFMA(x1, h1, q1);
2177 : #else
2178 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
2179 : #endif
2180 : h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
2181 : #ifdef __ELPA_USE_FMA__
2182 : q1 = _AVX_NFMA(y1, h2, q1);
2183 : #else
2184 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
2185 : #endif
2186 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
2187 : #ifdef __ELPA_USE_FMA__
2188 : q1 = _AVX_NFMA(z1, h3, q1);
2189 : #else
2190 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
2191 : #endif
2192 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
2193 : #ifdef __ELPA_USE_FMA__
2194 : q1 = _AVX_NFMA(w1, h4, q1);
2195 : #else
2196 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
2197 : #endif
2198 : h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
2199 : #ifdef __ELPA_USE_FMA__
2200 : q1 = _AVX_NFMA(v1, h5, q1);
2201 : #else
2202 : q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
2203 : #endif
2204 : _mm_store_ps(&q[nb*ldq], _mm256_castps256_ps128(q1));
2205 :
2206 : h1 = _AVX_BROADCAST(&hh[nb-4]);
2207 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+1)*ldq]));
2208 : #ifdef __ELPA_USE_FMA__
2209 : q1 = _AVX_NFMA(x1, h1, q1);
2210 : #else
2211 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
2212 : #endif
2213 : h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
2214 : #ifdef __ELPA_USE_FMA__
2215 : q1 = _AVX_NFMA(y1, h2, q1);
2216 : #else
2217 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
2218 : #endif
2219 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
2220 : #ifdef __ELPA_USE_FMA__
2221 : q1 = _AVX_NFMA(z1, h3, q1);
2222 : #else
2223 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
2224 : #endif
2225 : h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
2226 : #ifdef __ELPA_USE_FMA__
2227 : q1 = _AVX_NFMA(w1, h4, q1);
2228 : #else
2229 : q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
2230 : #endif
2231 : _mm_store_ps(&q[(nb+1)*ldq], _mm256_castps256_ps128(q1));
2232 :
2233 : h1 = _AVX_BROADCAST(&hh[nb-3]);
2234 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+2)*ldq]));
2235 : #ifdef __ELPA_USE_FMA__
2236 : q1 = _AVX_NFMA(x1, h1, q1);
2237 : #else
2238 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
2239 : #endif
2240 : h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
2241 : #ifdef __ELPA_USE_FMA__
2242 : q1 = _AVX_NFMA(y1, h2, q1);
2243 : #else
2244 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
2245 : #endif
2246 : h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
2247 : #ifdef __ELPA_USE_FMA__
2248 : q1 = _AVX_NFMA(z1, h3, q1);
2249 : #else
2250 : q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
2251 : #endif
2252 : _mm_store_ps(&q[(nb+2)*ldq], _mm256_castps256_ps128(q1));
2253 :
2254 : h1 = _AVX_BROADCAST(&hh[nb-2]);
2255 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+3)*ldq]));
2256 : #ifdef __ELPA_USE_FMA__
2257 : q1 = _AVX_NFMA(x1, h1, q1);
2258 : #else
2259 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
2260 : #endif
2261 : h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
2262 : #ifdef __ELPA_USE_FMA__
2263 : q1 = _AVX_NFMA(y1, h2, q1);
2264 : #else
2265 : q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
2266 : #endif
2267 : _mm_store_ps(&q[(nb+3)*ldq], _mm256_castps256_ps128(q1));
2268 :
2269 : h1 = _AVX_BROADCAST(&hh[nb-1]);
2270 : q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+4)*ldq]));
2271 : #ifdef __ELPA_USE_FMA__
2272 : q1 = _AVX_NFMA(x1, h1, q1);
2273 : #else
2274 : q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
2275 : #endif
2276 : _mm_store_ps(&q[(nb+4)*ldq], _mm256_castps256_ps128(q1));
2277 : }
2278 : #endif
2279 : #endif
|