Line data Source code
1 : // This file is part of ELPA.
2 : //
3 : // The ELPA library was originally created by the ELPA consortium,
4 : // consisting of the following organizations:
5 : //
6 : // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : // Informatik,
10 : // - Technische Universität München, Lehrstuhl für Informatik mit
11 : // Schwerpunkt Wissenschaftliches Rechnen ,
12 : // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : // and
16 : // - IBM Deutschland GmbH
17 : //
18 : // This particular source code file contains additions, changes and
19 : // enhancements authored by Intel Corporation which is not part of
20 : // the ELPA consortium.
21 : //
22 : // More information can be found here:
23 : // http://elpa.mpcdf.mpg.de/
24 : //
25 : // ELPA is free software: you can redistribute it and/or modify
26 : // it under the terms of the version 3 of the license of the
27 : // GNU Lesser General Public License as published by the Free
28 : // Software Foundation.
29 : //
30 : // ELPA is distributed in the hope that it will be useful,
31 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : // GNU Lesser General Public License for more details.
34 : //
35 : // You should have received a copy of the GNU Lesser General Public License
36 : // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : //
38 : // ELPA reflects a substantial effort on the part of the original
39 : // ELPA consortium, and we ask you to respect the spirit of the
40 : // license that we chose: i.e., please contribute any changes you
41 : // may have back to the original ELPA library distribution, and keep
42 : // any derivatives of ELPA under the same license that we chose for
43 : // the original distribution, the GNU Lesser General Public License.
44 : //
45 : //
46 : // --------------------------------------------------------------------------------------------------
47 : //
48 : // This file contains the compute intensive kernels for the Householder transformations.
49 : // It should be compiled with the highest possible optimization level.
50 : //
51 : // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
52 : // On Intel Sandy Bridge use -O3 -mavx
53 : //
54 : // Copyright of the original code rests with the authors inside the ELPA
55 : // consortium. The copyright of any additional modifications shall rest
56 : // with their original authors, but shall adhere to the licensing terms
57 : // distributed along with the original code in the file "COPYING".
58 : //
59 : // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
60 : // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
61 : // --------------------------------------------------------------------------------------------------
62 : #include "config-f90.h"
63 :
64 : #include <complex.h>
65 : #include <x86intrin.h>
66 : #include <pmmintrin.h>
67 :
68 : #define __forceinline __attribute__((always_inline))
69 :
70 : #ifdef HAVE_SSE_INTRINSICS
71 : #undef __AVX__
72 : #endif
73 :
74 : #ifdef DOUBLE_PRECISION_COMPLEX
75 : #define offset 2
76 : #define __SSE_DATATYPE __m128d
77 : #define _SSE_LOAD _mm_load_pd
78 : #define _SSE_LOADU _mm_loadu_pd
79 : #define _SSE_STORE _mm_store_pd
80 : #define _SSE_STOREU _mm_storeu_pd
81 : #define _SSE_ADD _mm_add_pd
82 : #define _SSE_XOR _mm_xor_pd
83 : #define _SSE_ADDSUB _mm_addsub_pd
84 : #define _SSE_MUL _mm_mul_pd
85 : #define _SSE_SHUFFLE _mm_shuffle_pd
86 : #define _SHUFFLE _MM_SHUFFLE2(0,1)
87 : #endif
88 :
89 : #ifdef SINGLE_PRECISION_COMPLEX
90 : #define offset 4
91 : #define __SSE_DATATYPE __m128
92 : #define _SSE_LOAD _mm_load_ps
93 : #define _SSE_LOADU _mm_loadu_ps
94 : #define _SSE_STORE _mm_store_ps
95 : #define _SSE_STOREU _mm_storeu_ps
96 : #define _SSE_ADD _mm_add_ps
97 : #define _SSE_XOR _mm_xor_ps
98 : #define _SSE_ADDSUB _mm_addsub_ps
99 : #define _SSE_MUL _mm_mul_ps
100 : #define _SSE_SHUFFLE _mm_shuffle_ps
101 : #define _SHUFFLE 0xb1
102 : #endif
103 :
104 :
105 :
106 : #ifdef DOUBLE_PRECISION_COMPLEX
107 : //Forward declaration
108 : static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
109 : #if 0
110 : static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
111 : static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
112 : static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
113 : #endif
114 : #endif
115 : #ifdef SINGLE_PRECISION_COMPLEX
116 : static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
117 : #endif
118 :
119 : #ifdef DOUBLE_PRECISION_COMPLEX
120 : /*
121 : !f>#ifdef HAVE_SSE_INTRINSICS
122 : !f> interface
123 : !f> subroutine double_hh_trafo_complex_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
124 : !f> bind(C, name="double_hh_trafo_complex_sse_2hv_double")
125 : !f> use, intrinsic :: iso_c_binding
126 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
127 : !f> ! complex(kind=c_double_complex) :: q(*)
128 : !f> type(c_ptr), value :: q
129 : !f> complex(kind=c_double_complex) :: hh(pnb,2)
130 : !f> end subroutine
131 : !f> end interface
132 : !f>#endif
133 : */
134 : #endif
135 : #ifdef SINGLE_PRECISION_COMPLEX
136 : /*
137 : !f>#ifdef HAVE_SSE_INTRINSICS
138 : !f> interface
139 : !f> subroutine double_hh_trafo_complex_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
140 : !f> bind(C, name="double_hh_trafo_complex_sse_2hv_single")
141 : !f> use, intrinsic :: iso_c_binding
142 : !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
143 : !f> ! complex(kind=c_float_complex) :: q(*)
144 : !f> type(c_ptr), value :: q
145 : !f> complex(kind=c_float_complex) :: hh(pnb,2)
146 : !f> end subroutine
147 : !f> end interface
148 : !f>#endif
149 : */
150 : #endif
151 :
152 : #ifdef DOUBLE_PRECISION_COMPLEX
153 654336 : void double_hh_trafo_complex_sse_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
154 : #endif
155 : #ifdef SINGLE_PRECISION_COMPLEX
156 327168 : void double_hh_trafo_complex_sse_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
157 : #endif
158 : {
159 : int i;
160 981504 : int nb = *pnb;
161 981504 : int nq = *pldq;
162 981504 : int ldq = *pldq;
163 981504 : int ldh = *pldh;
164 : #ifdef DOUBLE_PRECISION_COMPLEX
165 654336 : double complex s = conj(hh[(ldh)+1])*1.0;
166 : #endif
167 : #ifdef SINGLE_PRECISION_COMPLEX
168 327168 : float complex s = conj(hh[(ldh)+1])*1.0f;
169 : #endif
170 30426624 : for (i = 2; i < nb; i++)
171 : {
172 29445120 : s += hh[i-1] * conj(hh[(i+ldh)]);
173 : }
174 :
175 10796544 : for (i = 0; i < nq; i+=4)
176 : {
177 : #ifdef DOUBLE_PRECISION_COMPLEX
178 6543360 : hh_trafo_complex_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
179 : #endif
180 : #ifdef SINGLE_PRECISION_COMPLEX
181 3271680 : hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
182 : #endif
183 : }
184 981504 : }
185 : #ifdef DOUBLE_PRECISION_COMPLEX
186 : static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
187 : #endif
188 : #ifdef SINGLE_PRECISION_COMPLEX
189 : static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
190 : #endif
191 : {
192 : #ifdef DOUBLE_PRECISION_COMPLEX
193 6543360 : double* q_dbl = (double*)q;
194 6543360 : double* hh_dbl = (double*)hh;
195 6543360 : double* s_dbl = (double*)(&s);
196 : #endif
197 : #ifdef SINGLE_PRECISION_COMPLEX
198 3271680 : float* q_dbl = (float*)q;
199 3271680 : float* hh_dbl = (float*)hh;
200 3271680 : float* s_dbl = (float*)(&s);
201 : #endif
202 : __SSE_DATATYPE x1, x2, x3, x4;
203 : __SSE_DATATYPE y1, y2, y3, y4;
204 : __SSE_DATATYPE q1, q2, q3, q4;
205 : __SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
206 : __SSE_DATATYPE tmp1, tmp2, tmp3, tmp4;
207 9815040 : int i=0;
208 :
209 : #ifdef DOUBLE_PRECISION_COMPLEX
210 6543360 : __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
211 : #endif
212 : #ifdef SINGLE_PRECISION_COMPLEX
213 3271680 : __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
214 : #endif
215 :
216 19630080 : x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);
217 19630080 : x2 = _SSE_LOAD(&q_dbl[(2*ldq)+offset]);
218 : #ifdef DOUBLE_PRECISION_COMPLEX
219 13086720 : x3 = _SSE_LOAD(&q_dbl[(2*ldq)+2*offset]);
220 13086720 : x4 = _SSE_LOAD(&q_dbl[(2*ldq)+3*offset]);
221 : #endif
222 :
223 : #ifdef DOUBLE_PRECISION_COMPLEX
224 13086720 : h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
225 13086720 : h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
226 : #endif
227 : #ifdef SINGLE_PRECISION_COMPLEX
228 13086720 : h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
229 13086720 : h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
230 : #endif
231 :
232 : #ifndef __ELPA_USE_FMA__
233 : // conjugate
234 9815040 : h2_imag = _SSE_XOR(h2_imag, sign);
235 : #endif
236 :
237 9815040 : y1 = _SSE_LOAD(&q_dbl[0]);
238 19630080 : y2 = _SSE_LOAD(&q_dbl[offset]);
239 : #ifdef DOUBLE_PRECISION_COMPLEX
240 13086720 : y3 = _SSE_LOAD(&q_dbl[2*offset]);
241 13086720 : y4 = _SSE_LOAD(&q_dbl[3*offset]);
242 : #endif
243 :
244 9815040 : tmp1 = _SSE_MUL(h2_imag, x1);
245 : #ifdef __ELPA_USE_FMA__
246 : y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
247 : #else
248 39260160 : y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
249 : #endif
250 9815040 : tmp2 = _SSE_MUL(h2_imag, x2);
251 : #ifdef __ELPA_USE_FMA__
252 : y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
253 : #else
254 39260160 : y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
255 : #endif
256 :
257 : #ifdef DOUBLE_PRECISION_COMPLEX
258 6543360 : tmp3 = _SSE_MUL(h2_imag, x3);
259 : #ifdef __ELPA_USE_FMA__
260 : y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
261 : #else
262 26173440 : y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
263 : #endif
264 6543360 : tmp4 = _SSE_MUL(h2_imag, x4);
265 : #ifdef __ELPA_USE_FMA__
266 : y4 = _SSE_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
267 : #else
268 26173440 : y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
269 : #endif
270 : #endif /* DOUBLE_PRECISION_COMPLEX */
271 :
272 304266240 : for (i = 2; i < nb; i++)
273 : {
274 588902400 : q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
275 588902400 : q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
276 : #ifdef DOUBLE_PRECISION_COMPLEX
277 392601600 : q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
278 392601600 : q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
279 : #endif
280 :
281 : #ifdef DOUBLE_PRECISION_COMPLEX
282 392601600 : h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
283 392601600 : h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
284 : #endif
285 : #ifdef SINGLE_PRECISION_COMPLEX
286 392601600 : h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
287 392601600 : h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
288 : #endif
289 :
290 : #ifndef __ELPA_USE_FMA__
291 : // conjugate
292 294451200 : h1_imag = _SSE_XOR(h1_imag, sign);
293 : #endif
294 :
295 294451200 : tmp1 = _SSE_MUL(h1_imag, q1);
296 : #ifdef __ELPA_USE_FMA__
297 : x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
298 : #else
299 1177804800 : x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
300 : #endif
301 294451200 : tmp2 = _SSE_MUL(h1_imag, q2);
302 : #ifdef __ELPA_USE_FMA__
303 : x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
304 : #else
305 1177804800 : x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
306 : #endif
307 :
308 : #ifdef DOUBLE_PRECISION_COMPLEX
309 196300800 : tmp3 = _SSE_MUL(h1_imag, q3);
310 : #ifdef __ELPA_USE_FMA__
311 : x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
312 : #else
313 785203200 : x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
314 : #endif
315 196300800 : tmp4 = _SSE_MUL(h1_imag, q4);
316 : #ifdef __ELPA_USE_FMA__
317 : x4 = _SSE_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
318 : #else
319 785203200 : x4 = _SSE_ADD(x4, _SSE_ADDSUB( _SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
320 : #endif
321 : #endif /* DOUBLE_PRECISION_COMPLEX */
322 :
323 : #ifdef DOUBLE_PRECISION_COMPLEX
324 392601600 : h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
325 392601600 : h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
326 : #endif
327 : #ifdef SINGLE_PRECISION_COMPLEX
328 392601600 : h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
329 392601600 : h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
330 : #endif
331 :
332 : #ifndef __ELPA_USE_FMA__
333 : // conjugate
334 294451200 : h2_imag = _SSE_XOR(h2_imag, sign);
335 : #endif
336 :
337 294451200 : tmp1 = _SSE_MUL(h2_imag, q1);
338 : #ifdef __ELPA_USE_FMA__
339 : y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
340 : #else
341 1177804800 : y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
342 : #endif
343 294451200 : tmp2 = _SSE_MUL(h2_imag, q2);
344 : #ifdef __ELPA_USE_FMA__
345 : y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
346 : #else
347 1177804800 : y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
348 : #endif
349 :
350 : #ifdef DOUBLE_PRECISION_COMPLEX
351 196300800 : tmp3 = _SSE_MUL(h2_imag, q3);
352 : #ifdef __ELPA_USE_FMA__
353 : y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
354 : #else
355 785203200 : y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
356 : #endif
357 196300800 : tmp4 = _SSE_MUL(h2_imag, q4);
358 : #ifdef __ELPA_USE_FMA__
359 : y4 = _SSE_ADD(y4, _mm_msubadd_pd(h2_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
360 : #else
361 785203200 : y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
362 : #endif
363 : #endif /* DOUBLE_PRECISION_COMPLEX */
364 : }
365 :
366 : #ifdef DOUBLE_PRECISION_COMPLEX
367 13086720 : h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
368 13086720 : h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
369 : #endif
370 : #ifdef SINGLE_PRECISION_COMPLEX
371 13086720 : h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
372 13086720 : h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
373 : #endif
374 :
375 : #ifndef __ELPA_USE_FMA__
376 : // conjugate
377 9815040 : h1_imag = _SSE_XOR(h1_imag, sign);
378 : #endif
379 :
380 19630080 : q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
381 19630080 : q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+offset]);
382 : #ifdef DOUBLE_PRECISION_COMPLEX
383 13086720 : q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
384 13086720 : q4 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
385 : #endif
386 :
387 9815040 : tmp1 = _SSE_MUL(h1_imag, q1);
388 : #ifdef __ELPA_USE_FMA__
389 : x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
390 : #else
391 39260160 : x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
392 : #endif
393 9815040 : tmp2 = _SSE_MUL(h1_imag, q2);
394 : #ifdef __ELPA_USE_FMA__
395 : x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
396 : #else
397 39260160 : x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
398 : #endif
399 :
400 : #ifdef DOUBLE_PRECISION_COMPLEX
401 6543360 : tmp3 = _SSE_MUL(h1_imag, q3);
402 : #ifdef __ELPA_USE_FMA__
403 : x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
404 : #else
405 26173440 : x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
406 : #endif
407 6543360 : tmp4 = _SSE_MUL(h1_imag, q4);
408 : #ifdef __ELPA_USE_FMA__
409 : x4 = _SSE_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
410 : #else
411 26173440 : x4 = _SSE_ADD(x4, _SSE_ADDSUB( _SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
412 : #endif
413 : #endif /* DOUBLE_PRECISION_COMPLEX */
414 :
415 : #ifdef DOUBLE_PRECISION_COMPLEX
416 6543360 : h1_real = _mm_loaddup_pd(&hh_dbl[0]);
417 13086720 : h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
418 : #endif
419 : #ifdef SINGLE_PRECISION_COMPLEX
420 9815040 : h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
421 13086720 : h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
422 : #endif
423 :
424 9815040 : h1_real = _SSE_XOR(h1_real, sign);
425 9815040 : h1_imag = _SSE_XOR(h1_imag, sign);
426 :
427 9815040 : tmp1 = _SSE_MUL(h1_imag, x1);
428 :
429 : #ifdef __ELPA_USE_FMA__
430 : x1 = _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
431 : #else
432 29445120 : x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
433 : #endif
434 9815040 : tmp2 = _SSE_MUL(h1_imag, x2);
435 : #ifdef __ELPA_USE_FMA__
436 : x2 = _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
437 : #else
438 29445120 : x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
439 : #endif
440 :
441 : #ifdef DOUBLE_PRECISION_COMPLEX
442 6543360 : tmp3 = _SSE_MUL(h1_imag, x3);
443 : #ifdef __ELPA_USE_FMA__
444 : x3 = _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
445 : #else
446 19630080 : x3 = _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
447 : #endif
448 6543360 : tmp4 = _SSE_MUL(h1_imag, x4);
449 : #ifdef __ELPA_USE_FMA__
450 : x4 = _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
451 : #else
452 19630080 : x4 = _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
453 : #endif
454 : #endif /* DOUBLE_PRECISION_COMPLEX */
455 :
456 : #ifdef DOUBLE_PRECISION_COMPLEX
457 13086720 : h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
458 13086720 : h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
459 : #endif
460 : #ifdef SINGLE_PRECISION_COMPLEX
461 13086720 : h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
462 13086720 : h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
463 : #endif
464 :
465 : #ifdef DOUBLE_PRECISION_COMPLEX
466 13086720 : h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
467 13086720 : h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
468 : #endif
469 : #ifdef SINGLE_PRECISION_COMPLEX
470 13086720 : h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
471 13086720 : h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
472 : #endif
473 :
474 9815040 : h1_real = _SSE_XOR(h1_real, sign);
475 9815040 : h1_imag = _SSE_XOR(h1_imag, sign);
476 9815040 : h2_real = _SSE_XOR(h2_real, sign);
477 9815040 : h2_imag = _SSE_XOR(h2_imag, sign);
478 :
479 : #ifdef SINGLE_PRECISION_COMPLEX
480 6543360 : tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
481 : #else
482 6543360 : tmp2 = _SSE_LOADU(s_dbl);
483 : #endif
484 16358400 : tmp1 = _SSE_MUL(h2_imag, tmp2);
485 :
486 : #ifdef __ELPA_USE_FMA__
487 : tmp2 = _mm_maddsub_pd(h2_real, tmp2, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
488 : #else
489 29445120 : tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
490 : #endif
491 :
492 : #ifdef DOUBLE_PRECISION_COMPLEX
493 13086720 : h2_real = _mm_movedup_pd(tmp2);
494 13086720 : h2_imag = _mm_set1_pd(tmp2[1]);
495 : #endif
496 : #ifdef SINGLE_PRECISION_COMPLEX
497 3271680 : h2_real = _mm_moveldup_ps(tmp2);
498 3271680 : h2_imag = _mm_movehdup_ps(tmp2);
499 : #endif
500 :
501 9815040 : tmp1 = _SSE_MUL(h1_imag, y1);
502 : #ifdef __ELPA_USE_FMA__
503 : y1 = _mm_maddsub_pd(h1_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
504 : #else
505 29445120 : y1 = _SSE_ADDSUB( _SSE_MUL(h1_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
506 : #endif
507 9815040 : tmp2 = _SSE_MUL(h1_imag, y2);
508 : #ifdef __ELPA_USE_FMA__
509 : y2 = _mm_maddsub_pd(h1_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
510 : #else
511 29445120 : y2 = _SSE_ADDSUB( _SSE_MUL(h1_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
512 : #endif
513 :
514 : #ifdef DOUBLE_PRECISION_COMPLEX
515 6543360 : tmp3 = _SSE_MUL(h1_imag, y3);
516 : #ifdef __ELPA_USE_FMA__
517 : y3 = _mm_maddsub_pd(h1_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
518 : #else
519 19630080 : y3 = _SSE_ADDSUB( _SSE_MUL(h1_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
520 : #endif
521 6543360 : tmp4 = _SSE_MUL(h1_imag, y4);
522 : #ifdef __ELPA_USE_FMA__
523 : y4 = _mm_maddsub_pd(h1_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
524 : #else
525 19630080 : y4 = _SSE_ADDSUB( _SSE_MUL(h1_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
526 : #endif
527 : #endif /* DOUBLE_PRECISION_COMPLEX */
528 :
529 9815040 : tmp1 = _SSE_MUL(h2_imag, x1);
530 : #ifdef __ELPA_USE_FMA__
531 : y1 = _SSE_ADD(y1, _mm_maddsub_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
532 : #else
533 39260160 : y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
534 : #endif
535 9815040 : tmp2 = _SSE_MUL(h2_imag, x2);
536 : #ifdef __ELPA_USE_FMA__
537 : y2 = _SSE_ADD(y2, _mm_maddsub_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
538 : #else
539 39260160 : y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
540 : #endif
541 :
542 : #ifdef DOUBLE_PRECISION_COMPLEX
543 6543360 : tmp3 = _SSE_MUL(h2_imag, x3);
544 : #ifdef __ELPA_USE_FMA__
545 : y3 = _SSE_ADD(y3, _mm_maddsub_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
546 : #else
547 26173440 : y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
548 : #endif
549 6543360 : tmp4 = _SSE_MUL(h2_imag, x4);
550 : #ifdef __ELPA_USE_FMA__
551 : y4 = _SSE_ADD(y4, _mm_maddsub_pd(h2_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
552 : #else
553 26173440 : y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
554 : #endif
555 : #endif /* DOUBLE_PRECISION_COMPLEX */
556 :
557 9815040 : q1 = _SSE_LOAD(&q_dbl[0]);
558 19630080 : q2 = _SSE_LOAD(&q_dbl[offset]);
559 : #ifdef DOUBLE_PRECISION_COMPLEX
560 13086720 : q3 = _SSE_LOAD(&q_dbl[2*offset]);
561 13086720 : q4 = _SSE_LOAD(&q_dbl[3*offset]);
562 : #endif
563 9815040 : q1 = _SSE_ADD(q1, y1);
564 9815040 : q2 = _SSE_ADD(q2, y2);
565 : #ifdef DOUBLE_PRECISION_COMPLEX
566 6543360 : q3 = _SSE_ADD(q3, y3);
567 6543360 : q4 = _SSE_ADD(q4, y4);
568 : #endif
569 : _SSE_STORE(&q_dbl[0], q1);
570 9815040 : _SSE_STORE(&q_dbl[offset], q2);
571 : #ifdef DOUBLE_PRECISION_COMPLEX
572 6543360 : _SSE_STORE(&q_dbl[2*offset], q3);
573 6543360 : _SSE_STORE(&q_dbl[3*offset], q4);
574 : #endif
575 :
576 : #ifdef DOUBLE_PRECISION_COMPLEX
577 13086720 : h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
578 13086720 : h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
579 : #endif
580 : #ifdef SINGLE_PRECISION_COMPLEX
581 13086720 : h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
582 13086720 : h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
583 : #endif
584 :
585 19630080 : q1 = _SSE_LOAD(&q_dbl[(ldq*2)+0]);
586 19630080 : q2 = _SSE_LOAD(&q_dbl[(ldq*2)+offset]);
587 : #ifdef DOUBLE_PRECISION_COMPLEX
588 13086720 : q3 = _SSE_LOAD(&q_dbl[(ldq*2)+2*offset]);
589 13086720 : q4 = _SSE_LOAD(&q_dbl[(ldq*2)+3*offset]);
590 : #endif
591 9815040 : q1 = _SSE_ADD(q1, x1);
592 9815040 : q2 = _SSE_ADD(q2, x2);
593 : #ifdef DOUBLE_PRECISION_COMPLEX
594 6543360 : q3 = _SSE_ADD(q3, x3);
595 6543360 : q4 = _SSE_ADD(q4, x4);
596 : #endif
597 9815040 : tmp1 = _SSE_MUL(h2_imag, y1);
598 :
599 : #ifdef __ELPA_USE_FMA__
600 : q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
601 : #else
602 39260160 : q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
603 : #endif
604 9815040 : tmp2 = _SSE_MUL(h2_imag, y2);
605 : #ifdef __ELPA_USE_FMA__
606 : q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
607 : #else
608 39260160 : q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
609 : #endif
610 :
611 : #ifdef DOUBLE_PRECISION_COMPLEX
612 6543360 : tmp3 = _SSE_MUL(h2_imag, y3);
613 : #ifdef __ELPA_USE_FMA__
614 : q3 = _SSE_ADD(q3, _mm_maddsub_pd(h2_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
615 : #else
616 26173440 : q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h2_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
617 : #endif
618 6543360 : tmp4 = _SSE_MUL(h2_imag, y4);
619 : #ifdef __ELPA_USE_FMA__
620 : q4 = _SSE_ADD(q4, _mm_maddsub_pd(h2_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
621 : #else
622 26173440 : q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h2_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
623 : #endif
624 : #endif /* DOUBLE_PRECISION_COMPLEX */
625 :
626 9815040 : _SSE_STORE(&q_dbl[(ldq*2)+0], q1);
627 9815040 : _SSE_STORE(&q_dbl[(ldq*2)+offset], q2);
628 : #ifdef DOUBLE_PRECISION_COMPLEX
629 6543360 : _SSE_STORE(&q_dbl[(ldq*2)+2*offset], q3);
630 6543360 : _SSE_STORE(&q_dbl[(ldq*2)+3*offset], q4);
631 : #endif
632 :
633 304266240 : for (i = 2; i < nb; i++)
634 : {
635 588902400 : q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
636 588902400 : q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
637 : #ifdef DOUBLE_PRECISION_COMPLEX
638 392601600 : q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
639 392601600 : q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
640 : #endif
641 : #ifdef DOUBLE_PRECISION_COMPLEX
642 392601600 : h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
643 392601600 : h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
644 : #endif
645 : #ifdef SINGLE_PRECISION_COMPLEX
646 392601600 : h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
647 392601600 : h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
648 : #endif
649 :
650 294451200 : tmp1 = _SSE_MUL(h1_imag, x1);
651 : #ifdef __ELPA_USE_FMA__
652 : q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
653 : #else
654 1177804800 : q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
655 : #endif
656 294451200 : tmp2 = _SSE_MUL(h1_imag, x2);
657 : #ifdef __ELPA_USE_FMA__
658 : q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
659 : #else
660 1177804800 : q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
661 : #endif
662 :
663 : #ifdef DOUBLE_PRECISION_COMPLEX
664 196300800 : tmp3 = _SSE_MUL(h1_imag, x3);
665 : #ifdef __ELPA_USE_FMA__
666 : q3 = _SSE_ADD(q3, _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
667 : #else
668 785203200 : q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
669 : #endif
670 196300800 : tmp4 = _SSE_MUL(h1_imag, x4);
671 : #ifdef __ELPA_USE_FMA__
672 : q4 = _SSE_ADD(q4, _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
673 : #else
674 785203200 : q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
675 : #endif
676 : #endif /* DOUBLE_PRECISION_COMPLEX */
677 :
678 : #ifdef DOUBLE_PRECISION_COMPLEX
679 392601600 : h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
680 392601600 : h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
681 : #endif
682 : #ifdef SINGLE_PRECISION_COMPLEX
683 392601600 : h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
684 392601600 : h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
685 : #endif
686 :
687 294451200 : tmp1 = _SSE_MUL(h2_imag, y1);
688 : #ifdef __ELPA_USE_FMA__
689 : q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
690 : #else
691 1177804800 : q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
692 : #endif
693 294451200 : tmp2 = _SSE_MUL(h2_imag, y2);
694 : #ifdef __ELPA_USE_FMA__
695 : q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
696 : #else
697 1177804800 : q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
698 : #endif
699 :
700 : #ifdef DOUBLE_PRECISION_COMPLEX
701 196300800 : tmp3 = _SSE_MUL(h2_imag, y3);
702 : #ifdef __ELPA_USE_FMA__
703 : q3 = _SSE_ADD(q3, _mm_maddsub_pd(h2_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
704 : #else
705 785203200 : q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h2_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
706 : #endif
707 196300800 : tmp4 = _SSE_MUL(h2_imag, y4);
708 : #ifdef __ELPA_USE_FMA__
709 : q4 = _SSE_ADD(q4, _mm_maddsub_pd(h2_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
710 : #else
711 785203200 : q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h2_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
712 : #endif
713 : #endif /* DOUBLE_PRECISION_COMPLEX */
714 :
715 294451200 : _SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
716 294451200 : _SSE_STORE(&q_dbl[(2*i*ldq)+offset], q2);
717 : #ifdef DOUBLE_PRECISION_COMPLEX
718 196300800 : _SSE_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
719 196300800 : _SSE_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
720 : #endif
721 : }
722 :
723 : #ifdef DOUBLE_PRECISION_COMPLEX
724 13086720 : h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
725 13086720 : h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
726 : #endif
727 : #ifdef SINGLE_PRECISION_COMPLEX
728 13086720 : h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
729 13086720 : h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
730 : #endif
731 :
732 :
733 19630080 : q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
734 19630080 : q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+offset]);
735 : #ifdef DOUBLE_PRECISION_COMPLEX
736 13086720 : q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
737 13086720 : q4 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
738 : #endif
739 :
740 9815040 : tmp1 = _SSE_MUL(h1_imag, x1);
741 : #ifdef __ELPA_USE_FMA__
742 : q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
743 : #else
744 39260160 : q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
745 : #endif
746 9815040 : tmp2 = _SSE_MUL(h1_imag, x2);
747 : #ifdef __ELPA_USE_FMA__
748 : q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
749 : #else
750 39260160 : q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
751 : #endif
752 :
753 : #ifdef DOUBLE_PRECISION_COMPLEX
754 6543360 : tmp3 = _SSE_MUL(h1_imag, x3);
755 : #ifdef __ELPA_USE_FMA__
756 : q3 = _SSE_ADD(q3, _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
757 : #else
758 26173440 : q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
759 : #endif
760 6543360 : tmp4 = _SSE_MUL(h1_imag, x4);
761 : #ifdef __ELPA_USE_FMA__
762 : q4 = _SSE_ADD(q4, _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
763 : #else
764 26173440 : q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
765 : #endif
766 : #endif /* DOUBLE_PRECISION_COMPLEX */
767 :
768 :
769 9815040 : _SSE_STORE(&q_dbl[(2*nb*ldq)+0], q1);
770 9815040 : _SSE_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
771 : #ifdef DOUBLE_PRECISION_COMPLEX
772 6543360 : _SSE_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
773 6543360 : _SSE_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
774 : #endif
775 : }
776 :
777 : #if 0
778 :
// Apply two Householder vectors (stored in hh, leading dimension ldh) to a
// 3-SSE-register-wide panel of the complex matrix q (leading dimension ldq).
// Double-precision variant: each __SSE_DATATYPE register holds ONE double
// complex number (re, im), so "3 wide" = 3 complex columns per row of q.
//
// Complex multiplication is done with the classic SSE3 pattern:
//   tmp = imag(h) * v;  result = addsub(real(h) * v, shuffle(tmp))
// On the FMA path, _mm_maddsub_pd / _mm_msubadd_pd fuse the multiply; on the
// non-FMA path, conjugation is obtained by flipping the sign bit of the
// imaginary part (XOR with `sign`) before the multiply.
//
// Parameters:
//   q   - in/out panel, updated in place
//   hh  - the two Householder vectors (hh[0..nb-1] and hh[ldh..ldh+nb-1])
//   nb  - length of the Householder vectors
//   ldq - leading dimension of q (in complex elements)
//   ldh - leading dimension of hh (in complex elements)
//   s   - scalar product of the two Householder vectors; consumed (and
//         locally overwritten via s_dbl) to form the rank-2 update coefficient
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
// Reinterpret complex arrays as interleaved (re, im) double arrays.
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);

__SSE_DATATYPE x1, x2, x3;
__SSE_DATATYPE y1, y2, y3;
__SSE_DATATYPE q1, q2, q3;
__SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
__SSE_DATATYPE tmp1, tmp2, tmp3;
int i=0;

// Mask with the sign bit set in both 64-bit lanes; XOR-ing with it negates
// a packed double pair (used to conjugate h and to negate tau).
__SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);

// --- Phase 1: accumulate x = conj(hh1)^T * q and y = conj(hh2)^T * q ------
// x starts from row 1 of q (implicit hh1[0] == 1).
x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);
x2 = _SSE_LOAD(&q_dbl[(2*ldq)+2]);
x3 = _SSE_LOAD(&q_dbl[(2*ldq)+4]);

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SSE_XOR(h2_imag, sign);
#endif

// y starts from row 0 of q (implicit hh2[0] == 1).
y1 = _SSE_LOAD(&q_dbl[0]);
y2 = _SSE_LOAD(&q_dbl[2]);
y3 = _SSE_LOAD(&q_dbl[4]);

// y += conj(h2[1]) * x   (msubadd on FMA path realizes the conjugate product)
tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

// Main accumulation loop over rows 2..nb-1:
//   x += conj(hh1[i-1]) * q(i),  y += conj(hh2[i]) * q(i)
for (i = 2; i < nb; i++)
{
q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2]);
q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+4]);

h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SSE_XOR(h1_imag, sign);
#endif

tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SSE_XOR(h2_imag, sign);
#endif

tmp1 = _SSE_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
}

// Last row (i == nb) contributes to x only (hh2 is one element shorter).
h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SSE_XOR(h1_imag, sign);
#endif

q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2]);
q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+4]);

tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

// --- Phase 2: scale x by -tau1 (tau1 == hh[0]; both XORs negate it) -------
h1_real = _mm_loaddup_pd(&hh_dbl[0]);
h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
h1_real = _SSE_XOR(h1_real, sign);
h1_imag = _SSE_XOR(h1_imag, sign);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
x1 = _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
x2 = _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
x3 = _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
x3 = _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

// --- Phase 3: form rank-2 coefficients from tau2 (hh[ldh]) and s ----------
// h1 and h2 both start as -tau2; h2 is then multiplied by s below.
h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);

h1_real = _SSE_XOR(h1_real, sign);
h1_imag = _SSE_XOR(h1_imag, sign);
h2_real = _SSE_XOR(h2_real, sign);
h2_imag = _SSE_XOR(h2_imag, sign);

// s := (-tau2) * s, then broadcast its (re, im) parts into h2_real/h2_imag.
tmp2 = _SSE_LOADU(s_dbl);
tmp1 = _SSE_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _mm_maddsub_pd(h2_real, tmp2, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
_SSE_STOREU(s_dbl, tmp2);
h2_real = _mm_set1_pd(s_dbl[0]);
h2_imag = _mm_set1_pd(s_dbl[1]);

// h2_real = _mm_loaddup_pd(&s_dbl[0]);
// h2_imag = _mm_loaddup_pd(&s_dbl[1]);

// y := (-tau2)*y ...
tmp1 = _SSE_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _mm_maddsub_pd(h1_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
y1 = _SSE_ADDSUB( _SSE_MUL(h1_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp2 = _SSE_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
y2 = _mm_maddsub_pd(h1_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
y2 = _SSE_ADDSUB( _SSE_MUL(h1_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
tmp3 = _SSE_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
y3 = _mm_maddsub_pd(h1_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
y3 = _SSE_ADDSUB( _SSE_MUL(h1_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

// ... + ((-tau2)*s) * x
tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_maddsub_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SSE_ADD(y2, _mm_maddsub_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
y3 = _SSE_ADD(y3, _mm_maddsub_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

// --- Phase 4: write back the update to q ----------------------------------
// Row 0: q(0) += y
q1 = _SSE_LOAD(&q_dbl[0]);
q2 = _SSE_LOAD(&q_dbl[2]);
q3 = _SSE_LOAD(&q_dbl[4]);

q1 = _SSE_ADD(q1, y1);
q2 = _SSE_ADD(q2, y2);
q3 = _SSE_ADD(q3, y3);

_SSE_STORE(&q_dbl[0], q1);
_SSE_STORE(&q_dbl[2], q2);
_SSE_STORE(&q_dbl[4], q3);

// Row 1: q(1) += x + hh2[1]*y
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);

q1 = _SSE_LOAD(&q_dbl[(ldq*2)+0]);
q2 = _SSE_LOAD(&q_dbl[(ldq*2)+2]);
q3 = _SSE_LOAD(&q_dbl[(ldq*2)+4]);

q1 = _SSE_ADD(q1, x1);
q2 = _SSE_ADD(q2, x2);
q3 = _SSE_ADD(q3, x3);

tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
q3 = _SSE_ADD(q3, _mm_maddsub_pd(h2_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h2_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(ldq*2)+0], q1);
_SSE_STORE(&q_dbl[(ldq*2)+2], q2);
_SSE_STORE(&q_dbl[(ldq*2)+4], q3);

// Rows 2..nb-1: q(i) += hh1[i-1]*x + hh2[i]*y
for (i = 2; i < nb; i++)
{
q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2]);
q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+4]);

h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
q3 = _SSE_ADD(q3, _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);

tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
q3 = _SSE_ADD(q3, _mm_maddsub_pd(h2_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h2_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
_SSE_STORE(&q_dbl[(2*i*ldq)+2], q2);
_SSE_STORE(&q_dbl[(2*i*ldq)+4], q3);
}

// Last row (i == nb): only the first Householder vector contributes.
h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);

q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2]);
q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+4]);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
q3 = _SSE_ADD(q3, _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(2*nb*ldq)+0], q1);
_SSE_STORE(&q_dbl[(2*nb*ldq)+2], q2);
_SSE_STORE(&q_dbl[(2*nb*ldq)+4], q3);
}
1134 :
1135 : #endif
1136 :
1137 : #if 0
// Same algorithm as hh_trafo_complex_kernel_3_SSE_2hv_double but for a panel
// 2 SSE registers wide (2 double-complex columns): apply two Householder
// vectors from hh to q in place. See the 3-wide kernel for the per-phase
// commentary; the structure here is identical with one fewer register stream.
//
// Parameters: q (in/out panel), hh (two Householder vectors), nb (vector
// length), ldq/ldh (leading dimensions in complex elements), s (scalar
// product of the two Householder vectors, consumed locally).
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
// Interleaved (re, im) double views of the complex inputs.
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);

__SSE_DATATYPE x1, x2;
__SSE_DATATYPE y1, y2;
__SSE_DATATYPE q1, q2;
__SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
__SSE_DATATYPE tmp1, tmp2;
int i=0;

// Sign-bit mask; XOR negates a packed double pair (conjugation / -tau).
__SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);

// Phase 1: accumulate x = conj(hh1)^T * q and y = conj(hh2)^T * q.
x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);
x2 = _SSE_LOAD(&q_dbl[(2*ldq)+2]);

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SSE_XOR(h2_imag, sign);
#endif

y1 = _SSE_LOAD(&q_dbl[0]);
y2 = _SSE_LOAD(&q_dbl[2]);

// y += conj(h2[1]) * x
tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

// Main accumulation over rows 2..nb-1.
for (i = 2; i < nb; i++)
{
q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2]);

h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SSE_XOR(h1_imag, sign);
#endif

tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SSE_XOR(h2_imag, sign);
#endif

tmp1 = _SSE_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
}

// Last row contributes to x only.
h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SSE_XOR(h1_imag, sign);
#endif

q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2]);

tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

// Phase 2: x *= -tau1 (tau1 == hh[0]).
h1_real = _mm_loaddup_pd(&hh_dbl[0]);
h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
h1_real = _SSE_XOR(h1_real, sign);
h1_imag = _SSE_XOR(h1_imag, sign);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
x1 = _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
x2 = _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

// Phase 3: build rank-2 coefficients from tau2 (hh[ldh]) and s.
h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);

h1_real = _SSE_XOR(h1_real, sign);
h1_imag = _SSE_XOR(h1_imag, sign);
h2_real = _SSE_XOR(h2_real, sign);
h2_imag = _SSE_XOR(h2_imag, sign);

// s := (-tau2) * s, then broadcast into h2_real/h2_imag.
tmp2 = _SSE_LOADU(s_dbl);
tmp1 = _SSE_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _mm_maddsub_pd(h2_real, tmp2, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
_SSE_STOREU(s_dbl, tmp2);
h2_real = _mm_set1_pd(s_dbl[0]);
h2_imag = _mm_set1_pd(s_dbl[1]);

// h2_real = _mm_loaddup_pd(&s_dbl[0]);
// h2_imag = _mm_loaddup_pd(&s_dbl[1]);

// y := (-tau2)*y ...
tmp1 = _SSE_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _mm_maddsub_pd(h1_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
y1 = _SSE_ADDSUB( _SSE_MUL(h1_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp2 = _SSE_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
y2 = _mm_maddsub_pd(h1_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
y2 = _SSE_ADDSUB( _SSE_MUL(h1_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

// ... + ((-tau2)*s) * x
tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_maddsub_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SSE_ADD(y2, _mm_maddsub_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

// Phase 4: write back. Row 0: q(0) += y.
q1 = _SSE_LOAD(&q_dbl[0]);
q2 = _SSE_LOAD(&q_dbl[2]);

q1 = _SSE_ADD(q1, y1);
q2 = _SSE_ADD(q2, y2);

_SSE_STORE(&q_dbl[0], q1);
_SSE_STORE(&q_dbl[2], q2);

// Row 1: q(1) += x + hh2[1]*y.
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);

q1 = _SSE_LOAD(&q_dbl[(ldq*2)+0]);
q2 = _SSE_LOAD(&q_dbl[(ldq*2)+2]);

q1 = _SSE_ADD(q1, x1);
q2 = _SSE_ADD(q2, x2);

tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(ldq*2)+0], q1);
_SSE_STORE(&q_dbl[(ldq*2)+2], q2);

// Rows 2..nb-1: q(i) += hh1[i-1]*x + hh2[i]*y.
for (i = 2; i < nb; i++)
{
q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2]);

h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);

tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
_SSE_STORE(&q_dbl[(2*i*ldq)+2], q2);
}

// Last row: only the first Householder vector contributes.
h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);

q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2]);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(2*nb*ldq)+0], q1);
_SSE_STORE(&q_dbl[(2*nb*ldq)+2], q2);
}
1413 :
// Narrowest variant of the 2-Householder-vector kernel: a panel one SSE
// register wide (one double-complex column). Same phase structure as the
// 3-wide kernel (see its header comment): accumulate x and y, scale by
// -tau1 / -tau2, combine with s, then write the rank-2 update back into q.
//
// Parameters: q (in/out panel), hh (two Householder vectors), nb (vector
// length), ldq/ldh (leading dimensions in complex elements), s (scalar
// product of the two Householder vectors, consumed locally).
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
// Interleaved (re, im) double views of the complex inputs.
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);

__SSE_DATATYPE x1;
__SSE_DATATYPE y1;
__SSE_DATATYPE q1;
__SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
__SSE_DATATYPE tmp1;
int i=0;

// Sign-bit mask; XOR negates a packed double pair (conjugation / -tau).
__SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);

// Phase 1: accumulate x = conj(hh1)^T * q and y = conj(hh2)^T * q.
x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SSE_XOR(h2_imag, sign);
#endif

y1 = _SSE_LOAD(&q_dbl[0]);

// y += conj(h2[1]) * x
tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

// Main accumulation over rows 2..nb-1.
for (i = 2; i < nb; i++)
{
q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);

h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SSE_XOR(h1_imag, sign);
#endif

tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SSE_XOR(h2_imag, sign);
#endif

tmp1 = _SSE_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
}

// Last row contributes to x only.
h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SSE_XOR(h1_imag, sign);
#endif

q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);

tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

// Phase 2: x *= -tau1 (tau1 == hh[0]).
h1_real = _mm_loaddup_pd(&hh_dbl[0]);
h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
h1_real = _SSE_XOR(h1_real, sign);
h1_imag = _SSE_XOR(h1_imag, sign);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
x1 = _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif

// Phase 3: build rank-2 coefficients from tau2 (hh[ldh]) and s.
h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);

h1_real = _SSE_XOR(h1_real, sign);
h1_imag = _SSE_XOR(h1_imag, sign);
h2_real = _SSE_XOR(h2_real, sign);
h2_imag = _SSE_XOR(h2_imag, sign);

// s := (-tau2) * s, then broadcast into h2_real/h2_imag.
__SSE_DATATYPE tmp2 = _SSE_LOADU(s_dbl);
tmp1 = _SSE_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _mm_maddsub_pd(h2_real, tmp2, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
_SSE_STOREU(s_dbl, tmp2);
h2_real = _mm_set1_pd(s_dbl[0]);
h2_imag = _mm_set1_pd(s_dbl[1]);

// h2_real = _mm_loaddup_pd(&s_dbl[0]);
// h2_imag = _mm_loaddup_pd(&s_dbl[1]);

// y := (-tau2)*y ...
tmp1 = _SSE_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _mm_maddsub_pd(h1_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
y1 = _SSE_ADDSUB( _SSE_MUL(h1_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif

// ... + ((-tau2)*s) * x
tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SSE_ADD(y1, _mm_maddsub_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

// Phase 4: write back. Row 0: q(0) += y.
q1 = _SSE_LOAD(&q_dbl[0]);

q1 = _SSE_ADD(q1, y1);

_SSE_STORE(&q_dbl[0], q1);

// Row 1: q(1) += x + hh2[1]*y.
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);

q1 = _SSE_LOAD(&q_dbl[(ldq*2)+0]);

q1 = _SSE_ADD(q1, x1);

tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(ldq*2)+0], q1);

// Rows 2..nb-1: q(i) += hh1[i-1]*x + hh2[i]*y.
for (i = 2; i < nb; i++)
{
q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);

h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);

tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
}

// Last row: only the first Householder vector contributes.
h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);

q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);

tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

_SSE_STORE(&q_dbl[(2*nb*ldq)+0], q1);
}
1609 : #endif
|