Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen ,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : !
20 : ! More information can be found here:
21 : ! http://elpa.mpcdf.mpg.de/
22 : !
23 : ! ELPA is free software: you can redistribute it and/or modify
24 : ! it under the terms of the version 3 of the license of the
25 : ! GNU Lesser General Public License as published by the Free
26 : ! Software Foundation.
27 : !
28 : ! ELPA is distributed in the hope that it will be useful,
29 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
30 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 : ! GNU Lesser General Public License for more details.
32 : !
33 : ! You should have received a copy of the GNU Lesser General Public License
34 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
35 : !
36 : ! ELPA reflects a substantial effort on the part of the original
37 : ! ELPA consortium, and we ask you to respect the spirit of the
38 : ! license that we chose: i.e., please contribute any changes you
39 : ! may have back to the original ELPA library distribution, and keep
40 : ! any derivatives of ELPA under the same license that we chose for
41 : ! the original distribution, the GNU Lesser General Public License.
42 : !
43 : !
44 : ! --------------------------------------------------------------------------------------------------
45 : !
46 : ! This file contains the compute intensive kernels for the Householder transformations.
47 : ! It should be compiled with the highest possible optimization level.
48 : !
49 : ! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU)
50 : !
51 : ! Copyright of the original code rests with the authors inside the ELPA
52 : ! consortium. The copyright of any additional modifications shall rest
53 : ! with their original authors, but shall adhere to the licensing terms
54 : ! distributed along with the original code in the file "COPYING".
55 : !
56 : ! --------------------------------------------------------------------------------------------------
57 : #endif
58 :
59 : ! the intel compiler creates a temp copy of array q!
60 : ! this should be prevented if possible without using assumed size arrays
61 : subroutine double_hh_trafo_&
62 : &MATH_DATATYPE&
63 : &_generic_&
64 622592 : &PRECISION&
65 622592 : & (q, hh, nb, nq, ldq, ldh)
66 :
67 : use precision
68 : use iso_c_binding
69 : use elpa_abstract_impl
70 : implicit none
71 :
72 : !class(elpa_abstract_impl_t), intent(inout) :: obj
73 : integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
74 : #ifdef USE_ASSUMED_SIZE
75 : real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
76 : real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
77 : #else
78 : real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+1)
79 : real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6)
80 : #endif
81 :
82 : real(kind=C_DATATYPE_KIND) :: s
83 : integer(kind=ik) :: i
84 :
85 : ! equivalence(q(1,1),q_complex(1,1))
86 :
87 : ! Safety only:
88 :
89 : ! call obj%timer%start("kernel generic: double_hh_trafo_&
90 : ! &MATH_DATATYPE&
91 : ! &_generic" // &
92 : ! &PRECISION_SUFFIX &
93 : ! )
94 :
95 622592 : if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
96 :
97 : ! Calculate dot product of the two Householder vectors
98 :
99 622592 : s = hh(2,2)*1.0
100 39223296 : do i=3,nb
101 38600704 : s = s+hh(i,2)*hh(i-1,1)
102 : enddo
103 :
104 : ! Do the Householder transformations
105 :
106 : #ifndef USE_ASSUMED_SIZE
107 : ! ! assign real data to compplex pointer
108 : ! call c_f_pointer(c_loc(q), q_complex, [size(q,dim=1)/2,size(q,dim=2)])
109 : #endif
110 : ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
111 :
112 2654208 : do i=1,nq-8,12
113 :
114 : #ifdef USE_ASSUMED_SIZE
115 : call hh_trafo_kernel_12_generic_&
116 : &PRECISION&
117 1015808 : & (q(i,1),hh, nb, ldq, ldh, s)
118 : #else
119 : call hh_trafo_kernel_12_generic_&
120 : &PRECISION&
121 1015808 : & (q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
122 : #endif
123 :
124 : enddo
125 :
126 : ! i > nq-8 now, i.e. at most 8 rows remain
127 :
128 622592 : if(nq-i+1 > 4) then
129 :
130 : #ifdef USE_ASSUMED_SIZE
131 : call hh_trafo_kernel_8_generic_&
132 : &PRECISION&
133 90112 : & (q(i,1),hh, nb, ldq, ldh, s)
134 : #else
135 : call hh_trafo_kernel_8_generic_&
136 : &PRECISION&
137 90112 : & (q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s)
138 : #endif
139 :
140 442368 : else if(nq-i+1 > 0) then
141 :
142 : #ifdef USE_ASSUMED_SIZE
143 : call hh_trafo_kernel_4_generic_&
144 : &PRECISION&
145 196608 : & (q(i,1),hh, nb, ldq, ldh, s)
146 : #else
147 : call hh_trafo_kernel_4_generic_&
148 : &PRECISION&
149 196608 : & (q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
150 : #endif
151 :
152 : endif
153 :
154 : ! call obj%timer%stop("kernel generic: double_hh_trafo_&
155 : ! &MATH_DATATYPE&
156 : ! &_generic" // &
157 : ! &PRECISION_SUFFIX &
158 : ! )
159 :
160 622592 : end subroutine
161 :
162 : ! --------------------------------------------------------------------------------------------------
163 : ! The following kernels perform the Householder transformation on Q for 12/8/4 rows.
164 : ! Please note that Q is declared complex*16 here.
165 : ! This is a hint for compilers that packed arithmetic can be used for Q
166 : ! (relevant for Intel SSE and BlueGene double hummer CPUs).
167 : ! --------------------------------------------------------------------------------------------------
168 :
169 : subroutine hh_trafo_kernel_12_generic_&
170 2031616 : &PRECISION&
171 2031616 : & (q, hh, nb, ldq, ldh, s)
172 : use precision
173 : implicit none
174 : integer(kind=ik), intent(in) :: nb, ldq, ldh
175 : #ifdef USE_ASSUMED_SIZE
176 : #ifdef PACK_REAL_TO_COMPLEX
177 : complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
178 : #else
179 : real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
180 : #endif
181 : real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
182 : #else
183 : real(kind=C_DATATYPE_KIND), intent(inout) :: q(:,:)
184 : real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,2)
185 : #endif
186 : real(kind=C_DATATYPE_KIND), intent(in) :: s
187 :
188 : #ifdef PACK_REAL_TO_COMPLEX
189 : complex(kind=SPECIAL_COMPLEX_DATATYPE) :: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6
190 : #else
191 : real(kind=C_DATATYPE_KIND) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, &
192 : y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12
193 : #endif
194 : real(kind=C_DATATYPE_KIND) :: h1, h2, tau1, tau2
195 : integer(kind=ik) :: i
196 :
197 :
198 : !call obj%timer%start("kernel generic: hh_trafo_kernel_12_generic" // &
199 : ! &PRECISION_SUFFIX &
200 : ! )
201 :
202 2031616 : x1 = q(1,2)
203 2031616 : x2 = q(2,2)
204 2031616 : x3 = q(3,2)
205 2031616 : x4 = q(4,2)
206 2031616 : x5 = q(5,2)
207 2031616 : x6 = q(6,2)
208 : #ifndef PACK_REAL_TO_COMPLEX
209 1015808 : x7 = q(7,2)
210 1015808 : x8 = q(8,2)
211 1015808 : x9 = q(9,2)
212 1015808 : x10 = q(10,2)
213 1015808 : x11 = q(11,2)
214 1015808 : x12 = q(12,2)
215 : #endif
216 :
217 2031616 : y1 = q(1 ,1) + q(1, 2)*hh(2,2)
218 2031616 : y2 = q(2 ,1) + q(2, 2)*hh(2,2)
219 2031616 : y3 = q(3 ,1) + q(3, 2)*hh(2,2)
220 2031616 : y4 = q(4 ,1) + q(4, 2)*hh(2,2)
221 2031616 : y5 = q(5 ,1) + q(5, 2)*hh(2,2)
222 2031616 : y6 = q(6 ,1) + q(6, 2)*hh(2,2)
223 : #ifndef PACK_REAL_TO_COMPLEX
224 1015808 : y7 = q(7 ,1) + q(7, 2)*hh(2,2)
225 1015808 : y8 = q(8 ,1) + q(8, 2)*hh(2,2)
226 1015808 : y9 = q(9 ,1) + q(9, 2)*hh(2,2)
227 1015808 : y10 = q(10,1) + q(10,2)*hh(2,2)
228 1015808 : y11 = q(11,1) + q(11,2)*hh(2,2)
229 1015808 : y12 = q(12,1) + q(12,2)*hh(2,2)
230 : #endif
231 :
232 : #ifdef DOUBLE_PRECISION_REAL
233 : #if defined(SSE_ALIGNED)
234 : !DEC$ VECTOR ALIGNED
235 : #endif
236 : #endif
237 :
238 127991808 : do i=3,nb
239 125960192 : h1 = hh(i-1,1)
240 125960192 : h2 = hh(i,2)
241 125960192 : x1 = x1 + q(1, i)*h1
242 125960192 : y1 = y1 + q(1, i)*h2
243 125960192 : x2 = x2 + q(2, i)*h1
244 125960192 : y2 = y2 + q(2, i)*h2
245 125960192 : x3 = x3 + q(3, i)*h1
246 125960192 : y3 = y3 + q(3, i)*h2
247 125960192 : x4 = x4 + q(4, i)*h1
248 125960192 : y4 = y4 + q(4, i)*h2
249 125960192 : x5 = x5 + q(5, i)*h1
250 125960192 : y5 = y5 + q(5, i)*h2
251 125960192 : x6 = x6 + q(6, i)*h1
252 125960192 : y6 = y6 + q(6, i)*h2
253 : #ifndef PACK_REAL_TO_COMPLEX
254 62980096 : x7 = x7 + q(7, i)*h1
255 62980096 : y7 = y7 + q(7, i)*h2
256 62980096 : x8 = x8 + q(8, i)*h1
257 62980096 : y8 = y8 + q(8, i)*h2
258 62980096 : x9 = x9 + q(9, i)*h1
259 62980096 : y9 = y9 + q(9, i)*h2
260 62980096 : x10 = x10 + q(10,i)*h1
261 62980096 : y10 = y10 + q(10,i)*h2
262 62980096 : x11 = x11 + q(11,i)*h1
263 62980096 : y11 = y11 + q(11,i)*h2
264 62980096 : x12 = x12 + q(12,i)*h1
265 62980096 : y12 = y12 + q(12,i)*h2
266 : #endif
267 : enddo
268 :
269 2031616 : x1 = x1 + q(1,nb+1)*hh(nb,1)
270 2031616 : x2 = x2 + q(2,nb+1)*hh(nb,1)
271 2031616 : x3 = x3 + q(3,nb+1)*hh(nb,1)
272 2031616 : x4 = x4 + q(4,nb+1)*hh(nb,1)
273 2031616 : x5 = x5 + q(5,nb+1)*hh(nb,1)
274 2031616 : x6 = x6 + q(6,nb+1)*hh(nb,1)
275 : #ifndef PACK_REAL_TO_COMPLEX
276 1015808 : x7 = x7 + q(7, nb+1)*hh(nb,1)
277 1015808 : x8 = x8 + q(8, nb+1)*hh(nb,1)
278 1015808 : x9 = x9 + q(9, nb+1)*hh(nb,1)
279 1015808 : x10 = x10 + q(10,nb+1)*hh(nb,1)
280 1015808 : x11 = x11 + q(11,nb+1)*hh(nb,1)
281 1015808 : x12 = x12 + q(12,nb+1)*hh(nb,1)
282 :
283 : #endif
284 :
285 2031616 : tau1 = hh(1,1)
286 2031616 : tau2 = hh(1,2)
287 :
288 2031616 : h1 = -tau1
289 2031616 : x1 = x1 *h1
290 2031616 : x2 = x2 *h1
291 2031616 : x3 = x3 *h1
292 2031616 : x4 = x4 *h1
293 2031616 : x5 = x5 *h1
294 2031616 : x6 = x6 *h1
295 : #ifndef PACK_REAL_TO_COMPLEX
296 1015808 : x7 = x7 *h1
297 1015808 : x8 = x8 *h1
298 1015808 : x9 = x9 *h1
299 1015808 : x10 = x10*h1
300 1015808 : x11 = x11*h1
301 1015808 : x12 = x12*h1
302 : #endif
303 :
304 2031616 : h1 = -tau2
305 2031616 : h2 = -tau2*s
306 2031616 : y1 = y1 *h1 + x1 *h2
307 2031616 : y2 = y2 *h1 + x2 *h2
308 2031616 : y3 = y3 *h1 + x3 *h2
309 2031616 : y4 = y4 *h1 + x4 *h2
310 2031616 : y5 = y5 *h1 + x5 *h2
311 2031616 : y6 = y6 *h1 + x6 *h2
312 : #ifndef PACK_REAL_TO_COMPLEX
313 1015808 : y7 = y7 *h1 + x7 *h2
314 1015808 : y8 = y8 *h1 + x8 *h2
315 1015808 : y9 = y9 *h1 + x9 *h2
316 1015808 : y10 = y10*h1 + x10*h2
317 1015808 : y11 = y11*h1 + x11*h2
318 1015808 : y12 = y12*h1 + x12*h2
319 : #endif
320 2031616 : q(1,1) = q(1, 1) + y1
321 2031616 : q(2,1) = q(2, 1) + y2
322 2031616 : q(3,1) = q(3, 1) + y3
323 2031616 : q(4,1) = q(4, 1) + y4
324 2031616 : q(5,1) = q(5, 1) + y5
325 2031616 : q(6,1) = q(6, 1) + y6
326 : #ifndef PACK_REAL_TO_COMPLEX
327 1015808 : q(7 ,1) = q(7, 1) + y7
328 1015808 : q(8 ,1) = q(8, 1) + y8
329 1015808 : q(9 ,1) = q(9, 1) + y9
330 1015808 : q(10,1) = q(10,1) + y10
331 1015808 : q(11,1) = q(11,1) + y11
332 1015808 : q(12,1) = q(12,1) + y12
333 : #endif
334 :
335 2031616 : q(1, 2) = q(1, 2) + x1 + y1 *hh(2,2)
336 2031616 : q(2, 2) = q(2, 2) + x2 + y2 *hh(2,2)
337 2031616 : q(3, 2) = q(3, 2) + x3 + y3 *hh(2,2)
338 2031616 : q(4, 2) = q(4, 2) + x4 + y4 *hh(2,2)
339 2031616 : q(5, 2) = q(5, 2) + x5 + y5 *hh(2,2)
340 2031616 : q(6, 2) = q(6, 2) + x6 + y6 *hh(2,2)
341 : #ifndef PACK_REAL_TO_COMPLEX
342 1015808 : q(7, 2) = q(7, 2) + x7 + y7 *hh(2,2)
343 1015808 : q(8, 2) = q(8, 2) + x8 + y8 *hh(2,2)
344 1015808 : q(9, 2) = q(9, 2) + x9 + y9 *hh(2,2)
345 1015808 : q(10,2) = q(10,2) + x10 + y10*hh(2,2)
346 1015808 : q(11,2) = q(11,2) + x11 + y11*hh(2,2)
347 1015808 : q(12,2) = q(12,2) + x12 + y12*hh(2,2)
348 : #endif
349 :
350 : #ifdef DOUBLE_PRECISION_REAL
351 : #if defined(SSE_ALIGNED)
352 : !DEC$ VECTOR ALIGNED
353 : #endif
354 : #endif
355 :
356 127991808 : do i=3,nb
357 125960192 : h1 = hh(i-1,1)
358 125960192 : h2 = hh(i,2)
359 125960192 : q(1, i) = q(1,i) + x1 *h1 + y1 *h2
360 125960192 : q(2, i) = q(2,i) + x2 *h1 + y2 *h2
361 125960192 : q(3, i) = q(3,i) + x3 *h1 + y3 *h2
362 125960192 : q(4, i) = q(4,i) + x4 *h1 + y4 *h2
363 125960192 : q(5, i) = q(5,i) + x5 *h1 + y5 *h2
364 125960192 : q(6, i) = q(6,i) + x6 *h1 + y6 *h2
365 : #ifndef PACK_REAL_TO_COMPLEX
366 62980096 : q(7, i) = q(7, i) + x7 *h1 + y7 *h2
367 62980096 : q(8, i) = q(8, i) + x8 *h1 + y8 *h2
368 62980096 : q(9, i) = q(9, i) + x9 *h1 + y9 *h2
369 62980096 : q(10,i) = q(10,i) + x10*h1 + y10*h2
370 62980096 : q(11,i) = q(11,i) + x11*h1 + y11*h2
371 62980096 : q(12,i) = q(12,i) + x12*h1 + y12*h2
372 : #endif
373 : enddo
374 :
375 2031616 : q(1, nb+1) = q(1, nb+1) + x1 *hh(nb,1)
376 2031616 : q(2, nb+1) = q(2, nb+1) + x2 *hh(nb,1)
377 2031616 : q(3, nb+1) = q(3, nb+1) + x3 *hh(nb,1)
378 2031616 : q(4, nb+1) = q(4, nb+1) + x4 *hh(nb,1)
379 2031616 : q(5, nb+1) = q(5, nb+1) + x5 *hh(nb,1)
380 2031616 : q(6, nb+1) = q(6, nb+1) + x6 *hh(nb,1)
381 : #ifndef PACK_REAL_TO_COMPLEX
382 1015808 : q(7, nb+1) = q(7, nb+1) + x7 *hh(nb,1)
383 1015808 : q(8, nb+1) = q(8, nb+1) + x8 *hh(nb,1)
384 1015808 : q(9, nb+1) = q(9, nb+1) + x9 *hh(nb,1)
385 1015808 : q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1)
386 1015808 : q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1)
387 1015808 : q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1)
388 : #endif
389 :
390 : ! call obj%timer%stop("kernel generic: hh_trafo_kernel_12_generic" // &
391 : ! &PRECISION_SUFFIX &
392 : ! )
393 :
394 2031616 : end subroutine
395 : ! --------------------------------------------------------------------------------------------------
396 :
397 : subroutine hh_trafo_kernel_8_generic_&
398 180224 : &PRECISION&
399 180224 : & (q, hh, nb, ldq, ldh, s)
400 : use precision
401 : implicit none
402 : integer(kind=ik), intent(in) :: nb, ldq, ldh
403 : #ifdef USE_ASSUMED_SIZE
404 : #ifdef PACK_REAL_TO_COMPLEX
405 : complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
406 : #else
407 : real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
408 : #endif
409 : real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
410 : #else
411 : real(kind=C_DATATYPE_KIND), intent(inout):: q(:,:)
412 : real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,2)
413 : #endif
414 : real(kind=C_DATATYPE_KIND), intent(in) :: s
415 : #ifdef PACK_REAL_TO_COMPLEX
416 : complex(kind=SPECIAL_COMPLEX_DATATYPE) :: x1, x2, x3, x4, y1, y2, y3, y4
417 : #else
418 : real(kind=C_DATATYPE_KIND) :: x1, x2, x3, x4, x5, x6, x7, x8, &
419 : y1, y2, y3, y4, y5, y6, y7, y8
420 : #endif
421 : real(kind=C_DATATYPE_KIND) :: h1, h2, tau1, tau2
422 : integer(kind=ik) :: i
423 :
424 : ! call obj%timer%start("kernel generic: hh_trafo_kernel_8_generic" // &
425 : ! &PRECISION_SUFFIX &
426 : ! )
427 180224 : x1 = q(1,2)
428 180224 : x2 = q(2,2)
429 180224 : x3 = q(3,2)
430 180224 : x4 = q(4,2)
431 : #ifndef PACK_REAL_TO_COMPLEX
432 90112 : x5 = q(5,2)
433 90112 : x6 = q(6,2)
434 90112 : x7 = q(7,2)
435 90112 : x8 = q(8,2)
436 : #endif
437 :
438 180224 : y1 = q(1,1) + q(1,2)*hh(2,2)
439 180224 : y2 = q(2,1) + q(2,2)*hh(2,2)
440 180224 : y3 = q(3,1) + q(3,2)*hh(2,2)
441 180224 : y4 = q(4,1) + q(4,2)*hh(2,2)
442 : #ifndef PACK_REAL_TO_COMPLEX
443 90112 : y5 = q(5,1) + q(5,2)*hh(2,2)
444 90112 : y6 = q(6,1) + q(6,2)*hh(2,2)
445 90112 : y7 = q(7,1) + q(7,2)*hh(2,2)
446 90112 : y8 = q(8,1) + q(8,2)*hh(2,2)
447 : #endif
448 :
449 : #ifdef DOUBLE_PRECISION_REAL
450 : #if defined(SSE_ALIGNED)
451 : !DEC$ VECTOR ALIGNED
452 : #endif
453 : #endif
454 :
455 11354112 : do i=3,nb
456 11173888 : h1 = hh(i-1,1)
457 11173888 : h2 = hh(i,2)
458 11173888 : x1 = x1 + q(1,i)*h1
459 11173888 : y1 = y1 + q(1,i)*h2
460 11173888 : x2 = x2 + q(2,i)*h1
461 11173888 : y2 = y2 + q(2,i)*h2
462 11173888 : x3 = x3 + q(3,i)*h1
463 11173888 : y3 = y3 + q(3,i)*h2
464 11173888 : x4 = x4 + q(4,i)*h1
465 11173888 : y4 = y4 + q(4,i)*h2
466 : #ifndef PACK_REAL_TO_COMPLEX
467 5586944 : x5 = x5 + q(5,i)*h1
468 5586944 : y5 = y5 + q(5,i)*h2
469 5586944 : x6 = x6 + q(6,i)*h1
470 5586944 : y6 = y6 + q(6,i)*h2
471 5586944 : x7 = x7 + q(7,i)*h1
472 5586944 : y7 = y7 + q(7,i)*h2
473 5586944 : x8 = x8 + q(8,i)*h1
474 5586944 : y8 = y8 + q(8,i)*h2
475 : #endif
476 : enddo
477 :
478 180224 : x1 = x1 + q(1,nb+1)*hh(nb,1)
479 180224 : x2 = x2 + q(2,nb+1)*hh(nb,1)
480 180224 : x3 = x3 + q(3,nb+1)*hh(nb,1)
481 180224 : x4 = x4 + q(4,nb+1)*hh(nb,1)
482 : #ifndef PACK_REAL_TO_COMPLEX
483 90112 : x5 = x5 + q(5,nb+1)*hh(nb,1)
484 90112 : x6 = x6 + q(6,nb+1)*hh(nb,1)
485 90112 : x7 = x7 + q(7,nb+1)*hh(nb,1)
486 90112 : x8 = x8 + q(8,nb+1)*hh(nb,1)
487 : #endif
488 :
489 180224 : tau1 = hh(1,1)
490 180224 : tau2 = hh(1,2)
491 :
492 180224 : h1 = -tau1
493 180224 : x1 = x1*h1
494 180224 : x2 = x2*h1
495 180224 : x3 = x3*h1
496 180224 : x4 = x4*h1
497 : #ifndef PACK_REAL_TO_COMPLEX
498 90112 : x5 = x5*h1
499 90112 : x6 = x6*h1
500 90112 : x7 = x7*h1
501 90112 : x8 = x8*h1
502 : #endif
503 180224 : h1 = -tau2
504 180224 : h2 = -tau2*s
505 180224 : y1 = y1*h1 + x1*h2
506 180224 : y2 = y2*h1 + x2*h2
507 180224 : y3 = y3*h1 + x3*h2
508 180224 : y4 = y4*h1 + x4*h2
509 : #ifndef PACK_REAL_TO_COMPLEX
510 90112 : y5 = y5*h1 + x5*h2
511 90112 : y6 = y6*h1 + x6*h2
512 90112 : y7 = y7*h1 + x7*h2
513 90112 : y8 = y8*h1 + x8*h2
514 : #endif
515 180224 : q(1,1) = q(1,1) + y1
516 180224 : q(2,1) = q(2,1) + y2
517 180224 : q(3,1) = q(3,1) + y3
518 180224 : q(4,1) = q(4,1) + y4
519 : #ifndef PACK_REAL_TO_COMPLEX
520 90112 : q(5,1) = q(5,1) + y5
521 90112 : q(6,1) = q(6,1) + y6
522 90112 : q(7,1) = q(7,1) + y7
523 90112 : q(8,1) = q(8,1) + y8
524 : #endif
525 180224 : q(1,2) = q(1,2) + x1 + y1*hh(2,2)
526 180224 : q(2,2) = q(2,2) + x2 + y2*hh(2,2)
527 180224 : q(3,2) = q(3,2) + x3 + y3*hh(2,2)
528 180224 : q(4,2) = q(4,2) + x4 + y4*hh(2,2)
529 : #ifndef PACK_REAL_TO_COMPLEX
530 90112 : q(5,2) = q(5,2) + x5 + y5*hh(2,2)
531 90112 : q(6,2) = q(6,2) + x6 + y6*hh(2,2)
532 90112 : q(7,2) = q(7,2) + x7 + y7*hh(2,2)
533 90112 : q(8,2) = q(8,2) + x8 + y8*hh(2,2)
534 : #endif
535 :
536 : #ifdef DOUBLE_PRECISION_REAL
537 : #if defined(SSE_ALIGNED)
538 : !DEC$ VECTOR ALIGNED
539 : #endif
540 : #endif
541 :
542 11354112 : do i=3,nb
543 11173888 : h1 = hh(i-1,1)
544 11173888 : h2 = hh(i,2)
545 11173888 : q(1,i) = q(1,i) + x1*h1 + y1*h2
546 11173888 : q(2,i) = q(2,i) + x2*h1 + y2*h2
547 11173888 : q(3,i) = q(3,i) + x3*h1 + y3*h2
548 11173888 : q(4,i) = q(4,i) + x4*h1 + y4*h2
549 : #ifndef PACK_REAL_TO_COMPLEX
550 5586944 : q(5,i) = q(5,i) + x5*h1 + y5*h2
551 5586944 : q(6,i) = q(6,i) + x6*h1 + y6*h2
552 5586944 : q(7,i) = q(7,i) + x7*h1 + y7*h2
553 5586944 : q(8,i) = q(8,i) + x8*h1 + y8*h2
554 : #endif
555 : enddo
556 :
557 180224 : q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
558 180224 : q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
559 180224 : q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
560 180224 : q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
561 : #ifndef PACK_REAL_TO_COMPLEX
562 90112 : q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1)
563 90112 : q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1)
564 90112 : q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1)
565 90112 : q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
566 : #endif
567 :
568 :
569 : ! call obj%timer%stop("kernel generic: hh_trafo_kernel_8_generic" // &
570 : ! &PRECISION_SUFFIX &
571 : ! )
572 :
573 180224 : end subroutine
574 : ! --------------------------------------------------------------------------------------------------
575 :
576 : subroutine hh_trafo_kernel_4_generic_&
577 393216 : &PRECISION&
578 393216 : & (q, hh, nb, ldq, ldh, s)
579 :
580 : use precision
581 : implicit none
582 : integer(kind=ik), intent(in) :: nb, ldq, ldh
583 : #ifdef USE_ASSUMED_SIZE
584 : #ifdef PACK_REAL_TO_COMPLEX
585 : complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
586 : #else
587 : real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
588 : #endif
589 : real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
590 : #else
591 : real(kind=C_DATATYPE_KIND), intent(inout) :: q(:,:) !q(1:ldq/2,1:nb+1)
592 : real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,2)
593 : #endif
594 : real(kind=C_DATATYPE_KIND), intent(in) :: s
595 :
596 : #ifdef PACK_REAL_TO_COMPLEX
597 : complex(kind=SPECIAL_COMPLEX_DATATYPE) :: x1, x2, y1, y2
598 : #else
599 : real(kind=C_DATATYPE_KIND) :: x1, x2, x3, x4, y1, y2, y3, y4
600 : #endif
601 : real(kind=C_DATATYPE_KIND) :: h1, h2, tau1, tau2
602 : integer(kind=ik) :: i
603 :
604 : ! call obj%timer%start("kernel generic: hh_trafo_kernel_4_generic" // &
605 : ! &PRECISION_SUFFIX &
606 : ! )
607 393216 : x1 = q(1,2)
608 393216 : x2 = q(2,2)
609 : #ifndef PACK_REAL_TO_COMPLEX
610 196608 : x3 = q(3,2)
611 196608 : x4 = q(4,2)
612 : #endif
613 :
614 393216 : y1 = q(1,1) + q(1,2)*hh(2,2)
615 393216 : y2 = q(2,1) + q(2,2)*hh(2,2)
616 : #ifndef PACK_REAL_TO_COMPLEX
617 196608 : y3 = q(3,1) + q(3,2)*hh(2,2)
618 196608 : y4 = q(4,1) + q(4,2)*hh(2,2)
619 : #endif
620 :
621 : #ifdef DOUBLE_PRECISION_REAL
622 : #if defined(SSE_ALIGNED)
623 : !DEC$ VECTOR ALIGNED
624 : #endif
625 : #endif
626 :
627 24772608 : do i=3,nb
628 24379392 : h1 = hh(i-1,1)
629 24379392 : h2 = hh(i,2)
630 24379392 : x1 = x1 + q(1,i)*h1
631 24379392 : y1 = y1 + q(1,i)*h2
632 24379392 : x2 = x2 + q(2,i)*h1
633 24379392 : y2 = y2 + q(2,i)*h2
634 : #ifndef PACK_REAL_TO_COMPLEX
635 12189696 : x3 = x3 + q(3,i)*h1
636 12189696 : y3 = y3 + q(3,i)*h2
637 12189696 : x4 = x4 + q(4,i)*h1
638 12189696 : y4 = y4 + q(4,i)*h2
639 : #endif
640 : enddo
641 :
642 393216 : x1 = x1 + q(1,nb+1)*hh(nb,1)
643 393216 : x2 = x2 + q(2,nb+1)*hh(nb,1)
644 : #ifndef PACK_REAL_TO_COMPLEX
645 196608 : x3 = x3 + q(3,nb+1)*hh(nb,1)
646 196608 : x4 = x4 + q(4,nb+1)*hh(nb,1)
647 : #endif
648 :
649 393216 : tau1 = hh(1,1)
650 393216 : tau2 = hh(1,2)
651 :
652 393216 : h1 = -tau1
653 393216 : x1 = x1*h1
654 393216 : x2 = x2*h1
655 : #ifndef PACK_REAL_TO_COMPLEX
656 196608 : x3 = x3*h1
657 196608 : x4 = x4*h1
658 : #endif
659 393216 : h1 = -tau2
660 393216 : h2 = -tau2*s
661 393216 : y1 = y1*h1 + x1*h2
662 393216 : y2 = y2*h1 + x2*h2
663 : #ifndef PACK_REAL_TO_COMPLEX
664 196608 : y3 = y3*h1 + x3*h2
665 196608 : y4 = y4*h1 + x4*h2
666 : #endif
667 :
668 393216 : q(1,1) = q(1,1) + y1
669 393216 : q(2,1) = q(2,1) + y2
670 : #ifndef PACK_REAL_TO_COMPLEX
671 196608 : q(3,1) = q(3,1) + y3
672 196608 : q(4,1) = q(4,1) + y4
673 : #endif
674 393216 : q(1,2) = q(1,2) + x1 + y1*hh(2,2)
675 393216 : q(2,2) = q(2,2) + x2 + y2*hh(2,2)
676 : #ifndef PACK_REAL_TO_COMPLEX
677 196608 : q(3,2) = q(3,2) + x3 + y3*hh(2,2)
678 196608 : q(4,2) = q(4,2) + x4 + y4*hh(2,2)
679 : #endif
680 :
681 : #ifdef DOUBLE_PRECISION_REAL
682 : #if defined(SSE_ALIGNED)
683 : !DEC$ VECTOR ALIGNED
684 : #endif
685 : #endif
686 24772608 : do i=3,nb
687 24379392 : h1 = hh(i-1,1)
688 24379392 : h2 = hh(i,2)
689 24379392 : q(1,i) = q(1,i) + x1*h1 + y1*h2
690 24379392 : q(2,i) = q(2,i) + x2*h1 + y2*h2
691 : #ifndef PACK_REAL_TO_COMPLEX
692 12189696 : q(3,i) = q(3,i) + x3*h1 + y3*h2
693 12189696 : q(4,i) = q(4,i) + x4*h1 + y4*h2
694 : #endif
695 : enddo
696 :
697 393216 : q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
698 393216 : q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
699 : #ifndef PACK_REAL_TO_COMPLEX
700 196608 : q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
701 196608 : q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
702 : #endif
703 :
704 : ! call obj%timer%stop("kernel generic: hh_trafo_kernel_4_generic" // &
705 : ! &PRECISION_SUFFIX &
706 : ! )
707 :
708 393216 : end subroutine
|