Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen ,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : !
20 : ! More information can be found here:
21 : ! http://elpa.mpcdf.mpg.de/
22 : !
23 : ! ELPA is free software: you can redistribute it and/or modify
24 : ! it under the terms of the version 3 of the license of the
25 : ! GNU Lesser General Public License as published by the Free
26 : ! Software Foundation.
27 : !
28 : ! ELPA is distributed in the hope that it will be useful,
29 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
30 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 : ! GNU Lesser General Public License for more details.
32 : !
33 : ! You should have received a copy of the GNU Lesser General Public License
34 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
35 : !
36 : ! ELPA reflects a substantial effort on the part of the original
37 : ! ELPA consortium, and we ask you to respect the spirit of the
38 : ! license that we chose: i.e., please contribute any changes you
39 : ! may have back to the original ELPA library distribution, and keep
40 : ! any derivatives of ELPA under the same license that we chose for
41 : ! the original distribution, the GNU Lesser General Public License.
42 : !
43 : ! This file was written by A. Marek, MPCDF
44 : #endif
45 :
46 : subroutine compute_hh_trafo_&
47 : &MATH_DATATYPE&
48 : #ifdef WITH_OPENMP
49 : &_openmp_&
50 : #else
51 : &_&
52 : #endif
53 3491712 : &PRECISION &
54 : (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, &
55 : #ifdef WITH_OPENMP
56 : max_threads, l_nev, &
57 : #endif
58 3491712 : a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
59 : #if REALCASE == 1
60 : hh_dot_dev, &
61 : #endif
62 : hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
63 : #ifdef WITH_OPENMP
64 : my_thread, thread_width, &
65 : #else
66 : last_stripe_width, &
67 : #endif
68 : kernel)
69 :
70 : use precision
71 : use elpa_abstract_impl
72 : use iso_c_binding
73 : #if REALCASE == 1
74 :
75 : use single_hh_trafo_real
76 : #if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
77 : use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
78 : #endif
79 :
80 : #if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
81 : use real_generic_kernel !, only : double_hh_trafo_generic
82 : #endif
83 :
84 : #if defined(WITH_REAL_BGP_KERNEL)
85 : use real_bgp_kernel !, only : double_hh_trafo_bgp
86 : #endif
87 :
88 : #if defined(WITH_REAL_BGQ_KERNEL)
89 : use real_bgq_kernel !, only : double_hh_trafo_bgq
90 : #endif
91 :
92 : #endif /* REALCASE */
93 :
94 : #if COMPLEXCASE == 1
95 :
96 : #if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
97 : use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
98 : #endif
99 : #if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
100 : use complex_generic_kernel !, only : single_hh_trafo_complex_generic
101 : #endif
102 :
103 : #endif /* COMPLEXCASE */
104 :
105 : use cuda_c_kernel
106 : use cuda_functions
107 :
108 : use elpa_generated_fortran_interfaces
109 :
110 : implicit none
111 : class(elpa_abstract_impl_t), intent(inout) :: obj
112 : logical, intent(in) :: useGPU, wantDebug
113 : real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
114 : integer(kind=lik) :: kernel_flops
115 : integer(kind=ik), intent(in) :: nbw, max_blk_size
116 : #if REALCASE == 1
117 : real(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size)
118 : #endif
119 : #if COMPLEXCASE == 1
120 : complex(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size)
121 : #endif
122 : integer(kind=ik), intent(in) :: a_off
123 :
124 : integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count
125 :
126 : #ifndef WITH_OPENMP
127 : integer(kind=ik), intent(in) :: last_stripe_width
128 : #if REALCASE == 1
129 : ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
130 : real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:)
131 : #endif
132 : #if COMPLEXCASE == 1
133 : ! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
134 : complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:)
135 : #endif
136 :
137 : #else /* WITH_OPENMP */
138 : integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
139 : #if REALCASE == 1
140 : ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
141 : real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
142 : #endif
143 : #if COMPLEXCASE == 1
144 : ! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
145 : complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:,:)
146 : #endif
147 :
148 : #endif /* WITH_OPENMP */
149 :
150 : integer(kind=ik), intent(in) :: kernel
151 :
152 : integer(kind=c_intptr_t) :: a_dev
153 : integer(kind=c_intptr_t) :: bcast_buffer_dev
154 : #if REALCASE == 1
155 : integer(kind=c_intptr_t) :: hh_dot_dev ! why not needed in complex case
156 : #endif
157 : integer(kind=c_intptr_t) :: hh_tau_dev
158 : integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2
159 :
160 : ! Private variables in OMP regions (my_thread) should better be in the argument list!
161 : integer(kind=ik) :: off, ncols, istripe
162 : #ifdef WITH_OPENMP
163 : integer(kind=ik) :: my_thread, noff
164 : #endif
165 : integer(kind=ik) :: j, nl, jj, jjj, n_times
166 : #if REALCASE == 1
167 2588544 : real(kind=C_DATATYPE_KIND) :: w(nbw,6)
168 : #endif
169 : #if COMPLEXCASE == 1
170 4394880 : complex(kind=C_DATATYPE_KIND) :: w(nbw,2)
171 : #endif
172 : real(kind=c_double) :: ttt ! MPI_WTIME always needs double
173 :
174 :
175 3491712 : j = -99
176 :
177 3491712 : if (wantDebug) then
178 0 : if (useGPU .and. &
179 : #if REALCASE == 1
180 : ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
181 : #endif
182 : #if COMPLEXCASE == 1
183 : ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
184 : #endif
185 0 : print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
186 0 : stop
187 : endif
188 : endif
189 :
190 : #if REALCASE == 1
191 1294272 : if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
192 : #endif
193 : #if COMPLEXCASE == 1
194 2197440 : if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
195 : #endif
196 : ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
197 0 : if (ncols < 1) then
198 0 : if (wantDebug) then
199 0 : print *, "Returning early from compute_hh_trafo"
200 : endif
201 0 : return
202 : endif
203 : endif
204 :
205 3491712 : if (wantDebug) call obj%timer%start("compute_hh_trafo_&
206 : &MATH_DATATYPE&
207 : #ifdef WITH_OPENMP
208 : &_openmp" // &
209 : #else
210 : &" // &
211 : #endif
212 : &PRECISION_SUFFIX &
213 0 : )
214 :
215 :
216 : #ifdef WITH_OPENMP
217 1745856 : if (my_thread==1) then
218 : #endif
219 3491712 : ttt = mpi_wtime()
220 : #ifdef WITH_OPENMP
221 : endif
222 : #endif
223 :
224 : #ifdef WITH_OPENMP
225 :
226 : #if REALCASE == 1
227 647136 : if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
228 : print *,"compute_hh_trafo_&
229 : &MATH_DATATYPE&
230 0 : &_GPU OPENMP: not yet implemented"
231 0 : stop 1
232 : endif
233 : #endif
234 : #if COMPLEXCASE == 1
235 1098720 : if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
236 : print *,"compute_hh_trafo_&
237 : &MATH_DATATYPE&
238 0 : &_GPU OPENMP: not yet implemented"
239 0 : stop 1
240 : endif
241 : #endif
242 : #endif /* WITH_OPENMP */
243 :
244 : #ifndef WITH_OPENMP
245 1745856 : nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
246 : #else /* WITH_OPENMP */
247 :
248 1745856 : if (istripe<stripe_count) then
249 1428480 : nl = stripe_width
250 : else
251 317376 : noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
252 317376 : nl = min(my_thread*thread_width-noff, l_nev-noff)
253 317376 : if (nl<=0) then
254 0 : if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
255 : &MATH_DATATYPE&
256 : #ifdef WITH_OPENMP
257 : &_openmp" // &
258 : #else
259 : &" // &
260 : #endif
261 : &PRECISION_SUFFIX &
262 0 : )
263 :
264 0 : return
265 : endif
266 : endif
267 : #endif /* not WITH_OPENMP */
268 :
269 : #if REALCASE == 1
270 : ! GPU kernel real
271 1294272 : if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
272 0 : if (wantDebug) then
273 0 : call obj%timer%start("compute_hh_trafo: GPU")
274 : endif
275 : dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
276 : &PRECISION&
277 : &_&
278 0 : &MATH_DATATYPE
279 :
280 : call launch_compute_hh_trafo_gpu_kernel_&
281 : &MATH_DATATYPE&
282 : &_&
283 : &PRECISION&
284 0 : & (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
285 : #endif /* REALCASE */
286 : #if COMPLEXCASE == 1
287 : ! GPU kernel complex
288 2197440 : if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
289 0 : if (wantDebug) then
290 0 : call obj%timer%start("compute_hh_trafo: GPU")
291 : endif
292 :
293 : dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
294 : &PRECISION&
295 : &_&
296 0 : &MATH_DATATYPE
297 :
298 : dev_offset_1 = (0 + ( off-1 )* nbw) * size_of_&
299 : &PRECISION&
300 : &_&
301 0 : &MATH_DATATYPE
302 :
303 : dev_offset_2 =( off-1 )* size_of_&
304 : &PRECISION&
305 : &_&
306 0 : &MATH_DATATYPE
307 :
308 : call launch_compute_hh_trafo_gpu_kernel_&
309 : &MATH_DATATYPE&
310 : &_&
311 : &PRECISION&
312 : & (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
313 0 : hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
314 :
315 :
316 : #endif /* COMPLEXCASE */
317 0 : if (wantDebug) then
318 0 : call obj%timer%stop("compute_hh_trafo: GPU")
319 : endif
320 :
321 : else ! not CUDA kernel
322 :
323 3491712 : if (wantDebug) then
324 0 : call obj%timer%start("compute_hh_trafo: CPU")
325 : endif
326 : #if REALCASE == 1
327 : #ifndef WITH_FIXED_REAL_KERNEL
328 : if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
329 : kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
330 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
331 : kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
332 : kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
333 : kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
334 : kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
335 : kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
336 : kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
337 1294272 : kernel .eq. ELPA_2STAGE_REAL_BGP .or. &
338 : kernel .eq. ELPA_2STAGE_REAL_BGQ) then
339 : #endif /* not WITH_FIXED_REAL_KERNEL */
340 :
341 : #endif /* REALCASE */
342 : !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
343 :
344 : !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
345 : #if REALCASE == 1
346 : ! generic kernel real case
347 : #if defined(WITH_REAL_GENERIC_KERNEL)
348 : #ifndef WITH_FIXED_REAL_KERNEL
349 987840 : if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
350 : #endif /* not WITH_FIXED_REAL_KERNEL */
351 :
352 510720 : do j = ncols, 2, -2
353 466944 : w(:,1) = bcast_buffer(1:nbw,j+off)
354 466944 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
355 :
356 : #ifdef WITH_OPENMP
357 :
358 : #ifdef USE_ASSUMED_SIZE
359 : call double_hh_trafo_&
360 : &MATH_DATATYPE&
361 : &_generic_&
362 : &PRECISION&
363 116736 : & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
364 :
365 : #else
366 : call double_hh_trafo_&
367 : &MATH_DATATYPE&
368 : &_generic_&
369 : &PRECISION&
370 : & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
371 116736 : nbw, nl, stripe_width, nbw)
372 : #endif
373 :
374 : #else /* WITH_OPENMP */
375 :
376 : #ifdef USE_ASSUMED_SIZE
377 : call double_hh_trafo_&
378 : &MATH_DATATYPE&
379 : &_generic_&
380 : &PRECISION&
381 116736 : & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
382 :
383 : #else
384 : call double_hh_trafo_&
385 : &MATH_DATATYPE&
386 : &_generic_&
387 : &PRECISION&
388 116736 : & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
389 : #endif
390 : #endif /* WITH_OPENMP */
391 :
392 : enddo
393 :
394 : #ifndef WITH_FIXED_REAL_KERNEL
395 : endif
396 : #endif /* not WITH_FIXED_REAL_KERNEL */
397 : #endif /* WITH_REAL_GENERIC_KERNEL */
398 :
399 : #endif /* REALCASE == 1 */
400 :
401 : #if COMPLEXCASE == 1
402 : ! generic kernel complex case
403 : #if defined(WITH_COMPLEX_GENERIC_KERNEL)
404 : #ifndef WITH_FIXED_COMPLEX_KERNEL
405 : if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
406 2197440 : kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
407 : kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
408 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
409 72576 : ttt = mpi_wtime()
410 1558656 : do j = ncols, 1, -1
411 : #ifdef WITH_OPENMP
412 : #ifdef USE_ASSUMED_SIZE
413 :
414 : call single_hh_trafo_&
415 : &MATH_DATATYPE&
416 : &_generic_&
417 : &PRECISION&
418 371520 : & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
419 : #else
420 : call single_hh_trafo_&
421 : &MATH_DATATYPE&
422 : &_generic_&
423 : &PRECISION&
424 : & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
425 371520 : bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
426 : #endif
427 :
428 : #else /* WITH_OPENMP */
429 :
430 : #ifdef USE_ASSUMED_SIZE
431 : call single_hh_trafo_&
432 : &MATH_DATATYPE&
433 : &_generic_&
434 : &PRECISION&
435 371520 : & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
436 : #else
437 : call single_hh_trafo_&
438 : &MATH_DATATYPE&
439 : &_generic_&
440 : &PRECISION&
441 : & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
442 371520 : nbw, nl, stripe_width)
443 : #endif
444 : #endif /* WITH_OPENMP */
445 :
446 : enddo
447 : #ifndef WITH_FIXED_COMPLEX_KERNEL
448 : endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
449 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
450 : #endif /* WITH_COMPLEX_GENERIC_KERNEL */
451 :
452 : #endif /* COMPLEXCASE */
453 :
454 : #if REALCASE == 1
455 :
456 :
457 : ! generic simple real kernel
458 : #if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
459 : #ifndef WITH_FIXED_REAL_KERNEL
460 987840 : if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
461 : #endif /* not WITH_FIXED_REAL_KERNEL */
462 645120 : do j = ncols, 2, -2
463 589824 : w(:,1) = bcast_buffer(1:nbw,j+off)
464 589824 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
465 : #ifdef WITH_OPENMP
466 :
467 : #ifdef USE_ASSUMED_SIZE
468 : call double_hh_trafo_&
469 : &MATH_DATATYPE&
470 : &_generic_simple_&
471 : &PRECISION&
472 147456 : & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
473 : #else
474 : call double_hh_trafo_&
475 : &MATH_DATATYPE&
476 : &_generic_simple_&
477 : &PRECISION&
478 147456 : & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
479 :
480 : #endif
481 :
482 : #else /* WITH_OPENMP */
483 :
484 : #ifdef USE_ASSUMED_SIZE
485 : call double_hh_trafo_&
486 : &MATH_DATATYPE&
487 : &_generic_simple_&
488 : &PRECISION&
489 147456 : & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
490 : #else
491 : call double_hh_trafo_&
492 : &MATH_DATATYPE&
493 : &_generic_simple_&
494 : &PRECISION&
495 147456 : & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
496 :
497 : #endif
498 :
499 : #endif /* WITH_OPENMP */
500 :
501 : enddo
502 : #ifndef WITH_FIXED_REAL_KERNEL
503 : endif
504 : #endif /* not WITH_FIXED_REAL_KERNEL */
505 : #endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */
506 :
507 : #endif /* REALCASE */
508 :
509 : #if COMPLEXCASE == 1
510 : ! generic simple complex case
511 :
512 : #if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
513 : #ifndef WITH_FIXED_COMPLEX_KERNEL
514 2197440 : if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
515 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
516 96768 : ttt = mpi_wtime()
517 2078208 : do j = ncols, 1, -1
518 : #ifdef WITH_OPENMP
519 : #ifdef USE_ASSUMED_SIZE
520 : call single_hh_trafo_&
521 : &MATH_DATATYPE&
522 : &_generic_simple_&
523 : &PRECISION&
524 495360 : & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
525 : #else
526 : call single_hh_trafo_&
527 : &MATH_DATATYPE&
528 : &_generic_simple_&
529 : &PRECISION&
530 : & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
531 495360 : nbw, nl, stripe_width)
532 : #endif
533 :
534 : #else /* WITH_OPENMP */
535 :
536 : #ifdef USE_ASSUMED_SIZE
537 : call single_hh_trafo_&
538 : &MATH_DATATYPE&
539 : &_generic_simple_&
540 : &PRECISION&
541 495360 : & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
542 : #else
543 : call single_hh_trafo_&
544 : &MATH_DATATYPE&
545 : &_generic_simple_&
546 : &PRECISION&
547 : & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
548 495360 : nbw, nl, stripe_width)
549 : #endif
550 :
551 : #endif /* WITH_OPENMP */
552 : enddo
553 : #ifndef WITH_FIXED_COMPLEX_KERNEL
554 : endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
555 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
556 : #endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
557 :
558 : #endif /* COMPLEXCASE */
559 :
560 : #if REALCASE == 1
561 : ! sse assembly kernel real case
562 : #if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
563 : #ifndef WITH_FIXED_REAL_KERNEL
564 987840 : if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
565 :
566 : #endif /* not WITH_FIXED_REAL_KERNEL */
567 510720 : do j = ncols, 2, -2
568 466944 : w(:,1) = bcast_buffer(1:nbw,j+off)
569 466944 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
570 : #ifdef WITH_OPENMP
571 : call double_hh_trafo_&
572 : &MATH_DATATYPE&
573 : &_&
574 : &PRECISION&
575 : &_sse_assembly&
576 233472 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
577 : #else
578 : call double_hh_trafo_&
579 : &MATH_DATATYPE&
580 : &_&
581 : &PRECISION&
582 : &_sse_assembly&
583 233472 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
584 : #endif
585 : enddo
586 : #ifndef WITH_FIXED_REAL_KERNEL
587 : endif
588 : #endif /* not WITH_FIXED_REAL_KERNEL */
589 : #endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */
590 :
591 : #endif /* REALCASE */
592 :
593 : #if COMPLEXCASE == 1
594 :
595 : ! sse assembly kernel complex case
596 : #if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
597 : #ifndef WITH_FIXED_COMPLEX_KERNEL
598 2197440 : if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
599 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
600 72576 : ttt = mpi_wtime()
601 1558656 : do j = ncols, 1, -1
602 : #ifdef WITH_OPENMP
603 : call single_hh_trafo_&
604 : &MATH_DATATYPE&
605 : &_&
606 : &PRECISION&
607 : &_sse_assembly&
608 743040 : & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
609 : #else
610 : call single_hh_trafo_&
611 : &MATH_DATATYPE&
612 : &_&
613 : &PRECISION&
614 : &_sse_assembly&
615 743040 : & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
616 : #endif
617 : enddo
618 : #ifndef WITH_FIXED_COMPLEX_KERNEL
619 : endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
620 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
621 : #endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
622 : #endif /* COMPLEXCASE */
623 :
624 : #if REALCASE == 1
625 : ! no sse, vsx, sparc64 block1 real kernel
626 : #endif
627 :
628 : #if COMPLEXCASE == 1
629 :
630 : ! sparc64 block1 complex kernel
631 : #if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
632 : !#ifndef WITH_FIXED_COMPLEX_KERNEL
633 : ! if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
634 : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
635 : !
636 : !#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
637 : ! ttt = mpi_wtime()
638 : ! do j = ncols, 1, -1
639 : !#ifdef WITH_OPENMP
640 : ! call single_hh_trafo_&
641 : ! &MATH_DATATYPE&
642 : ! &_sparc64_1hv_&
643 : ! &PRECISION&
644 : ! & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
645 : !#else
646 : ! call single_hh_trafo_&
647 : ! &MATH_DATATYPE&
648 : ! &_sparc64_1hv_&
649 : ! &PRECISION&
650 : ! & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
651 : !#endif
652 : ! enddo
653 : !#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
654 : !
655 : !#ifndef WITH_FIXED_COMPLEX_KERNEL
656 : ! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
657 : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
658 : #endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */
659 :
660 : #endif /* COMPLEXCASE */
661 :
662 :
663 : #if COMPLEXCASE == 1
664 :
665 : ! vsx block1 complex kernel
666 : #if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
667 : !#ifndef WITH_FIXED_COMPLEX_KERNEL
668 : ! if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
669 : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
670 : !
671 : !#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
672 : ! ttt = mpi_wtime()
673 : ! do j = ncols, 1, -1
674 : !#ifdef WITH_OPENMP
675 : ! call single_hh_trafo_&
676 : ! &MATH_DATATYPE&
677 : ! &_vsx_1hv_&
678 : ! &PRECISION&
679 : ! & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
680 : !#else
681 : ! call single_hh_trafo_&
682 : ! &MATH_DATATYPE&
683 : ! &_vsx_1hv_&
684 : ! &PRECISION&
685 : ! & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
686 : !#endif
687 : ! enddo
688 : !#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
689 : !
690 : !#ifndef WITH_FIXED_COMPLEX_KERNEL
691 : ! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
692 : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
693 : #endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */
694 :
695 : #endif /* COMPLEXCASE */
696 :
697 :
698 : #if COMPLEXCASE == 1
699 :
700 : ! sse block1 complex kernel
701 : #if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
702 : #ifndef WITH_FIXED_COMPLEX_KERNEL
703 2197440 : if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
704 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
705 :
706 : #if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
707 72576 : ttt = mpi_wtime()
708 1558656 : do j = ncols, 1, -1
709 : #ifdef WITH_OPENMP
710 : call single_hh_trafo_&
711 : &MATH_DATATYPE&
712 : &_sse_1hv_&
713 : &PRECISION&
714 743040 : & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
715 : #else
716 : call single_hh_trafo_&
717 : &MATH_DATATYPE&
718 : &_sse_1hv_&
719 : &PRECISION&
720 743040 : & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
721 : #endif
722 : enddo
723 : #endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
724 :
725 : #ifndef WITH_FIXED_COMPLEX_KERNEL
726 : endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
727 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
728 : #endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
729 :
730 : #endif /* COMPLEXCASE */
731 :
732 : #if REALCASE == 1
733 : !no avx block1 real kernel
734 : #endif /* REALCASE */
735 :
736 : #if COMPLEXCASE == 1
737 :
738 : ! avx block1 complex kernel
739 : #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
740 : #ifndef WITH_FIXED_COMPLEX_KERNEL
741 2197440 : if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. &
742 : (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
743 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
744 :
745 : #if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
746 868896 : ttt = mpi_wtime()
747 74396448 : do j = ncols, 1, -1
748 : #ifdef WITH_OPENMP
749 : call single_hh_trafo_&
750 : &MATH_DATATYPE&
751 : &_avx_avx2_1hv_&
752 : &PRECISION&
753 36763776 : & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
754 : #else
755 : call single_hh_trafo_&
756 : &MATH_DATATYPE&
757 : &_avx_avx2_1hv_&
758 : &PRECISION&
759 36763776 : & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
760 : #endif
761 : enddo
762 : #endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
763 :
764 : #ifndef WITH_FIXED_COMPLEX_KERNEL
765 : endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
766 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
767 : #endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */
768 :
769 : #endif /* COMPLEXCASE */
770 :
771 : #if REALCASE == 1
772 : ! no avx512 block1 real kernel
773 : #endif /* REALCASE */
774 :
775 : #if COMPLEXCASE == 1
776 :
777 : ! avx512 block1 complex kernel
778 : #if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
779 : #ifndef WITH_FIXED_COMPLEX_KERNEL
780 1135008 : if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
781 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
782 :
783 : #if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
784 760032 : ttt = mpi_wtime()
785 72058464 : do j = ncols, 1, -1
786 : #ifdef WITH_OPENMP
787 : call single_hh_trafo_&
788 : &MATH_DATATYPE&
789 : &_avx512_1hv_&
790 : &PRECISION&
791 35649216 : & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
792 : #else
793 : call single_hh_trafo_&
794 : &MATH_DATATYPE&
795 : &_avx512_1hv_&
796 : &PRECISION&
797 35649216 : & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
798 : #endif
799 : enddo
800 : #endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
801 :
802 : #ifndef WITH_FIXED_COMPLEX_KERNEL
803 : endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
804 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
805 : #endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL */
806 : #endif /* COMPLEXCASE */
807 :
808 : #if REALCASE == 1
809 : ! implementation of sparc64 block 2 real case
810 : #if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)
811 :
812 : #ifndef WITH_FIXED_REAL_KERNEL
813 : if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2) then
814 :
815 : #endif /* not WITH_FIXED_REAL_KERNEL */
816 :
817 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL))
818 : do j = ncols, 2, -2
819 : w(:,1) = bcast_buffer(1:nbw,j+off)
820 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
821 : #ifdef WITH_OPENMP
822 : call double_hh_trafo_&
823 : &MATH_DATATYPE&
824 : &_sparc64_2hv_&
825 : &PRECISION &
826 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
827 : #else
828 : call double_hh_trafo_&
829 : &MATH_DATATYPE&
830 : &_sparc64_2hv_&
831 : &PRECISION &
832 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
833 : #endif
834 : enddo
835 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)) */
836 :
837 : #ifndef WITH_FIXED_REAL_KERNEL
838 : endif
839 : #endif /* not WITH_FIXED_REAL_KERNEL */
840 : #endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */
841 :
842 : #endif /* REALCASE == 1 */
843 :
844 :
845 : #if REALCASE == 1
846 : ! implementation of vsx block 2 real case
847 : #if defined(WITH_REAL_VSX_BLOCK2_KERNEL)
848 :
849 : #ifndef WITH_FIXED_REAL_KERNEL
850 : if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2) then
851 :
852 : #endif /* not WITH_FIXED_REAL_KERNEL */
853 :
854 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL))
855 : do j = ncols, 2, -2
856 : w(:,1) = bcast_buffer(1:nbw,j+off)
857 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
858 : #ifdef WITH_OPENMP
859 : call double_hh_trafo_&
860 : &MATH_DATATYPE&
861 : &_vsx_2hv_&
862 : &PRECISION &
863 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
864 : #else
865 : call double_hh_trafo_&
866 : &MATH_DATATYPE&
867 : &_vsx_2hv_&
868 : &PRECISION &
869 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
870 : #endif
871 : enddo
872 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL)) */
873 :
874 : #ifndef WITH_FIXED_REAL_KERNEL
875 : endif
876 : #endif /* not WITH_FIXED_REAL_KERNEL */
877 : #endif /* WITH_REAL_VSX_BLOCK2_KERNEL */
878 :
879 : #endif /* REALCASE == 1 */
880 :
881 : #if REALCASE == 1
882 : ! implementation of sse block 2 real case
883 : #if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
884 :
885 : #ifndef WITH_FIXED_REAL_KERNEL
886 987840 : if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then
887 :
888 : #endif /* not WITH_FIXED_REAL_KERNEL */
889 :
890 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
891 510720 : do j = ncols, 2, -2
892 466944 : w(:,1) = bcast_buffer(1:nbw,j+off)
893 466944 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
894 : #ifdef WITH_OPENMP
895 : call double_hh_trafo_&
896 : &MATH_DATATYPE&
897 : &_sse_2hv_&
898 : &PRECISION &
899 233472 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
900 : #else
901 : call double_hh_trafo_&
902 : &MATH_DATATYPE&
903 : &_sse_2hv_&
904 : &PRECISION &
905 233472 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
906 : #endif
907 : enddo
908 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
909 :
910 : #ifndef WITH_FIXED_REAL_KERNEL
911 : endif
912 : #endif /* not WITH_FIXED_REAL_KERNEL */
913 : #endif /* WITH_REAL_SSE_BLOCK2_KERNEL */
914 :
915 : #endif /* REALCASE == 1 */
916 :
917 :
918 : #if COMPLEXCASE == 1
919 : ! implementation of sparc64 block 2 complex case
920 :
921 : #if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
922 : !#ifndef WITH_FIXED_COMPLEX_KERNEL
923 : ! if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
924 : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
925 : !
926 : ! ttt = mpi_wtime()
927 : ! do j = ncols, 2, -2
928 : ! w(:,1) = bcast_buffer(1:nbw,j+off)
929 : ! w(:,2) = bcast_buffer(1:nbw,j+off-1)
930 : !#ifdef WITH_OPENMP
931 : ! call double_hh_trafo_&
932 : ! &MATH_DATATYPE&
933 : ! &_sparc64_2hv_&
934 : ! &PRECISION&
935 : ! & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
936 : !#else
937 : ! call double_hh_trafo_&
938 : ! &MATH_DATATYPE&
939 : ! &_sparc64_2hv_&
940 : ! &PRECISION&
941 : ! & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
942 : !#endif
943 : ! enddo
944 : !#ifdef WITH_OPENMP
945 : ! if (j==1) call single_hh_trafo_&
946 : ! &MATH_DATATYPE&
947 : ! &_sparc64_1hv_&
948 : ! &PRECISION&
949 : ! & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
950 : !#else
951 : ! if (j==1) call single_hh_trafo_&
952 : ! &MATH_DATATYPE&
953 : ! &_sparc64_1hv_&
954 : ! &PRECISION&
955 : ! & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
956 : !#endif
957 : !
958 : !#ifndef WITH_FIXED_COMPLEX_KERNEL
959 : ! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
960 : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
961 : #endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
962 : #endif /* COMPLEXCASE == 1 */
963 :
964 :
965 : #if COMPLEXCASE == 1
966 : ! implementation of vsx block 2 complex case
967 :
968 : #if defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)
969 : !#ifndef WITH_FIXED_COMPLEX_KERNEL
970 : ! if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2) then
971 : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
972 : !
973 : ! ttt = mpi_wtime()
974 : ! do j = ncols, 2, -2
975 : ! w(:,1) = bcast_buffer(1:nbw,j+off)
976 : ! w(:,2) = bcast_buffer(1:nbw,j+off-1)
977 : !#ifdef WITH_OPENMP
978 : ! call double_hh_trafo_&
979 : ! &MATH_DATATYPE&
980 : ! &_vsx_2hv_&
981 : ! &PRECISION&
982 : ! & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
983 : !#else
984 : ! call double_hh_trafo_&
985 : ! &MATH_DATATYPE&
986 : ! &_vsx_2hv_&
987 : ! &PRECISION&
988 : ! & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
989 : !#endif
990 : ! enddo
991 : !#ifdef WITH_OPENMP
992 : ! if (j==1) call single_hh_trafo_&
993 : ! &MATH_DATATYPE&
994 : ! &_vsx_1hv_&
995 : ! &PRECISION&
996 : ! & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
997 : !#else
998 : ! if (j==1) call single_hh_trafo_&
999 : ! &MATH_DATATYPE&
1000 : ! &_vsx_1hv_&
1001 : ! &PRECISION&
1002 : ! & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1003 : !#endif
1004 : !
1005 : !#ifndef WITH_FIXED_COMPLEX_KERNEL
1006 : ! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
1007 : !#endif /* not WITH_FIXED_COMPLEX_KERNEL */
1008 : #endif /* WITH_COMPLEX_VSX_BLOCK2_KERNEL */
1009 : #endif /* COMPLEXCASE == 1 */
1010 :
1011 : #if COMPLEXCASE == 1
1012 : ! implementation of sse block 2 complex case
1013 :
1014 : #if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
1015 : #ifndef WITH_FIXED_COMPLEX_KERNEL
1016 2197440 : if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2) then
1017 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
1018 :
1019 72576 : ttt = mpi_wtime()
1020 808704 : do j = ncols, 2, -2
1021 736128 : w(:,1) = bcast_buffer(1:nbw,j+off)
1022 736128 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1023 : #ifdef WITH_OPENMP
1024 : call double_hh_trafo_&
1025 : &MATH_DATATYPE&
1026 : &_sse_2hv_&
1027 : &PRECISION&
1028 368064 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1029 : #else
1030 : call double_hh_trafo_&
1031 : &MATH_DATATYPE&
1032 : &_sse_2hv_&
1033 : &PRECISION&
1034 368064 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1035 : #endif
1036 : enddo
1037 : #ifdef WITH_OPENMP
1038 36288 : if (j==1) call single_hh_trafo_&
1039 : &MATH_DATATYPE&
1040 : &_sse_1hv_&
1041 : &PRECISION&
1042 6912 : & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1043 : #else
1044 36288 : if (j==1) call single_hh_trafo_&
1045 : &MATH_DATATYPE&
1046 : &_sse_1hv_&
1047 : &PRECISION&
1048 6912 : & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1049 : #endif
1050 :
1051 : #ifndef WITH_FIXED_COMPLEX_KERNEL
1052 : endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2)
1053 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
1054 : #endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
1055 : #endif /* COMPLEXCASE == 1 */
1056 :
1057 : #if REALCASE == 1
1058 : ! implementation of avx block 2 real case
1059 :
1060 : #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
1061 : #ifndef WITH_FIXED_REAL_KERNEL
1062 :
1063 987840 : if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2) .or. &
1064 : (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2)) then
1065 :
1066 : #endif /* not WITH_FIXED_REAL_KERNEL */
1067 :
1068 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
1069 17778672 : do j = ncols, 2, -2
1070 17345232 : w(:,1) = bcast_buffer(1:nbw,j+off)
1071 17345232 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1072 : #ifdef WITH_OPENMP
1073 :
1074 : call double_hh_trafo_&
1075 : &MATH_DATATYPE&
1076 : &_avx_avx2_2hv_&
1077 : &PRECISION&
1078 8672616 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1079 : #else
1080 : call double_hh_trafo_&
1081 : &MATH_DATATYPE&
1082 : &_avx_avx2_2hv_&
1083 : &PRECISION&
1084 8672616 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1085 : #endif
1086 : enddo
1087 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
1088 :
1089 : #ifndef WITH_FIXED_REAL_KERNEL
1090 : endif
1091 : #endif /* not WITH_FIXED_REAL_KERNEL */
1092 : #endif /* WITH_REAL_AVX_BLOCK2_KERNEL || WITH_REAL_AVX2_BLOCK2_KERNEL */
1093 :
1094 : #endif /* REALCASE */
1095 :
1096 : #if COMPLEXCASE == 1
1097 :
1098 : ! implementation of avx block 2 complex case
1099 : #if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
1100 : #ifndef WITH_FIXED_COMPLEX_KERNEL
1101 2197440 : if ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. &
1102 : (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) ) then
1103 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
1104 :
1105 145152 : ttt = mpi_wtime()
1106 1617408 : do j = ncols, 2, -2
1107 1472256 : w(:,1) = bcast_buffer(1:nbw,j+off)
1108 1472256 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1109 : #ifdef WITH_OPENMP
1110 : call double_hh_trafo_&
1111 : &MATH_DATATYPE&
1112 : &_avx_avx2_2hv_&
1113 : &PRECISION&
1114 736128 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1115 : #else
1116 : call double_hh_trafo_&
1117 : &MATH_DATATYPE&
1118 : &_avx_avx2_2hv_&
1119 : &PRECISION&
1120 736128 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1121 : #endif
1122 : enddo
1123 : #ifdef WITH_OPENMP
1124 72576 : if (j==1) call single_hh_trafo_&
1125 : &MATH_DATATYPE&
1126 : &_avx_avx2_1hv_&
1127 : &PRECISION&
1128 13824 : & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1129 : #else
1130 72576 : if (j==1) call single_hh_trafo_&
1131 : &MATH_DATATYPE&
1132 : &_avx_avx2_1hv_&
1133 : &PRECISION&
1134 13824 : & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1135 : #endif
1136 :
1137 : #ifndef WITH_FIXED_COMPLEX_KERNEL
1138 : endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) )
1139 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
1140 : #endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL */
1141 :
1142 : #endif /* COMPLEXCASE */
1143 :
1144 : #if REALCASE == 1
1145 : ! implementation of avx512 block 2 real case
1146 :
1147 : #if defined(WITH_REAL_AVX512_BLOCK2_KERNEL)
1148 : #ifndef WITH_FIXED_REAL_KERNEL
1149 :
1150 504864 : if ((kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2)) then
1151 :
1152 : #endif /* not WITH_FIXED_REAL_KERNEL */
1153 :
1154 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL))
1155 17012592 : do j = ncols, 2, -2
1156 16644816 : w(:,1) = bcast_buffer(1:nbw,j+off)
1157 16644816 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1158 : #ifdef WITH_OPENMP
1159 :
1160 : call double_hh_trafo_&
1161 : &MATH_DATATYPE&
1162 : &_avx512_2hv_&
1163 : &PRECISION&
1164 8322408 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1165 : #else
1166 : call double_hh_trafo_&
1167 : &MATH_DATATYPE&
1168 : &_avx512_2hv_&
1169 : &PRECISION&
1170 8322408 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1171 : #endif
1172 : enddo
1173 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
1174 :
1175 : #ifndef WITH_FIXED_REAL_KERNEL
1176 : endif
1177 : #endif /* not WITH_FIXED_REAL_KERNEL */
1178 : #endif /* WITH_REAL_AVX512_BLOCK2_KERNEL */
1179 :
1180 : #endif /* REALCASE */
1181 :
1182 : #if COMPLEXCASE == 1
1183 :
1184 : ! implementation of avx512 block 2 complex case
1185 : #if defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
1186 : #ifndef WITH_FIXED_COMPLEX_KERNEL
1187 1135008 : if ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2)) then
1188 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
1189 :
1190 36288 : ttt = mpi_wtime()
1191 404352 : do j = ncols, 2, -2
1192 368064 : w(:,1) = bcast_buffer(1:nbw,j+off)
1193 368064 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1194 : #ifdef WITH_OPENMP
1195 : call double_hh_trafo_&
1196 : &MATH_DATATYPE&
1197 : &_avx512_2hv_&
1198 : &PRECISION&
1199 184032 : & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1200 : #else
1201 : call double_hh_trafo_&
1202 : &MATH_DATATYPE&
1203 : &_avx512_2hv_&
1204 : &PRECISION&
1205 184032 : & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1206 : #endif
1207 : enddo
1208 : #ifdef WITH_OPENMP
1209 18144 : if (j==1) call single_hh_trafo_&
1210 : &MATH_DATATYPE&
1211 : &_avx512_1hv_&
1212 : &PRECISION&
1213 3456 : & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1214 : #else
1215 18144 : if (j==1) call single_hh_trafo_&
1216 : &MATH_DATATYPE&
1217 : &_avx512_1hv_&
1218 : &PRECISION&
1219 3456 : & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1220 : #endif
1221 :
1222 : #ifndef WITH_FIXED_COMPLEX_KERNEL
1223 : endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2))
1224 : #endif /* not WITH_FIXED_COMPLEX_KERNEL */
1225 : #endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */
1226 : #endif /* COMPLEXCASE */
1227 :
1228 :
1229 : #if REALCASE == 1
1230 :
1231 : #if defined(WITH_REAL_BGP_KERNEL)
1232 : #ifndef WITH_FIXED_REAL_KERNEL
1233 : if (kernel .eq. ELPA_2STAGE_REAL_BGP) then
1234 :
1235 : #endif /* not WITH_FIXED_REAL_KERNEL */
1236 : do j = ncols, 2, -2
1237 : w(:,1) = bcast_buffer(1:nbw,j+off)
1238 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1239 : #ifdef WITH_OPENMP
1240 : call double_hh_trafo_bgp_&
1241 : &PRECISION&
1242 : & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
1243 : #else
1244 : call double_hh_trafo_bgp_&
1245 : &PRECISION&
1246 : & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
1247 : #endif
1248 : enddo
1249 : #ifndef WITH_FIXED_REAL_KERNEL
1250 : endif
1251 : #endif /* not WITH_FIXED_REAL_KERNEL */
1252 : #endif /* WITH_REAL_BGP_KERNEL */
1253 :
1254 : #if defined(WITH_REAL_BGQ_KERNEL)
1255 : #ifndef WITH_FIXED_REAL_KERNEL
1256 : if (kernel .eq. ELPA_2STAGE_REAL_BGQ) then
1257 :
1258 : #endif /* not WITH_FIXED_REAL_KERNEL */
1259 : do j = ncols, 2, -2
1260 : w(:,1) = bcast_buffer(1:nbw,j+off)
1261 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1262 : #ifdef WITH_OPENMP
1263 : call double_hh_trafo_bgq_&
1264 : &PRECISION&
1265 : & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
1266 : #else
1267 : call double_hh_trafo_bgq_&PRECISION&
1268 : & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
1269 : #endif
1270 : enddo
1271 : #ifndef WITH_FIXED_REAL_KERNEL
1272 : endif
1273 : #endif /* not WITH_FIXED_REAL_KERNEL */
1274 : #endif /* WITH_REAL_BGQ_KERNEL */
1275 :
1276 : #endif /* REALCASE */
1277 :
1278 : #if COMPLEXCASE == 1
1279 : ! complex bgp/bgq kernel implemented
1280 : #endif
1281 :
1282 :
1283 : #if REALCASE == 1
1284 : #ifdef WITH_OPENMP
1285 493920 : if (j==1) call single_hh_trafo_&
1286 : &MATH_DATATYPE&
1287 : &_cpu_openmp_&
1288 : &PRECISION&
1289 : & (a(1:stripe_width, 1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
1290 39168 : bcast_buffer(1:nbw,off+1), nbw, nl,stripe_width)
1291 : #else
1292 493920 : if (j==1) call single_hh_trafo_&
1293 : &MATH_DATATYPE&
1294 : &_cpu_&
1295 : &PRECISION&
1296 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
1297 39168 : stripe_width)
1298 : #endif
1299 :
1300 : #endif /* REALCASE == 1 */
1301 :
1302 : #if REALCASE == 1
1303 : #ifndef WITH_FIXED_REAL_KERNEL
1304 : endif !
1305 : #endif /* not WITH_FIXED_REAL_KERNEL */
1306 : #endif /* REALCASE == 1 */
1307 :
1308 : #if REALCASE == 1
1309 : ! sparc64 block4 real kernel
1310 :
1311 : #if defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)
1312 : #ifndef WITH_FIXED_REAL_KERNEL
1313 : if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4) then
1314 :
1315 : #endif /* not WITH_FIXED_REAL_KERNEL */
1316 :
1317 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL))
1318 : ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
1319 : do j = ncols, 4, -4
1320 : w(:,1) = bcast_buffer(1:nbw,j+off)
1321 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1322 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1323 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1324 : #ifdef WITH_OPENMP
1325 : call quad_hh_trafo_&
1326 : &MATH_DATATYPE&
1327 : &_sparc64_4hv_&
1328 : &PRECISION&
1329 : & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1330 : #else
1331 : call quad_hh_trafo_&
1332 : &MATH_DATATYPE&
1333 : &_sparc64_4hv_&
1334 : &PRECISION&
1335 : & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
1336 : #endif
1337 : enddo
1338 : do jj = j, 2, -2
1339 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1340 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1341 : #ifdef WITH_OPENMP
1342 : call double_hh_trafo_&
1343 : &MATH_DATATYPE&
1344 : &_sparc64_2hv_&
1345 : &PRECISION&
1346 : & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1347 : #else
1348 : call double_hh_trafo_&
1349 : &MATH_DATATYPE&
1350 : &_sparc64_2hv_&
1351 : &PRECISION&
1352 : & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1353 : #endif
1354 : enddo
1355 : #ifdef WITH_OPENMP
1356 : if (jj==1) call single_hh_trafo_&
1357 : &MATH_DATATYPE&
1358 : &_cpu_openmp_&
1359 : &PRECISION&
1360 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
1361 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1362 : #else
1363 : if (jj==1) call single_hh_trafo_&
1364 : &MATH_DATATYPE&
1365 : &_cpu_&
1366 : &PRECISION&
1367 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1368 : #endif
1369 :
1370 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)) */
1371 :
1372 : #ifndef WITH_FIXED_REAL_KERNEL
1373 : endif
1374 : #endif /* not WITH_FIXED_REAL_KERNEL */
1375 : #endif /* WITH_REAL_SPARC64_BLOCK4_KERNEL */
1376 :
1377 : #endif /* REALCASE */
1378 :
1379 : #if REALCASE == 1
1380 : ! vsx block4 real kernel
1381 :
1382 : #if defined(WITH_REAL_VSX_BLOCK4_KERNEL)
1383 : #ifndef WITH_FIXED_REAL_KERNEL
1384 : if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4) then
1385 :
1386 : #endif /* not WITH_FIXED_REAL_KERNEL */
1387 :
1388 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL))
1389 : ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
1390 : do j = ncols, 4, -4
1391 : w(:,1) = bcast_buffer(1:nbw,j+off)
1392 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1393 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1394 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1395 : #ifdef WITH_OPENMP
1396 : call quad_hh_trafo_&
1397 : &MATH_DATATYPE&
1398 : &_vsx_4hv_&
1399 : &PRECISION&
1400 : & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1401 : #else
1402 : call quad_hh_trafo_&
1403 : &MATH_DATATYPE&
1404 : &_vsx_4hv_&
1405 : &PRECISION&
1406 : & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
1407 : #endif
1408 : enddo
1409 : do jj = j, 2, -2
1410 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1411 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1412 : #ifdef WITH_OPENMP
1413 : call double_hh_trafo_&
1414 : &MATH_DATATYPE&
1415 : &_vsx_2hv_&
1416 : &PRECISION&
1417 : & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1418 : #else
1419 : call double_hh_trafo_&
1420 : &MATH_DATATYPE&
1421 : &_vsx_2hv_&
1422 : &PRECISION&
1423 : & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1424 : #endif
1425 : enddo
1426 : #ifdef WITH_OPENMP
1427 : if (jj==1) call single_hh_trafo_&
1428 : &MATH_DATATYPE&
1429 : &_cpu_openmp_&
1430 : &PRECISION&
1431 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
1432 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1433 : #else
1434 : if (jj==1) call single_hh_trafo_&
1435 : &MATH_DATATYPE&
1436 : &_cpu_&
1437 : &PRECISION&
1438 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1439 : #endif
1440 :
1441 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL)) */
1442 :
1443 : #ifndef WITH_FIXED_REAL_KERNEL
1444 : endif
1445 : #endif /* not WITH_FIXED_REAL_KERNEL */
1446 : #endif /* WITH_REAL_VSX_BLOCK4_KERNEL */
1447 :
1448 : #endif /* REALCASE */
1449 :
1450 : #if REALCASE == 1
1451 : ! sse block4 real kernel
1452 :
1453 : #if defined(WITH_REAL_SSE_BLOCK4_KERNEL)
1454 : #ifndef WITH_FIXED_REAL_KERNEL
1455 1294272 : if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK4) then
1456 :
1457 : #endif /* not WITH_FIXED_REAL_KERNEL */
1458 :
1459 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL))
1460 : ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
1461 269952 : do j = ncols, 4, -4
1462 226176 : w(:,1) = bcast_buffer(1:nbw,j+off)
1463 226176 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1464 226176 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1465 226176 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1466 : #ifdef WITH_OPENMP
1467 : call quad_hh_trafo_&
1468 : &MATH_DATATYPE&
1469 : &_sse_4hv_&
1470 : &PRECISION&
1471 113088 : & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1472 : #else
1473 : call quad_hh_trafo_&
1474 : &MATH_DATATYPE&
1475 : &_sse_4hv_&
1476 : &PRECISION&
1477 113088 : & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
1478 : #endif
1479 : enddo
1480 58368 : do jj = j, 2, -2
1481 14592 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1482 14592 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1483 : #ifdef WITH_OPENMP
1484 : call double_hh_trafo_&
1485 : &MATH_DATATYPE&
1486 : &_sse_2hv_&
1487 : &PRECISION&
1488 7296 : & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1489 : #else
1490 : call double_hh_trafo_&
1491 : &MATH_DATATYPE&
1492 : &_sse_2hv_&
1493 : &PRECISION&
1494 7296 : & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1495 : #endif
1496 : enddo
1497 : #ifdef WITH_OPENMP
1498 21888 : if (jj==1) call single_hh_trafo_&
1499 : &MATH_DATATYPE&
1500 : &_cpu_openmp_&
1501 : &PRECISION&
1502 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
1503 3648 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1504 : #else
1505 21888 : if (jj==1) call single_hh_trafo_&
1506 : &MATH_DATATYPE&
1507 : &_cpu_&
1508 : &PRECISION&
1509 3648 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1510 : #endif
1511 :
1512 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */
1513 :
1514 : #ifndef WITH_FIXED_REAL_KERNEL
1515 : endif
1516 : #endif /* not WITH_FIXED_REAL_KERNEL */
1517 : #endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
1518 :
1519 : #endif /* REALCASE */
1520 :
1521 : #if COMPLEXCASE == 1
1522 : !no sse block4 complex kernel
1523 : #endif /* COMPLEXCASE */
1524 :
1525 : #if REALCASE == 1
1526 : ! avx block4 real kernel
1527 : #if defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL)
1528 : #ifndef WITH_FIXED_REAL_KERNEL
1529 1294272 : if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK4) .or. &
1530 : (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK4)) then
1531 :
1532 : #endif /* not WITH_FIXED_REAL_KERNEL */
1533 :
1534 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
1535 : ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
1536 539904 : do j = ncols, 4, -4
1537 452352 : w(:,1) = bcast_buffer(1:nbw,j+off)
1538 452352 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1539 452352 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1540 452352 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1541 : #ifdef WITH_OPENMP
1542 : call quad_hh_trafo_&
1543 : &MATH_DATATYPE&
1544 : &_avx_avx2_4hv_&
1545 : &PRECISION&
1546 226176 : & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1547 : #else
1548 : call quad_hh_trafo_&
1549 : &MATH_DATATYPE&
1550 : &_avx_avx2_4hv_&
1551 : &PRECISION&
1552 226176 : & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
1553 : #endif
1554 : enddo
1555 116736 : do jj = j, 2, -2
1556 29184 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1557 29184 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1558 : #ifdef WITH_OPENMP
1559 : call double_hh_trafo_&
1560 : &MATH_DATATYPE&
1561 : &_avx_avx2_2hv_&
1562 : &PRECISION&
1563 14592 : & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)),w, nbw, nl, stripe_width, nbw)
1564 : #else
1565 : call double_hh_trafo_&
1566 : &MATH_DATATYPE&
1567 : &_avx_avx2_2hv_&
1568 : &PRECISION&
1569 14592 : & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1570 : #endif
1571 : enddo
1572 : #ifdef WITH_OPENMP
1573 43776 : if (jj==1) call single_hh_trafo_&
1574 : &MATH_DATATYPE&
1575 : &_cpu_openmp_&
1576 : &PRECISION&
1577 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
1578 7296 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1579 : #else
1580 43776 : if (jj==1) call single_hh_trafo_&
1581 : &MATH_DATATYPE&
1582 : &_cpu_&
1583 : &PRECISION&
1584 7296 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1585 : #endif
1586 :
1587 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */
1588 :
1589 : #ifndef WITH_FIXED_REAL_KERNEL
1590 : endif
1591 : #endif /* not WITH_FIXED_REAL_KERNEL */
1592 : #endif /* WITH_REAL_AVX_BLOCK4_KERNEL || WITH_REAL_AVX2_BLOCK4_KERNEL */
1593 :
1594 : #endif /* REALCASE */
1595 :
1596 : #if COMPLEXCASE == 1
1597 : !no avx block4 complex kernel
1598 : #endif
1599 :
1600 : #if REALCASE == 1
1601 : ! avx512 block4 real kernel
1602 :
1603 : #if defined(WITH_REAL_AVX512_BLOCK4_KERNEL)
1604 : #ifndef WITH_FIXED_REAL_KERNEL
1605 679968 : if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4) then
1606 : #endif /* not WITH_FIXED_REAL_KERNEL */
1607 :
1608 : #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL))
1609 : ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
1610 134976 : do j = ncols, 4, -4
1611 113088 : w(:,1) = bcast_buffer(1:nbw,j+off)
1612 113088 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1613 113088 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1614 113088 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1615 : #ifdef WITH_OPENMP
1616 : call quad_hh_trafo_&
1617 : &MATH_DATATYPE&
1618 : &_avx512_4hv_&
1619 : &PRECISION&
1620 56544 : & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1621 : #else
1622 : call quad_hh_trafo_&
1623 : &MATH_DATATYPE&
1624 : &_avx512_4hv_&
1625 : &PRECISION&
1626 56544 : & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
1627 : #endif
1628 : enddo
1629 29184 : do jj = j, 2, -2
1630 7296 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1631 7296 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1632 : #ifdef WITH_OPENMP
1633 : call double_hh_trafo_&
1634 : &MATH_DATATYPE&
1635 : &_avx512_2hv_&
1636 : &PRECISION&
1637 3648 : & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1638 : #else
1639 : call double_hh_trafo_&
1640 : &MATH_DATATYPE&
1641 : &_avx512_2hv_&
1642 : &PRECISION&
1643 3648 : & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1644 : #endif
1645 : enddo
1646 : #ifdef WITH_OPENMP
1647 10944 : if (jj==1) call single_hh_trafo_&
1648 : &MATH_DATATYPE&
1649 : &_cpu_openmp_&
1650 : &PRECISION&
1651 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
1652 1824 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1653 : #else
1654 10944 : if (jj==1) call single_hh_trafo_&
1655 : &MATH_DATATYPE&
1656 : &_cpu_&
1657 : &PRECISION&
1658 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
1659 1824 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1660 : #endif
1661 :
1662 : #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) ) */
1663 :
1664 : #ifndef WITH_FIXED_REAL_KERNEL
1665 : endif
1666 : #endif /* not WITH_FIXED_REAL_KERNEL */
1667 : #endif /* WITH_REAL_AVX512_BLOCK4_KERNEL */
1668 :
1669 : #endif /* REALCASE */
1670 :
1671 : #if COMPLEXCASE == 1
1672 : !no avx512 block4 complex kernel
1673 : #endif /* COMPLEXCASE */
1674 :
1675 :
1676 : #if REALCASE == 1
1677 : !sparc64 block6 real kernel
1678 : #if defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)
1679 : #ifndef WITH_FIXED_REAL_KERNEL
1680 : if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6) then
1681 :
1682 : #endif /* not WITH_FIXED_REAL_KERNEL */
1683 : ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
1684 : do j = ncols, 6, -6
1685 : w(:,1) = bcast_buffer(1:nbw,j+off)
1686 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1687 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1688 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1689 : w(:,5) = bcast_buffer(1:nbw,j+off-4)
1690 : w(:,6) = bcast_buffer(1:nbw,j+off-5)
1691 : #ifdef WITH_OPENMP
1692 : call hexa_hh_trafo_&
1693 : &MATH_DATATYPE&
1694 : &_sparc64_6hv_&
1695 : &PRECISION&
1696 : & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1697 : #else
1698 : call hexa_hh_trafo_&
1699 : &MATH_DATATYPE&
1700 : &_sparc64_6hv_&
1701 : &PRECISION&
1702 : & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
1703 : #endif
1704 : enddo
1705 : do jj = j, 4, -4
1706 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1707 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1708 : w(:,3) = bcast_buffer(1:nbw,jj+off-2)
1709 : w(:,4) = bcast_buffer(1:nbw,jj+off-3)
1710 : #ifdef WITH_OPENMP
1711 : call quad_hh_trafo_&
1712 : &MATH_DATATYPE&
1713 : &_sparc64_4hv_&
1714 : &PRECISION&
1715 : & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1716 : #else
1717 : call quad_hh_trafo_&
1718 : &MATH_DATATYPE&
1719 : &_sparc64_4hv_&
1720 : &PRECISION&
1721 : & (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
1722 : nbw, nl, stripe_width, nbw)
1723 : #endif
1724 : enddo
1725 : do jjj = jj, 2, -2
1726 : w(:,1) = bcast_buffer(1:nbw,jjj+off)
1727 : w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
1728 : #ifdef WITH_OPENMP
1729 : call double_hh_trafo_&
1730 : &MATH_DATATYPE&
1731 : &_sparc64_2hv_&
1732 : &PRECISION&
1733 : & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1734 : #else
1735 : call double_hh_trafo_&
1736 : &MATH_DATATYPE&
1737 : &_sparc64_2hv_&
1738 : &PRECISION&
1739 : & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1740 : #endif
1741 : enddo
1742 : #ifdef WITH_OPENMP
1743 : if (jjj==1) call single_hh_trafo_&
1744 : &MATH_DATATYPE&
1745 : &_cpu_openmp_&
1746 : &PRECISION&
1747 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
1748 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1749 : #else
1750 : if (jjj==1) call single_hh_trafo_&
1751 : &MATH_DATATYPE&
1752 : &_cpu_&
1753 : &PRECISION&
1754 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1755 : #endif
1756 : #ifndef WITH_FIXED_REAL_KERNEL
1757 : endif
1758 : #endif /* not WITH_FIXED_REAL_KERNEL */
1759 : #endif /* WITH_REAL_SPARC64_BLOCK6_KERNEL */
1760 :
1761 : #endif /* REALCASE */
1762 :
1763 : #if REALCASE == 1
1764 : !vsx block6 real kernel
1765 : #if defined(WITH_REAL_VSX_BLOCK6_KERNEL)
1766 : #ifndef WITH_FIXED_REAL_KERNEL
1767 : if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6) then
1768 :
1769 : #endif /* not WITH_FIXED_REAL_KERNEL */
1770 : ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
1771 : do j = ncols, 6, -6
1772 : w(:,1) = bcast_buffer(1:nbw,j+off)
1773 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1774 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1775 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1776 : w(:,5) = bcast_buffer(1:nbw,j+off-4)
1777 : w(:,6) = bcast_buffer(1:nbw,j+off-5)
1778 : #ifdef WITH_OPENMP
1779 : call hexa_hh_trafo_&
1780 : &MATH_DATATYPE&
1781 : &_vsx_6hv_&
1782 : &PRECISION&
1783 : & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1784 : #else
1785 : call hexa_hh_trafo_&
1786 : &MATH_DATATYPE&
1787 : &_vsx_6hv_&
1788 : &PRECISION&
1789 : & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
1790 : #endif
1791 : enddo
1792 : do jj = j, 4, -4
1793 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1794 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1795 : w(:,3) = bcast_buffer(1:nbw,jj+off-2)
1796 : w(:,4) = bcast_buffer(1:nbw,jj+off-3)
1797 : #ifdef WITH_OPENMP
1798 : call quad_hh_trafo_&
1799 : &MATH_DATATYPE&
1800 : &_vsx_4hv_&
1801 : &PRECISION&
1802 : & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1803 : #else
1804 : call quad_hh_trafo_&
1805 : &MATH_DATATYPE&
1806 : &_vsx_4hv_&
1807 : &PRECISION&
1808 : & (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
1809 : nbw, nl, stripe_width, nbw)
1810 : #endif
1811 : enddo
1812 : do jjj = jj, 2, -2
1813 : w(:,1) = bcast_buffer(1:nbw,jjj+off)
1814 : w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
1815 : #ifdef WITH_OPENMP
1816 : call double_hh_trafo_&
1817 : &MATH_DATATYPE&
1818 : &_vsx_2hv_&
1819 : &PRECISION&
1820 : & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1821 : #else
1822 : call double_hh_trafo_&
1823 : &MATH_DATATYPE&
1824 : &_vsx_2hv_&
1825 : &PRECISION&
1826 : & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1827 : #endif
1828 : enddo
1829 : #ifdef WITH_OPENMP
1830 : if (jjj==1) call single_hh_trafo_&
1831 : &MATH_DATATYPE&
1832 : &_cpu_openmp_&
1833 : &PRECISION&
1834 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
1835 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1836 : #else
1837 : if (jjj==1) call single_hh_trafo_&
1838 : &MATH_DATATYPE&
1839 : &_cpu_&
1840 : &PRECISION&
1841 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1842 : #endif
1843 : #ifndef WITH_FIXED_REAL_KERNEL
1844 : endif
1845 : #endif /* not WITH_FIXED_REAL_KERNEL */
1846 : #endif /* WITH_REAL_VSX_BLOCK6_KERNEL */
1847 :
1848 : #endif /* REALCASE */
1849 :
1850 : #if REALCASE == 1
1851 : !sse block6 real kernel
1852 : #if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
1853 : #ifndef WITH_FIXED_REAL_KERNEL
1854 1294272 : if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK6) then
1855 :
1856 : #endif /* not WITH_FIXED_REAL_KERNEL */
1857 : ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
1858 189696 : do j = ncols, 6, -6
1859 145920 : w(:,1) = bcast_buffer(1:nbw,j+off)
1860 145920 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1861 145920 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1862 145920 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1863 145920 : w(:,5) = bcast_buffer(1:nbw,j+off-4)
1864 145920 : w(:,6) = bcast_buffer(1:nbw,j+off-5)
1865 : #ifdef WITH_OPENMP
1866 : call hexa_hh_trafo_&
1867 : &MATH_DATATYPE&
1868 : &_sse_6hv_&
1869 : &PRECISION&
1870 72960 : & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1871 : #else
1872 : call hexa_hh_trafo_&
1873 : &MATH_DATATYPE&
1874 : &_sse_6hv_&
1875 : &PRECISION&
1876 72960 : & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
1877 : #endif
1878 : enddo
1879 56544 : do jj = j, 4, -4
1880 12768 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1881 12768 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1882 12768 : w(:,3) = bcast_buffer(1:nbw,jj+off-2)
1883 12768 : w(:,4) = bcast_buffer(1:nbw,jj+off-3)
1884 : #ifdef WITH_OPENMP
1885 : call quad_hh_trafo_&
1886 : &MATH_DATATYPE&
1887 : &_sse_4hv_&
1888 : &PRECISION&
1889 6384 : & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1890 : #else
1891 : call quad_hh_trafo_&
1892 : &MATH_DATATYPE&
1893 : &_sse_4hv_&
1894 : &PRECISION&
1895 : & (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
1896 6384 : nbw, nl, stripe_width, nbw)
1897 : #endif
1898 : enddo
1899 47424 : do jjj = jj, 2, -2
1900 3648 : w(:,1) = bcast_buffer(1:nbw,jjj+off)
1901 3648 : w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
1902 : #ifdef WITH_OPENMP
1903 : call double_hh_trafo_&
1904 : &MATH_DATATYPE&
1905 : &_sse_2hv_&
1906 : &PRECISION&
1907 1824 : & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1908 : #else
1909 : call double_hh_trafo_&
1910 : &MATH_DATATYPE&
1911 : &_sse_2hv_&
1912 : &PRECISION&
1913 1824 : & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1914 : #endif
1915 : enddo
1916 : #ifdef WITH_OPENMP
1917 21888 : if (jjj==1) call single_hh_trafo_&
1918 : &MATH_DATATYPE&
1919 : &_cpu_openmp_&
1920 : &PRECISION&
1921 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
1922 3648 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1923 : #else
1924 21888 : if (jjj==1) call single_hh_trafo_&
1925 : &MATH_DATATYPE&
1926 : &_cpu_&
1927 : &PRECISION&
1928 3648 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
1929 : #endif
1930 : #ifndef WITH_FIXED_REAL_KERNEL
1931 : endif
1932 : #endif /* not WITH_FIXED_REAL_KERNEL */
1933 : #endif /* WITH_REAL_SSE_BLOCK6_KERNEL */
1934 :
1935 : #endif /* REALCASE */
1936 :
1937 : #if COMPLEXCASE == 1
1938 : ! no sse block6 complex kernel
1939 : #endif
1940 :
1941 : #if REALCASE == 1
1942 : ! avx block6 real kernel
1943 :
1944 : #if defined(WITH_REAL_AVX_BLOCK6_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
1945 : #ifndef WITH_FIXED_REAL_KERNEL
1946 1294272 : if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK6) .or. &
1947 : (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK6)) then
1948 :
1949 : #endif /* not WITH_FIXED_REAL_KERNEL */
1950 : ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
1951 379392 : do j = ncols, 6, -6
1952 291840 : w(:,1) = bcast_buffer(1:nbw,j+off)
1953 291840 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
1954 291840 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
1955 291840 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
1956 291840 : w(:,5) = bcast_buffer(1:nbw,j+off-4)
1957 291840 : w(:,6) = bcast_buffer(1:nbw,j+off-5)
1958 : #ifdef WITH_OPENMP
1959 : call hexa_hh_trafo_&
1960 : &MATH_DATATYPE&
1961 : &_avx_avx2_6hv_&
1962 : &PRECISION&
1963 145920 : & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1964 : #else
1965 : call hexa_hh_trafo_&
1966 : &MATH_DATATYPE&
1967 : &_avx_avx2_6hv_&
1968 : &PRECISION&
1969 145920 : & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
1970 : #endif
1971 : enddo
1972 113088 : do jj = j, 4, -4
1973 25536 : w(:,1) = bcast_buffer(1:nbw,jj+off)
1974 25536 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
1975 25536 : w(:,3) = bcast_buffer(1:nbw,jj+off-2)
1976 25536 : w(:,4) = bcast_buffer(1:nbw,jj+off-3)
1977 : #ifdef WITH_OPENMP
1978 : call quad_hh_trafo_&
1979 : &MATH_DATATYPE&
1980 : &_avx_avx2_4hv_&
1981 : &PRECISION&
1982 12768 : & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1983 : #else
1984 : call quad_hh_trafo_&
1985 : &MATH_DATATYPE&
1986 : &_avx_avx2_4hv_&
1987 : &PRECISION&
1988 12768 : & (c_loc(a(1,jj+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
1989 : #endif
1990 : enddo
1991 94848 : do jjj = jj, 2, -2
1992 7296 : w(:,1) = bcast_buffer(1:nbw,jjj+off)
1993 7296 : w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
1994 : #ifdef WITH_OPENMP
1995 : call double_hh_trafo_&
1996 : &MATH_DATATYPE&
1997 : &_avx_avx2_2hv_&
1998 : &PRECISION&
1999 3648 : & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
2000 : #else
2001 : call double_hh_trafo_&
2002 : &MATH_DATATYPE&
2003 : &_avx_avx2_2hv_&
2004 : &PRECISION&
2005 3648 : & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
2006 : #endif
2007 : enddo
2008 : #ifdef WITH_OPENMP
2009 43776 : if (jjj==1) call single_hh_trafo_&
2010 : &MATH_DATATYPE&
2011 : &_cpu_openmp_&
2012 : &PRECISION&
2013 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
2014 7296 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
2015 : #else
2016 43776 : if (jjj==1) call single_hh_trafo_&
2017 : &MATH_DATATYPE&
2018 : &_cpu_&
2019 : &PRECISION&
2020 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
2021 7296 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
2022 : #endif
2023 : #ifndef WITH_FIXED_REAL_KERNEL
2024 : endif
2025 : #endif /* not WITH_FIXED_REAL_KERNEL */
2026 : #endif /* WITH_REAL_AVX_BLOCK6_KERNEL || WITH_REAL_AVX2_BLOCK6_KERNEL */
2027 :
2028 : #endif /* REALCASE */
2029 :
2030 : #if COMPLEXCASE == 1
2031 : !no avx block6 complex kernel
2032 : #endif
2033 :
2034 : #if REALCASE == 1
2035 : ! avx512 block6 kernel
2036 : #if defined(WITH_REAL_AVX512_BLOCK6_KERNEL)
2037 : #ifndef WITH_FIXED_REAL_KERNEL
2038 679968 : if ((kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6)) then
2039 : #endif /* not WITH_FIXED_REAL_KERNEL */
2040 : ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
2041 :
2042 94848 : do j = ncols, 6, -6
2043 72960 : w(:,1) = bcast_buffer(1:nbw,j+off)
2044 72960 : w(:,2) = bcast_buffer(1:nbw,j+off-1)
2045 72960 : w(:,3) = bcast_buffer(1:nbw,j+off-2)
2046 72960 : w(:,4) = bcast_buffer(1:nbw,j+off-3)
2047 72960 : w(:,5) = bcast_buffer(1:nbw,j+off-4)
2048 72960 : w(:,6) = bcast_buffer(1:nbw,j+off-5)
2049 : #ifdef WITH_OPENMP
2050 : call hexa_hh_trafo_&
2051 : &MATH_DATATYPE&
2052 : &_avx512_6hv_&
2053 : &PRECISION&
2054 36480 : & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
2055 : #else
2056 : call hexa_hh_trafo_&
2057 : &MATH_DATATYPE&
2058 : &_avx512_6hv_&
2059 : &PRECISION&
2060 36480 : & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
2061 : #endif
2062 : enddo
2063 28272 : do jj = j, 4, -4
2064 6384 : w(:,1) = bcast_buffer(1:nbw,jj+off)
2065 6384 : w(:,2) = bcast_buffer(1:nbw,jj+off-1)
2066 6384 : w(:,3) = bcast_buffer(1:nbw,jj+off-2)
2067 6384 : w(:,4) = bcast_buffer(1:nbw,jj+off-3)
2068 : #ifdef WITH_OPENMP
2069 : call quad_hh_trafo_&
2070 : &MATH_DATATYPE&
2071 : &_avx512_4hv_&
2072 : &PRECISION&
2073 3192 : & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
2074 : #else
2075 : call quad_hh_trafo_&
2076 : &MATH_DATATYPE&
2077 : &_avx512_4hv_&
2078 : &PRECISION&
2079 3192 : & (c_loc(a(1,jj+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
2080 : #endif
2081 : enddo
2082 23712 : do jjj = jj, 2, -2
2083 1824 : w(:,1) = bcast_buffer(1:nbw,jjj+off)
2084 1824 : w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
2085 : #ifdef WITH_OPENMP
2086 : call double_hh_trafo_&
2087 : &MATH_DATATYPE&
2088 : &_avx512_2hv_&
2089 : &PRECISION&
2090 912 : & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
2091 : #else
2092 : call double_hh_trafo_&
2093 : &MATH_DATATYPE&
2094 : &_avx512_2hv_&
2095 : &PRECISION&
2096 912 : & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
2097 : #endif
2098 : enddo
2099 : #ifdef WITH_OPENMP
2100 10944 : if (jjj==1) call single_hh_trafo_&
2101 : &MATH_DATATYPE&
2102 : &_cpu_openmp_&
2103 : &PRECISION&
2104 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
2105 1824 : bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
2106 : #else
2107 10944 : if (jjj==1) call single_hh_trafo_&
2108 : &MATH_DATATYPE&
2109 : &_cpu_&
2110 : &PRECISION&
2111 1824 : & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
2112 : #endif
2113 : #ifndef WITH_FIXED_REAL_KERNEL
2114 : endif
2115 : #endif /* not WITH_FIXED_REAL_KERNEL */
2116 : #endif /* WITH_REAL_AVX512_BLOCK6_KERNEL */
2117 :
2118 : #endif /* REALCASE */
2119 :
2120 : #if COMPLEXCASE == 1
2121 : !no avx512 block6 complex kernel
2122 : #endif /* COMPLEXCASE */
2123 :
2124 3491712 : if (wantDebug) then
2125 0 : call obj%timer%stop("compute_hh_trafo: CPU")
2126 : endif
2127 : endif ! GPU_KERNEL
2128 :
2129 : #ifdef WITH_OPENMP
2130 1745856 : if (my_thread==1) then
2131 : #endif
2132 3491712 : kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
2133 3491712 : kernel_time = kernel_time + mpi_wtime()-ttt
2134 3491712 : n_times = n_times + 1
2135 : #ifdef WITH_OPENMP
2136 : endif
2137 : #endif
2138 :
2139 3491712 : if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
2140 : &MATH_DATATYPE&
2141 : #ifdef WITH_OPENMP
2142 : &_openmp" // &
2143 : #else
2144 : &" // &
2145 : #endif
2146 : &PRECISION_SUFFIX &
2147 0 : )
2148 :
2149 3491712 : end subroutine
2150 :
2151 : ! vim: syntax=fortran
|