Line data Source code
1 : ! This file is part of ELPA.
2 : !
3 : ! The ELPA library was originally created by the ELPA consortium,
4 : ! consisting of the following organizations:
5 : !
6 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : ! Informatik,
10 : ! - Technische Universität München, Lehrstuhl für Informatik mit
11 : ! Schwerpunkt Wissenschaftliches Rechnen,
12 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 : ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : ! and
16 : ! - IBM Deutschland GmbH
17 : !
18 : ! This particular source code file contains additions, changes and
19 : ! enhancements authored by Intel Corporation which is not part of
20 : ! the ELPA consortium.
21 : !
22 : ! More information can be found here:
23 : ! http://elpa.mpcdf.mpg.de/
24 : !
25 : ! ELPA is free software: you can redistribute it and/or modify
26 : ! it under the terms of version 3 of the
27 : ! GNU Lesser General Public License as published by the Free
28 : ! Software Foundation.
29 : !
30 : ! ELPA is distributed in the hope that it will be useful,
31 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : ! GNU Lesser General Public License for more details.
34 : !
35 : ! You should have received a copy of the GNU Lesser General Public License
36 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : !
38 : ! ELPA reflects a substantial effort on the part of the original
39 : ! ELPA consortium, and we ask you to respect the spirit of the
40 : ! license that we chose: i.e., please contribute any changes you
41 : ! may have back to the original ELPA library distribution, and keep
42 : ! any derivatives of ELPA under the same license that we chose for
43 : ! the original distribution, the GNU Lesser General Public License.
44 : !
45 : !
46 : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
47 : !
48 : ! Copyright of the original code rests with the authors inside the ELPA
49 : ! consortium. The copyright of any additional modifications shall rest
50 : ! with their original authors, but shall adhere to the licensing terms
51 : ! distributed along with the original code in the file "COPYING".
52 :
53 : #include "../general/sanity.F90"
54 :
55 : subroutine trans_ev_tridi_to_band_&
56 : &MATH_DATATYPE&
57 : &_&
58 46296 : &PRECISION &
59 46296 : (obj, na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, &
60 46296 : hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
61 : kernel)
62 :
63 : !-------------------------------------------------------------------------------
64 : ! trans_ev_tridi_to_band_real/complex:
65 : ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
66 : !
67 : ! Parameters
68 : !
69 : ! na Order of matrix a, number of rows of matrix q
70 : !
71 : ! nev Number of eigenvectors to compute (= columns of matrix q)
72 : !
73 : ! nblk blocksize of cyclic distribution, must be the same in both directions!
74 : !
75 : ! nbw Semi bandwidth of the band matrix
76 : !
77 : ! q On input: Eigenvectors of tridiagonal matrix
78 : ! On output: Transformed eigenvectors
79 : ! Distribution is like in ScaLAPACK.
80 : !
81 : ! q_dev GPU device pointer to q
82 : !
83 : ! ldq Leading dimension of q
84 : ! matrixCols local columns of matrix q
85 : !
86 : ! mpi_comm_rows
87 : ! mpi_comm_cols
88 : ! MPI communicators for rows/columns
89 : !
90 : !-------------------------------------------------------------------------------
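         : ! Illustration (hypothetical values, not taken from a real run): with na = 1000,
         : ! nblk = 16 and nbw = 64 the band consists of nfact = nbw/nblk = 4 blocks per semi
         : ! bandwidth, and q holds the nev requested eigenvector columns in the usual
         : ! nblk x nblk block-cyclic layout over the process grid.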
91 : use elpa_abstract_impl
92 : use elpa2_workload
93 : use pack_unpack_cpu
94 : use pack_unpack_gpu
95 : use compute_hh_trafo
96 : use cuda_functions
97 : use precision
98 : use iso_c_binding
99 : implicit none
100 : #include "../general/precision_kinds.F90"
101 : class(elpa_abstract_impl_t), intent(inout) :: obj
102 : logical, intent(in) :: useGPU
103 :
104 : integer(kind=ik), intent(in) :: kernel
105 : integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
106 :
107 : #ifdef USE_ASSUMED_SIZE
108 : MATH_DATATYPE(kind=rck) :: q(ldq,*)
109 : #else
110 : MATH_DATATYPE(kind=rck) :: q(ldq,matrixCols)
111 : #endif
112 :
113 : MATH_DATATYPE(kind=rck), intent(in) :: hh_trans(:,:)
114 : integer(kind=c_intptr_t) :: q_dev
115 :
116 : integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
117 : integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
118 : integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
119 : integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
120 : integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
121 : integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
122 : #ifdef WITH_OPENMP
123 : integer(kind=ik) :: thread_width, csw, b_off, b_len
124 : #endif
125 : integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
126 : integer(kind=ik) :: a_off, current_tv_off, max_blk_size
127 : integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
128 :
129 : logical :: flag
130 : #ifdef WITH_OPENMP
131 : MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:,:)
132 : #else
133 : MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:)
134 : #endif
135 : MATH_DATATYPE(kind=rck) :: a_var
136 :
137 : type(c_ptr) :: aIntern_ptr
138 :
139 46296 : MATH_DATATYPE(kind=rck) , allocatable :: row(:)
140 46296 : MATH_DATATYPE(kind=rck) , allocatable :: row_group(:,:)
141 :
142 : #ifdef WITH_OPENMP
143 23148 : MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
144 23148 : MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
145 : #else
146 23148 : MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
147 23148 : MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
148 : #endif
149 :
150 : integer(kind=c_intptr_t) :: aIntern_dev
151 : integer(kind=c_intptr_t) :: bcast_buffer_dev
152 : integer(kind=c_intptr_t) :: num
153 : integer(kind=c_intptr_t) :: dev_offset, dev_offset_1
154 : integer(kind=c_intptr_t) :: row_dev
155 : integer(kind=c_intptr_t) :: row_group_dev
156 : integer(kind=c_intptr_t) :: hh_tau_dev
157 : integer(kind=c_intptr_t) :: hh_dot_dev
158 : integer(kind=ik) :: row_group_size, unpack_idx
159 :
160 : integer(kind=ik) :: n_times
161 : integer(kind=ik) :: top, chunk, this_chunk
162 :
163 46296 : MATH_DATATYPE(kind=rck), allocatable :: result_buffer(:,:,:)
164 46296 : MATH_DATATYPE(kind=rck), allocatable :: bcast_buffer(:,:)
165 :
166 : integer(kind=ik) :: n_off
167 :
168 92592 : integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
169 92592 : integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
170 92592 : integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
171 :
172 : ! MPI send/recv tags, arbitrary
173 :
174 : integer(kind=ik), parameter :: bottom_recv_tag = 111
175 : integer(kind=ik), parameter :: top_recv_tag = 222
176 : integer(kind=ik), parameter :: result_recv_tag = 333
177 : #ifdef WITH_OPENMP
178 : integer(kind=ik) :: max_threads, my_thread
179 : integer(kind=ik) :: omp_get_max_threads
180 : #endif
181 :
182 :
183 : ! Just for measuring the kernel performance
184 : real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
185 : ! long integer
186 : integer(kind=lik) :: kernel_flops, kernel_flops_recv
187 :
188 : logical, intent(in) :: wantDebug
189 : logical :: success
190 : integer(kind=ik) :: istat, print_flops
191 : character(200) :: errorMessage
192 : logical :: successCUDA
193 : #ifndef WITH_MPI
194 : integer(kind=ik) :: j1
195 : #endif
196 : integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
197 : &PRECISION&
198 : &_&
199 : &MATH_DATATYPE
200 :
201 : call obj%timer%start("trans_ev_tridi_to_band_&
202 : &MATH_DATATYPE&
203 : &" // &
204 : &PRECISION_SUFFIX &
205 46296 : )
206 :
207 46296 : n_times = 0
208 46296 : if (useGPU) then
209 0 : unpack_idx = 0
210 0 : row_group_size = 0
211 : endif
212 :
213 46296 : success = .true.
214 46296 : kernel_time = 0.0
215 46296 : kernel_flops = 0
216 :
217 : #ifdef WITH_OPENMP
218 23148 : max_threads = 1
219 23148 : max_threads = omp_get_max_threads()
220 : #endif
221 46296 : if (wantDebug) call obj%timer%start("mpi_communication")
222 46296 : call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
223 46296 : call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
224 46296 : call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
225 46296 : call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
226 46296 : if (wantDebug) call obj%timer%stop("mpi_communication")
227 :
228 46296 : if (mod(nbw,nblk)/=0) then
229 0 : if (my_prow==0 .and. my_pcol==0) then
230 0 : if (wantDebug) then
231 : write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
232 : &MATH_DATATYPE&
233 0 : &: ERROR: nbw=',nbw,', nblk=',nblk
234 : write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
235 : &MATH_DATATYPE&
236 0 : &: band backtransform works only for nbw==n*nblk'
237 : endif
238 0 : success = .false.
239 0 : return
240 : endif
241 : endif
242 :
243 46296 : nfact = nbw / nblk
244 :
245 :
246 : ! local number of eigenvectors
247 46296 : l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
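         : ! Note (interpretation of local_index as used here): l_nev counts how many of the
         : ! first nev global columns of q land on this process column in the nblk block-cyclic
         : ! layout, so it can legitimately be zero on process columns that hold none of the
         : ! requested eigenvectors (handled by the branch below).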
248 :
249 46296 : if (l_nev==0) then
250 : #ifdef WITH_OPENMP
251 0 : thread_width = 0
252 : #endif
253 0 : stripe_width = 0
254 0 : stripe_count = 0
255 0 : last_stripe_width = 0
256 :
257 : else ! l_nev
258 :
259 : #ifdef WITH_OPENMP
260 : ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
261 : ! every primary cache
262 : ! Suggested stripe width is 48 - should this be reduced for the complex case ???
263 :
264 23148 : if (useGPU) then
265 0 : stripe_width = 256 ! Must be a multiple of 4
266 0 : stripe_count = (l_nev - 1) / stripe_width + 1
267 : else ! useGPU
268 : ! openmp only in non-GPU case
269 23148 : thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
270 : #if REALCASE == 1
271 : #ifdef DOUBLE_PRECISION_REAL
272 10080 : stripe_width = 48 ! Must be a multiple of 4
273 : #else
274 3852 : stripe_width = 96 ! Must be a multiple of 8
275 : #endif
276 : #endif /* REALCASE */
277 :
278 : #if COMPLEXCASE == 1
279 : #ifdef DOUBLE_PRECISION_COMPLEX
280 6192 : stripe_width = 48 ! Must be a multiple of 2
281 : #else
282 3024 : stripe_width = 48 ! Must be a multiple of 4
283 : #endif
284 : #endif /* COMPLEXCASE */
285 :
286 23148 : stripe_count = (thread_width-1)/stripe_width + 1
287 :
288 : ! Adapt stripe width so that last one doesn't get too small
289 :
290 23148 : stripe_width = (thread_width-1)/stripe_count + 1
291 :
292 : #if REALCASE == 1
293 : #ifdef DOUBLE_PRECISION_REAL
294 : if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
295 10080 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
296 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
297 :
298 1944 : stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
299 : ! (8 * sizeof(double) == 64)
300 :
301 : else
302 8136 : stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
303 : ! (4 * sizeof(double) == 32)
304 : endif
305 : #else
306 : if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
307 3852 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
308 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
309 :
310 :
311 756 : stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
312 : ! (16 * sizeof(float) == 64)
313 :
314 : else
315 3096 : stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
316 : ! (8 * sizeof(float) == 32)
317 : endif
318 : #endif
319 : #endif /* REALCASE */
320 :
321 : #if COMPLEXCASE == 1
322 : #ifdef DOUBLE_PRECISION_COMPLEX
323 6192 : if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
324 : kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
325 :
326 1296 : stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
327 : ! (4 * sizeof(double complex) == 64)
328 :
329 : else
330 :
331 4896 : stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
332 : ! (2 * sizeof(double complex) == 32)
333 : endif
334 : #else
335 :
336 3024 : if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
337 : kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
338 :
339 612 : stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
340 : ! (8 * sizeof(float complex) == 64)
341 :
342 : else
343 2412 : stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
344 : ! (4 * sizeof(float complex) == 32)
345 : endif
346 : #endif
347 : #endif /* COMPLEXCASE */
348 :
349 : #if REALCASE == 1
350 13932 : last_stripe_width = l_nev - (stripe_count-1)*stripe_width
351 : #endif
352 : #if COMPLEXCASE == 1
353 : ! only needed in the non-OpenMP case, check this
354 : ! last_stripe_width = l_nev - (stripe_count-1)*stripe_width
355 : #endif
356 :
357 : endif ! useGPU
358 :
359 : #else /* WITH_OPENMP */
360 :
361 : ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
362 : ! every primary cache
363 : ! Suggested stripe width is 48 - should this be reduced for the complex case ???
364 :
365 23148 : if (useGPU) then
366 0 : stripe_width = 256 ! Must be a multiple of 4
367 0 : stripe_count = (l_nev - 1) / stripe_width + 1
368 :
369 : else ! useGPU
370 : #if REALCASE == 1
371 : #ifdef DOUBLE_PRECISION_REAL
372 10080 : stripe_width = 48 ! Must be a multiple of 4
373 : #else
374 3852 : stripe_width = 96 ! Must be a multiple of 8
375 : #endif
376 : #endif /* REALCASE */
377 :
378 : #if COMPLEXCASE == 1
379 : #ifdef DOUBLE_PRECISION_COMPLEX
380 6192 : stripe_width = 48 ! Must be a multiple of 2
381 : #else
382 3024 : stripe_width = 48 ! Must be a multiple of 4
383 : #endif
384 : #endif /* COMPLEXCASE */
385 :
386 23148 : stripe_count = (l_nev-1)/stripe_width + 1
387 :
388 : ! Adapt stripe width so that last one doesn't get too small
389 :
390 23148 : stripe_width = (l_nev-1)/stripe_count + 1
391 :
392 : #if REALCASE == 1
393 : #ifdef DOUBLE_PRECISION_REAL
394 : if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
395 10080 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
396 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
397 :
398 1944 : stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
399 : ! (8 * sizeof(double) == 64)
400 :
401 : else
402 8136 : stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
403 : ! (4 * sizeof(double) == 32)
404 : endif
405 : #else
406 : if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
407 3852 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
408 : kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
409 :
410 :
411 756 : stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
412 : ! (16 * sizeof(float) == 64)
413 :
414 : else
415 3096 : stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
416 : ! (8 * sizeof(float) == 32)
417 : endif
418 : #endif
419 : #endif /* REALCASE */
420 :
421 : #if COMPLEXCASE == 1
422 : #ifdef DOUBLE_PRECISION_COMPLEX
423 :
424 6192 : if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
425 : kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
426 :
427 1296 : stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
428 : ! (4 * sizeof(double complex) == 64)
429 :
430 : else
431 :
432 4896 : stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
433 : ! (2 * sizeof(double complex) == 32)
434 : endif
435 : #else
436 :
437 3024 : if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
438 : kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
439 :
440 612 : stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
441 : ! (8 * sizeof(float complex) == 64)
442 :
443 : else
444 2412 : stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
445 : ! (4 * sizeof(float complex) == 32)
446 : endif
447 : #endif
448 : #endif /* COMPLEXCASE */
449 : endif ! useGPU
450 :
451 23148 : last_stripe_width = l_nev - (stripe_count-1)*stripe_width
452 :
453 : #endif /* WITH_OPENMP */
454 : endif ! l_nev
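         : ! Worked example (hypothetical, double-precision real, no OpenMP, non-AVX-512 kernel):
         : ! for l_nev = 230 the initial guess stripe_width = 48 gives
         : ! stripe_count = (230-1)/48 + 1 = 5, the adapted width (230-1)/5 + 1 = 46 is rounded
         : ! up to the next multiple of 4, i.e. 48, and the last stripe keeps
         : ! last_stripe_width = 230 - 4*48 = 38 columns.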
455 :
456 : ! Determine the matrix distribution at the beginning
457 :
458 46296 : allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
459 46296 : if (istat .ne. 0) then
460 : print *,"trans_ev_tridi_to_band_&
461 : &MATH_DATATYPE&
462 0 : &: error when allocating limits"//errorMessage
463 0 : stop 1
464 : endif
465 46296 : call determine_workload(obj,na, nbw, np_rows, limits)
466 :
467 46296 : max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
468 :
469 46296 : a_dim2 = max_blk_size + nbw
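         : ! Reading of the workload split (as used below): process row ip handles global rows
         : ! limits(ip)+1 .. limits(ip+1), max_blk_size is the largest such range, and the extra
         : ! nbw rows in a_dim2 leave room in aIntern for the border rows exchanged with the
         : ! neighbouring process row during the sweeps.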
470 :
471 46296 : if (useGPU) then
472 0 : num = (stripe_width*a_dim2*stripe_count)* size_of_datatype
473 0 : successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype)
474 0 : if (.not.(successCUDA)) then
475 : print *,"trans_ev_tridi_to_band_&
476 : &MATH_DATATYPE&
477 0 : &: error in cudaMalloc"//errorMessage
478 0 : stop 1
479 : endif
480 :
481 0 : successCUDA = cuda_memset(aIntern_dev , 0, num)
482 0 : if (.not.(successCUDA)) then
483 : print *,"trans_ev_tridi_to_band_&
484 : &MATH_DATATYPE&
485 0 : &: error in cudaMemset"//errorMessage
486 0 : stop 1
487 : endif
488 :
489 0 : num = (l_nev)* size_of_datatype
490 0 : successCUDA = cuda_malloc( row_dev,num)
491 0 : if (.not.(successCUDA)) then
492 : print *,"trans_ev_tridi_to_band_&
493 : &MATH_DATATYPE&
494 0 : &: error in cudaMalloc "
495 0 : stop 1
496 : endif
497 :
498 0 : successCUDA = cuda_memset(row_dev , 0, num)
499 0 : if (.not.(successCUDA)) then
500 : print *,"trans_ev_tridi_to_band_&
501 : &MATH_DATATYPE&
502 0 : &: error in cudaMemset "
503 0 : stop 1
504 : endif
505 :
506 : ! "row_group" and "row_group_dev" are needed for GPU optimizations
507 0 : allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
508 0 : if (istat .ne. 0) then
509 : print *,"trans_ev_tridi_to_band_&
510 : &MATH_DATATYPE&
511 0 : &: error when allocating row_group"//errorMessage
512 0 : stop 1
513 : endif
514 :
515 0 : row_group(:, :) = 0.0_rck
516 0 : num = (l_nev*nblk)* size_of_datatype
517 0 : successCUDA = cuda_malloc(row_group_dev, num)
518 0 : if (.not.(successCUDA)) then
519 : print *,"trans_ev_tridi_to_band_&
520 : &MATH_DATATYPE&
521 0 : &: error in cudaMalloc"//errorMessage
522 0 : stop 1
523 : endif
524 :
525 0 : successCUDA = cuda_memset(row_group_dev , 0, num)
526 0 : if (.not.(successCUDA)) then
527 : print *,"trans_ev_tridi_to_band_&
528 : &MATH_DATATYPE&
529 0 : &: error in cudaMemset"//errorMessage
530 0 : stop 1
531 : endif
532 :
533 : else ! GPUs are not used
534 :
535 : #if 0
536 : ! realcase or complexcase
537 : !DEC$ ATTRIBUTES ALIGN: 64:: aIntern
538 : #endif
539 :
540 : #ifdef WITH_OPENMP
541 23148 : if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*max_threads* &
542 : C_SIZEOF(a_var)) /= 0) then
543 : print *,"trans_ev_tridi_to_band_&
544 : &MATH_DATATYPE&
545 0 : &: error when allocating aIntern"//errorMessage
546 0 : stop 1
547 : endif
548 :
549 23148 : call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
550 : ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
551 :
552 : ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
553 :
554 : #else /* WITH_OPENMP */
555 :
556 23148 : if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count* &
557 : C_SIZEOF(a_var)) /= 0) then
558 0 : print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
559 0 : stop 1
560 : endif
561 :
562 23148 : call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
563 : !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
564 :
565 23148 : aIntern(:,:,:) = 0.0_rck
566 : #endif /* WITH_OPENMP */
567 : endif !useGPU
568 :
569 46296 : allocate(row(l_nev), stat=istat, errmsg=errorMessage)
570 46296 : if (istat .ne. 0) then
571 : print *,"trans_ev_tridi_to_band_&
572 : &MATH_DATATYPE&
573 0 : &: error when allocating row"//errorMessage
574 0 : stop 1
575 : endif
576 :
577 46296 : row(:) = 0.0_rck
578 :
579 : ! Copy q from a block cyclic distribution into a distribution with contiguous rows,
580 : ! and transpose the matrix using stripes of given stripe_width for cache blocking.
581 :
582 : ! The peculiar way it is done below is due to the fact that the last row should be
583 : ! ready first since it is the first one to start below
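         : ! Sketch of the row ownership used below (hypothetical sizes): with nblk = 2 and
         : ! np_rows = 3, global rows 1-2 belong to process row 0, rows 3-4 to row 1,
         : ! rows 5-6 to row 2, rows 7-8 again to row 0, and so on, which is exactly
         : ! src = mod((i-1)/nblk, np_rows).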
584 :
585 : #ifdef WITH_OPENMP
586 : ! Please note about the OMP usage below:
587 : ! This is not for speed, but because we want the matrix a in the memory and
588 : ! in the cache of the correct thread (if possible)
589 :
590 23148 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
591 46296 : !$omp parallel do private(my_thread), schedule(static, 1)
592 : do my_thread = 1, max_threads
593 23148 : aIntern(:,:,:,my_thread) = 0.0_rck ! if possible, do first touch allocation!
594 : enddo
595 : !$omp end parallel do
596 :
597 23148 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
598 : #endif /* WITH_OPENMP */
599 :
600 123456 : do ip = np_rows-1, 0, -1
601 77160 : if (my_prow == ip) then
602 : ! Receive my rows which have not yet been received
603 46296 : src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
604 5981496 : do i=limits(ip)+1,limits(ip+1)
605 5935200 : src = mod((i-1)/nblk, np_rows)
606 :
607 5935200 : if (src < my_prow) then
608 : #ifdef WITH_OPENMP
609 :
610 : #ifdef WITH_MPI
611 366516 : if (wantDebug) call obj%timer%start("mpi_communication")
612 : call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
613 366516 : src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
614 366516 : if (wantDebug) call obj%timer%stop("mpi_communication")
615 :
616 : #else /* WITH_MPI */
617 :
618 : ! row(1:l_nev) = row(1:l_nev)
619 :
620 : #endif /* WITH_MPI */
621 :
622 366516 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
623 :
624 733032 : !$omp parallel do private(my_thread), schedule(static, 1)
625 : do my_thread = 1, max_threads
626 : call unpack_row_&
627 : &MATH_DATATYPE&
628 : &_cpu_openmp_&
629 : &PRECISION &
630 : (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, &
631 366516 : thread_width, stripe_width, l_nev)
632 :
633 : enddo
634 : !$omp end parallel do
635 :
636 366516 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
637 :
638 : #else /* WITH_OPENMP */
639 366516 : if (useGPU) then
640 : ! An unpacking of the current row group may occur before queuing the next row
641 : call unpack_and_prepare_row_group_&
642 : &MATH_DATATYPE&
643 : &_gpu_&
644 : &PRECISION &
645 : ( &
646 : row_group, row_group_dev, aIntern_dev, stripe_count, &
647 : stripe_width, last_stripe_width, a_dim2, l_nev,&
648 : row_group_size, nblk, unpack_idx, &
649 0 : i - limits(ip), .false.)
650 : #ifdef WITH_MPI
651 0 : if (wantDebug) call obj%timer%start("mpi_communication")
652 : call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
653 0 : src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
654 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
655 :
656 : #else /* WITH_MPI */
657 0 : row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
658 : #endif /* WITH_MPI */
659 :
660 : else ! useGPU
661 : #ifdef WITH_MPI
662 366516 : if (wantDebug) call obj%timer%start("mpi_communication")
663 : call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
664 366516 : src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
665 366516 : if (wantDebug) call obj%timer%stop("mpi_communication")
666 :
667 : #else /* WITH_MPI */
668 :
669 : ! row(1:l_nev) = row(1:l_nev)
670 :
671 : #endif /* WITH_MPI */
672 :
673 : call unpack_row_&
674 : &MATH_DATATYPE&
675 : &_cpu_&
676 : &PRECISION &
677 366516 : (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
678 : endif ! useGPU
679 : #endif /* WITH_OPENMP */
680 :
681 5202168 : elseif (src == my_prow) then
682 :
683 4545528 : src_offset = src_offset+1
684 :
685 4545528 : if (useGPU) then
686 : #ifndef WITH_OPENMP
687 :
688 : ! An unpacking of the current row group may occur before queuing the next row
689 : call unpack_and_prepare_row_group_&
690 : &MATH_DATATYPE&
691 : &_gpu_&
692 : &PRECISION &
693 : ( &
694 : row_group, row_group_dev, aIntern_dev, stripe_count, &
695 : stripe_width, last_stripe_width, a_dim2, l_nev,&
696 : row_group_size, nblk, unpack_idx, &
697 0 : i - limits(ip), .false.)
698 :
699 0 : row_group(:, row_group_size) = q(src_offset, 1:l_nev)
700 : #else /* WITH_OPENMP */
701 :
702 : !#if COMPLEXCASE == 1
703 : !! why is a CUDA call in the openmp region?
704 : ! call unpack_and_prepare_row_group_complex_gpu_&
705 : ! &PRECISION&
706 : ! &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
707 : ! last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
708 : ! unpack_idx, i - limits(ip),.false.)
709 : ! row_group(:, row_group_size) = q(src_offset, 1:l_nev)
710 : !#endif
711 :
712 : #endif /* not OpenMP */
713 : else
714 4545528 : row(:) = q(src_offset, 1:l_nev)
715 : endif
716 :
717 : #ifdef WITH_OPENMP
718 2272764 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
719 :
720 4545528 : !$omp parallel do private(my_thread), schedule(static, 1)
721 : do my_thread = 1, max_threads
722 : call unpack_row_&
723 : &MATH_DATATYPE&
724 : &_cpu_openmp_&
725 : &PRECISION &
726 2272764 : (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev)
727 :
728 : enddo
729 : !$omp end parallel do
730 :
731 2272764 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
732 :
733 : #else /* WITH_OPENMP */
734 :
735 2272764 : if (useGPU) then
736 :
737 : else
738 : call unpack_row_&
739 : &MATH_DATATYPE&
740 : &_cpu_&
741 : &PRECISION &
742 2272764 : (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
743 : endif
744 :
745 : #endif /* WITH_OPENMP */
746 :
747 : endif
748 : enddo
749 :
750 : ! Send all rows which have not yet been sent
751 46296 : src_offset = 0
752 61728 : do dst = 0, ip-1
753 1499232 : do i=limits(dst)+1,limits(dst+1)
754 1483800 : if (mod((i-1)/nblk, np_rows) == my_prow) then
755 656640 : src_offset = src_offset+1
756 656640 : row(:) = q(src_offset, 1:l_nev)
757 :
758 : #ifdef WITH_MPI
759 656640 : if (wantDebug) call obj%timer%start("mpi_communication")
760 : call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
761 656640 : dst, 0, mpi_comm_rows, mpierr)
762 656640 : if (wantDebug) call obj%timer%stop("mpi_communication")
763 : #endif /* WITH_MPI */
764 : endif
765 : enddo
766 : enddo
767 :
768 30864 : else if (my_prow < ip) then
769 :
770 : ! Send all rows going to PE ip
771 15432 : src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
772 1499232 : do i=limits(ip)+1,limits(ip+1)
773 1483800 : src = mod((i-1)/nblk, np_rows)
774 1483800 : if (src == my_prow) then
775 733032 : src_offset = src_offset+1
776 733032 : row(:) = q(src_offset, 1:l_nev)
777 : #ifdef WITH_MPI
778 733032 : if (wantDebug) call obj%timer%start("mpi_communication")
779 : call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
780 733032 : ip, 0, mpi_comm_rows, mpierr)
781 733032 : if (wantDebug) call obj%timer%stop("mpi_communication")
782 : #endif /* WITH_MPI */
783 : endif
784 : enddo
785 :
786 : ! Receive all rows from PE ip
787 1499232 : do i=limits(my_prow)+1,limits(my_prow+1)
788 1483800 : src = mod((i-1)/nblk, np_rows)
789 1483800 : if (src == ip) then
790 : #ifdef WITH_OPENMP
791 :
792 : #ifdef WITH_MPI
793 328320 : if (wantDebug) call obj%timer%start("mpi_communication")
794 : call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
795 328320 : src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
796 328320 : if (wantDebug) call obj%timer%stop("mpi_communication")
797 : #else /* WITH_MPI */
798 :
799 : ! row(1:l_nev) = row(1:l_nev)
800 :
801 : #endif /* WITH_MPI */
802 :
803 328320 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
804 656640 : !$omp parallel do private(my_thread), schedule(static, 1)
805 : do my_thread = 1, max_threads
806 : call unpack_row_&
807 : &MATH_DATATYPE&
808 : &_cpu_openmp_&
809 : &PRECISION &
810 328320 : (obj,aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev)
811 : enddo
812 : !$omp end parallel do
813 328320 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
814 :
815 : #else /* WITH_OPENMP */
816 328320 : if (useGPU) then
817 : ! An unpacking of the current row group may occur before queuing the next row
818 : call unpack_and_prepare_row_group_&
819 : &MATH_DATATYPE&
820 : &_gpu_&
821 : &PRECISION&
822 : &( &
823 : row_group, row_group_dev, aIntern_dev, stripe_count, &
824 : stripe_width, last_stripe_width, a_dim2, l_nev, &
825 : row_group_size, nblk, unpack_idx, &
826 0 : i - limits(my_prow), .false.)
827 :
828 : #ifdef WITH_MPI
829 0 : if (wantDebug) call obj%timer%start("mpi_communication")
830 : call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
831 0 : src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
832 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
833 : #else /* WITH_MPI */
834 :
835 0 : row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
836 : #endif /* WITH_MPI */
837 :
838 : else ! useGPU
839 : #ifdef WITH_MPI
840 328320 : if (wantDebug) call obj%timer%start("mpi_communication")
841 : call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
842 328320 : src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
843 328320 : if (wantDebug) call obj%timer%stop("mpi_communication")
844 : #else /* WITH_MPI */
845 :
846 : ! row(1:l_nev) = row(1:l_nev)
847 :
848 : #endif
849 : call unpack_row_&
850 : &MATH_DATATYPE&
851 : &_cpu_&
852 : &PRECISION &
853 328320 : (obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
854 : endif ! useGPU
855 :
856 : #endif /* WITH_OPENMP */
857 :
858 : endif
859 : enddo
860 : endif
861 : enddo
862 :
863 46296 : if (useGPU) then
864 : ! Force an unpacking of all remaining rows that haven't been unpacked yet
865 : call unpack_and_prepare_row_group_&
866 : &MATH_DATATYPE&
867 : &_gpu_&
868 : &PRECISION&
869 : &( &
870 : row_group, row_group_dev, aIntern_dev, stripe_count, &
871 : stripe_width, last_stripe_width, &
872 : a_dim2, l_nev, row_group_size, nblk, unpack_idx, &
873 0 : -1, .true.)
874 :
875 0 : successCUDA = cuda_devicesynchronize()
876 :
877 0 : if (.not.(successCUDA)) then
878 : print *,"trans_ev_tridi_to_band_&
879 : &MATH_DATATYPE&
880 0 : &: error in cudaDeviceSynchronize"//errorMessage
881 0 : stop 1
882 : endif
883 : endif
884 :
885 : ! Set up result buffer queue
886 :
887 46296 : num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows
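         : ! Example (hypothetical): na = 10, nblk = 2, np_rows = 3 gives five result blocks dealt
         : ! round-robin over the process rows, so my_prow = 0 expects ((10-1)/2 + 3 - 0)/3 = 2
         : ! blocks while my_prow = 2 expects ((10-1)/2 + 3 - 2)/3 = 1 block.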
888 :
889 46296 : num_result_buffers = 4*nfact
890 46296 : allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
891 46296 : if (istat .ne. 0) then
892 : print *,"trans_ev_tridi_to_band_&
893 : &MATH_DATATYPE&
894 0 : &: error when allocating result_buffer"//errorMessage
895 0 : stop 1
896 : endif
897 :
898 46296 : allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
899 46296 : if (istat .ne. 0) then
900 : print *,"trans_ev_tridi_to_band_&
901 : &MATH_DATATYPE&
902 0 : &: error when allocating result_send_request"//errorMessage
903 0 : stop 1
904 : endif
905 :
906 46296 : allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
907 46296 : if (istat .ne. 0) then
908 : print *,"trans_ev_tridi_to_band_&
909 : &MATH_DATATYPE&
910 0 : &: error when allocating result_recv_request"//errorMessage
911 0 : stop 1
912 : endif
913 :
914 : #ifdef WITH_MPI
915 30864 : result_send_request(:) = MPI_REQUEST_NULL
916 30864 : result_recv_request(:) = MPI_REQUEST_NULL
917 : #endif
918 :
919 : ! Queue up buffers
920 : #ifdef WITH_MPI
921 30864 : if (wantDebug) call obj%timer%start("mpi_communication")
922 :
923 30864 : if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
924 97680 : do j = 1, min(num_result_buffers, num_result_blocks)
925 : call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL, &
926 82248 : 0, result_recv_tag, mpi_comm_rows, result_recv_request(j), mpierr)
927 : enddo
928 : endif
929 30864 : if (wantDebug) call obj%timer%stop("mpi_communication")
930 : #else /* WITH_MPI */
931 :
932 : ! careful, the "recv" has to be done at the corresponding wait or send
933 : ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf)
934 :
935 : #endif /* WITH_MPI */
936 :
937 46296 : num_bufs_recvd = 0 ! No buffers received yet
938 :
939 : ! Initialize top/bottom requests
940 :
941 46296 : allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
942 46296 : if (istat .ne. 0) then
943 : print *,"trans_ev_tridi_to_band_&
944 : &MATH_DATATYPE&
945 0 : &: error when allocating top_send_request"//errorMessage
946 0 : stop 1
947 : endif
948 :
949 46296 : allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
950 46296 : if (istat .ne. 0) then
951 : print *,"trans_ev_tridi_to_band_&
952 : &MATH_DATATYPE&
953 0 : &: error when allocating top_recv_request"//errorMessage
954 0 : stop 1
955 : endif
956 :
957 46296 : allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
958 46296 : if (istat .ne. 0) then
959 : print *,"trans_ev_tridi_to_band_&
960 : &MATH_DATATYPE&
961 0 : &: error when allocating bottom_send_request"//errorMessage
962 0 : stop 1
963 : endif
964 :
965 46296 : allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
966 46296 : if (istat .ne. 0) then
967 : print *,"trans_ev_tridi_to_band_&
968 : &MATH_DATATYPE&
969 0 : &: error when allocating bottom_recv_request"//errorMessage
970 0 : stop 1
971 : endif
972 :
973 : #ifdef WITH_MPI
974 30864 : top_send_request(:) = MPI_REQUEST_NULL
975 30864 : top_recv_request(:) = MPI_REQUEST_NULL
976 30864 : bottom_send_request(:) = MPI_REQUEST_NULL
977 30864 : bottom_recv_request(:) = MPI_REQUEST_NULL
978 : #endif
979 :
980 : #ifdef WITH_OPENMP
981 23148 : allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
982 23148 : if (istat .ne. 0) then
983 : print *,"trans_ev_tridi_to_band_&
984 : &MATH_DATATYPE&
985 0 : &: error when allocating top_border_send_buffer"//errorMessage
986 0 : stop 1
987 : endif
988 :
989 23148 : allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
990 23148 : if (istat .ne. 0) then
991 : print *,"trans_ev_tridi_to_band_&
992 : &MATH_DATATYPE&
993 0 : &: error when allocating top_border_recv_buffer"//errorMessage
994 0 : stop 1
995 : endif
996 :
997 23148 : allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
998 23148 : if (istat .ne. 0) then
999 : print *,"trans_ev_tridi_to_band_&
1000 : &MATH_DATATYPE&
1001 0 : &: error when allocating bottom_border_send_buffer"//errorMessage
1002 0 : stop 1
1003 : endif
1004 :
1005 23148 : allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
1006 23148 : if (istat .ne. 0) then
1007 : print *,"trans_ev_tridi_to_band_&
1008 : &MATH_DATATYPE&
1009 0 : &: error when allocating bottom_border_recv_buffer"//errorMessage
1010 0 : stop 1
1011 : endif
1012 :
1013 23148 : top_border_send_buffer(:,:) = 0.0_rck
1014 23148 : top_border_recv_buffer(:,:) = 0.0_rck
1015 23148 : bottom_border_send_buffer(:,:) = 0.0_rck
1016 23148 : bottom_border_recv_buffer(:,:) = 0.0_rck
1017 : ! Initialize broadcast buffer
1018 :
1019 : #else /* WITH_OPENMP */
1020 :
1021 23148 : allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
1022 23148 : if (istat .ne. 0) then
1023 : print *,"trans_ev_tridi_to_band_&
1024 : &MATH_DATATYPE&
1025 0 : &: error when allocating top_border_send_buffer"//errorMessage
1026 0 : stop 1
1027 : endif
1028 :
1029 23148 : allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
1030 23148 : if (istat .ne. 0) then
1031 : print *,"trans_ev_tridi_to_band_&
1032 : &MATH_DATATYPE&
1033 0 : &: error when allocating top_border_recv_buffer"//errorMessage
1034 0 : stop 1
1035 : endif
1036 :
1037 23148 : allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
1038 23148 : if (istat .ne. 0) then
1039 : print *,"trans_ev_tridi_to_band_&
1040 : &MATH_DATATYPE&
1041 0 : &: error when allocating bottom_border_send_buffer"//errorMessage
1042 0 : stop 1
1043 : endif
1044 :
1045 23148 : allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
1046 23148 : if (istat .ne. 0) then
1047 : print *,"trans_ev_tridi_to_band_&
1048 : &MATH_DATATYPE&
1049 0 : &: error when allocating bottom_border_recv_buffer"//errorMessage
1050 0 : stop 1
1051 : endif
1052 :
1053 23148 : top_border_send_buffer(:,:,:) = 0.0_rck
1054 23148 : top_border_recv_buffer(:,:,:) = 0.0_rck
1055 23148 : bottom_border_send_buffer(:,:,:) = 0.0_rck
1056 23148 : bottom_border_recv_buffer(:,:,:) = 0.0_rck
1057 : #endif /* WITH_OPENMP */
1058 :
1059 : ! Initialize broadcast buffer
1060 :
1061 46296 : allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
1062 46296 : if (istat .ne. 0) then
1063 : print *,"trans_ev_tridi_to_band_&
1064 : &MATH_DATATYPE&
1065 0 : &: error when allocating bcast_buffer"//errorMessage
1066 0 : stop 1
1067 : endif
1068 :
1069 46296 : bcast_buffer = 0.0_rck
1070 46296 : if (useGPU) then
1071 0 : num = ( nbw * max_blk_size) * size_of_datatype
1072 0 : successCUDA = cuda_malloc(bcast_buffer_dev, num)
1073 0 : if (.not.(successCUDA)) then
1074 : print *,"trans_ev_tridi_to_band_&
1075 : &MATH_DATATYPE&
1076 0 : &: error in cudaMalloc"
1077 0 : stop 1
1078 : endif
1079 :
1080 0 : successCUDA = cuda_memset( bcast_buffer_dev, 0, num)
1081 0 : if (.not.(successCUDA)) then
1082 : print *,"trans_ev_tridi_to_band_&
1083 : &MATH_DATATYPE&
1084 0 : &: error in cudaMemset"
1085 0 : stop 1
1086 : endif
1087 :
1088 0 : num = ((max_blk_size-1))* size_of_datatype
1089 0 : successCUDA = cuda_malloc( hh_dot_dev, num)
1090 0 : if (.not.(successCUDA)) then
1091 : print *,"trans_ev_tridi_to_band_&
1092 : &MATH_DATATYPE&
1093 0 : &: error in cudaMalloc"
1094 0 : stop 1
1095 : endif
1096 :
1097 0 : successCUDA = cuda_memset( hh_dot_dev, 0, num)
1098 0 : if (.not.(successCUDA)) then
1099 : print *,"trans_ev_tridi_to_band_&
1100 : &MATH_DATATYPE&
1101 0 : &: error in cudaMemset"
1102 0 : stop 1
1103 : endif
1104 :
1105 0 : num = (max_blk_size)* size_of_datatype
1106 0 : successCUDA = cuda_malloc( hh_tau_dev, num)
1107 0 : if (.not.(successCUDA)) then
1108 : print *,"trans_ev_tridi_to_band_&
1109 : &MATH_DATATYPE&
1110 0 : &: error in cudaMalloc"
1111 0 : stop 1
1112 : endif
1113 :
1114 0 : successCUDA = cuda_memset( hh_tau_dev, 0, num)
1115 0 : if (.not.(successCUDA)) then
1116 : print *,"trans_ev_tridi_to_band_&
1117 : &MATH_DATATYPE&
1118 0 : &: error in cudaMemset"
1119 0 : stop 1
1120 : endif
1121 : endif ! useGPU
1122 :
1123 46296 : current_tv_off = 0 ! Offset of next row to be broadcast
1124 :
1125 : ! ------------------- start of work loop -------------------
1126 :
1127 46296 : a_off = 0 ! offset in aIntern (to avoid unnecessary shifts)
1128 :
1129 46296 : top_msg_length = 0
1130 46296 : bottom_msg_length = 0
1131 :
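         : ! Each pass of the loop below is one "sweep": roughly speaking, it applies the
         : ! Householder vectors belonging to one nbw-wide block of the band. For example
         : ! (hypothetical) na = 1000 and nbw = 64 give sweeps 0 .. (1000-1)/64 = 15.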
1132 273312 : do sweep = 0, (na-1)/nbw
1133 :
1134 227016 : current_n = na - sweep*nbw
1135 227016 : call determine_workload(obj,current_n, nbw, np_rows, limits)
1136 227016 : current_n_start = limits(my_prow)
1137 227016 : current_n_end = limits(my_prow+1)
1138 227016 : current_local_n = current_n_end - current_n_start
1139 :
1140 227016 : next_n = max(current_n - nbw, 0)
1141 227016 : call determine_workload(obj,next_n, nbw, np_rows, limits)
1142 227016 : next_n_start = limits(my_prow)
1143 227016 : next_n_end = limits(my_prow+1)
1144 227016 : next_local_n = next_n_end - next_n_start
1145 :
1146 227016 : if (next_n_end < next_n) then
1147 44808 : bottom_msg_length = current_n_end - next_n_end
1148 : else
1149 182208 : bottom_msg_length = 0
1150 : endif
1151 :
1152 227016 : if (next_local_n > 0) then
1153 165288 : next_top_msg_length = current_n_start - next_n_start
1154 : else
1155 61728 : next_top_msg_length = 0
1156 : endif
1157 :
1158 227016 : if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
1159 : #ifdef WITH_MPI
1160 15432 : if (wantDebug) call obj%timer%start("mpi_communication")
1161 : #endif
1162 77112 : do i = 1, stripe_count
1163 :
1164 : #ifdef WITH_OPENMP
1165 :
1166 30840 : if (useGPU) then
1167 0 : print *,"trans_ev_tridi_to_band_real: not yet implemented"
1168 0 : stop 1
1169 : endif
1170 :
1171 30840 : csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width"
1172 30840 : b_len = csw*nbw*max_threads
1173 : #ifdef WITH_MPI
1174 : call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1175 30840 : my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr)
1176 :
1177 : #else /* WITH_MPI */
1178 : ! careful, the "receive" has to be done at the corresponding wait or send
1179 : ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
1180 : #endif /* WITH_MPI */
1181 :
1182 : #else /* WITH_OPENMP */
1183 :
1184 : #ifdef WITH_MPI
1185 : call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1186 30840 : my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr)
1187 : #else /* WITH_MPI */
1188 : ! careful, the receive has to be done at the corresponding wait or send
1189 : ! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
1190 : #endif /* WITH_MPI */
1191 :
1192 : #endif /* WITH_OPENMP */
1193 :
1194 : enddo
1195 : #ifdef WITH_MPI
1196 15432 : if (wantDebug) call obj%timer%stop("mpi_communication")
1197 : #endif
1198 : endif
1199 :
1200 227016 : if (current_local_n > 1) then
1201 211584 : if (my_pcol == mod(sweep,np_cols)) then
1202 : bcast_buffer(:,1:current_local_n) = &
1203 211584 : hh_trans(:,current_tv_off+1:current_tv_off+current_local_n)
1204 211584 : current_tv_off = current_tv_off + current_local_n
1205 : endif
1206 :
1207 : #ifdef WITH_MPI
1208 135912 : if (wantDebug) call obj%timer%start("mpi_communication")
1209 : call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1210 135912 : mod(sweep,np_cols), mpi_comm_cols, mpierr)
1211 135912 : if (wantDebug) call obj%timer%stop("mpi_communication")
1212 :
1213 : #endif /* WITH_MPI */
1214 :
1215 211584 : if (useGPU) then
1216 : successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), &
1217 : nbw * current_local_n * &
1218 : size_of_datatype, &
1219 0 : cudaMemcpyHostToDevice)
1220 0 : if (.not.(successCUDA)) then
1221 : print *,"trans_ev_tridi_to_band_&
1222 : &MATH_DATATYPE&
1223 0 : &: error in cudaMemcpy"
1224 0 : stop 1
1225 : endif
1226 :
1227 : call extract_hh_tau_&
1228 : &MATH_DATATYPE&
1229 : &_gpu_&
1230 : &PRECISION &
1231 : !#if REALCASE == 1
1232 : (bcast_buffer_dev, hh_tau_dev, nbw, &
1233 : !#endif
1234 : !#if COMPLEXCASE == 1
1235 : ! ( nbw, &
1236 : !#endif
1237 0 : current_local_n, .false.)
1238 : call compute_hh_dot_products_&
1239 : &MATH_DATATYPE&
1240 : &_gpu_&
1241 : &PRECISION &
1242 : (bcast_buffer_dev, hh_dot_dev, nbw, &
1243 0 : current_local_n)
1244 : endif ! useGPU
1245 :
1246 : else ! (current_local_n > 1) then
1247 :
1248 : ! for current_local_n == 1 the one and only HH Vector is 0 and not stored in hh_trans_real/complex
1249 15432 : bcast_buffer(:,1) = 0.0_rck
1250 15432 : if (useGPU) then
1251 0 : successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_datatype)
1252 0 : if (.not.(successCUDA)) then
1253 : print *,"trans_ev_tridi_to_band_&
1254 : &MATH_DATATYPE&
1255 0 : &: error in cudaMemset"
1256 0 : stop 1
1257 : endif
1258 :
1259 : call extract_hh_tau_&
1260 : &MATH_DATATYPE&
1261 : &_gpu_&
1262 : &PRECISION&
1263 : &( &
1264 : bcast_buffer_dev, hh_tau_dev, &
1265 0 : nbw, 1, .true.)
1266 : endif ! useGPU
1267 : endif ! (current_local_n > 1) then
1268 :
1269 227016 : if (l_nev == 0) cycle
1270 :
1271 227016 : if (current_local_n > 0) then
1272 :
1273 1375488 : do i = 1, stripe_count
1274 : #ifdef WITH_OPENMP
1275 581952 : if (useGPU) then
1276 0 : print *,"trans_ev_tridi_to_band_real: not yet implemented"
1277 0 : stop 1
1278 : endif
1279 :
1280 : ! Get the real stripe width for stripe i;
1281 : ! The last OpenMP tasks may have an even smaller stripe width,
1282 : ! but we don't care about this, i.e. we send/recv a bit too much in this case.
1283 : ! csw: current_stripe_width
1284 :
1285 581952 : csw = min(stripe_width, thread_width-(i-1)*stripe_width)
1286 : #endif /* WITH_OPENMP */
1287 :
1288 : !wait_b
1289 1163904 : if (current_n_end < current_n) then
1290 :
1291 :
1292 : #ifdef WITH_OPENMP
1293 173424 : if (useGPU) then
1294 0 : print *,"trans_ev_tridi_to_band_real: not yet implemented"
1295 0 : stop 1
1296 : endif
1297 :
1298 : #ifdef WITH_MPI
1299 173424 : if (wantDebug) call obj%timer%start("mpi_communication")
1300 :
1301 173424 : call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
1302 173424 : if (wantDebug) call obj%timer%stop("mpi_communication")
1303 : #endif
1304 173424 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
1305 346848 : !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
1306 : do my_thread = 1, max_threads
1307 173424 : n_off = current_local_n+a_off
1308 173424 : b_len = csw*nbw
1309 173424 : b_off = (my_thread-1)*b_len
1310 : aIntern(1:csw,n_off+1:n_off+nbw,i,my_thread) = &
1311 346848 : reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /))
1312 : enddo
1313 : !$omp end parallel do
1314 173424 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
1315 : #else /* WITH_OPENMP */
1316 :
1317 : #ifdef WITH_MPI
1318 173424 : if (wantDebug) call obj%timer%start("mpi_communication")
1319 173424 : call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
1320 173424 : if (wantDebug) call obj%timer%stop("mpi_communication")
1321 :
1322 : #endif
1323 173424 : n_off = current_local_n+a_off
1324 :
1325 173424 : if (useGPU) then
1326 0 : dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype
1327 : successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
1328 : stripe_width*nbw* size_of_datatype, &
1329 0 : cudaMemcpyHostToDevice)
1330 0 : if (.not.(successCUDA)) then
1331 : print *,"trans_ev_tridi_to_band_&
1332 : &MATH_DATATYPE&
1333 0 : &: error in cudaMemcpy"
1334 0 : stop 1
1335 : endif
1336 :
1337 : else
1338 173424 : aIntern(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
1339 : endif
1340 :
1341 : #endif /* WITH_OPENMP */
1342 :
1343 346848 : if (next_n_end < next_n) then
1344 :
1345 : #ifdef WITH_OPENMP
1346 :
1347 142584 : if (useGPU) then
1348 0 : print *,"trans_ev_tridi_to_band_real: not yet implemented"
1349 0 : stop 1
1350 : endif
1351 : #ifdef WITH_MPI
1352 142584 : if (wantDebug) call obj%timer%start("mpi_communication")
1353 : call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1354 142584 : my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr)
1355 142584 : if (wantDebug) call obj%timer%stop("mpi_communication")
1356 :
1357 : #else /* WTIH_MPI */
1358 : ! careful, the receive has to be done at the corresponding wait or send
1359 : ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
1360 :
1361 : #endif /* WITH_MPI */
1362 :
1363 : #else /* WITH_OPENMP */
1364 :
1365 : #ifdef WITH_MPI
1366 142584 : if (wantDebug) call obj%timer%start("mpi_communication")
1367 : call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1368 142584 : my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr)
1369 142584 : if (wantDebug) call obj%timer%stop("mpi_communication")
1370 :
1371 : #else /* WITH_MPI */
1372 :
1373 : !! careful, the receive has to be done at the corresponding wait or send
1374 : !! bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i)
1375 :
1376 : #endif /* WITH_MPI */
1377 :
1378 : #endif /* WITH_OPENMP */
1379 : endif
1380 : endif
1381 :
1382 1163904 : if (current_local_n <= bottom_msg_length + top_msg_length) then
1383 :
1384 : !wait_t
1385 0 : if (top_msg_length>0) then
1386 :
1387 : #ifdef WITH_OPENMP
1388 0 : if (useGPU) then
1389 : print *,"trans_ev_tridi_to_band_&
1390 : &MATH_DATATYPE&
1391 0 : &: not yet implemented"
1392 0 : stop 1
1393 : endif
1394 : #ifdef WITH_MPI
1395 0 : if (wantDebug) call obj%timer%start("mpi_communication")
1396 :
1397 0 : call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
1398 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
1399 : #endif
1400 :
1401 : #else /* WITH_OPENMP */
1402 :
1403 : #ifdef WITH_MPI
1404 0 : if (wantDebug) call obj%timer%start("mpi_communication")
1405 0 : call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
1406 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
1407 : #endif
1408 :
1409 0 : if (useGPU) then
1410 0 : dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
1411 : ! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
1412 : successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), &
1413 : stripe_width*top_msg_length* size_of_datatype, &
1414 0 : cudaMemcpyHostToDevice)
1415 0 : if (.not.(successCUDA)) then
1416 : print *,"trans_ev_tridi_to_band_&
1417 : &MATH_DATATYPE&
1418 0 : &: error in cudaMemcpy"
1419 0 : stop 1
1420 : endif
1421 : else ! useGPU
1422 0 : aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
1423 : endif ! useGPU
1424 : #endif /* WITH_OPENMP */
1425 : endif ! top_msg_length
1426 :
1427 : !compute
1428 : #ifdef WITH_OPENMP
1429 :
1430 0 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
1431 :
1432 0 : !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
1433 : do my_thread = 1, max_threads
1434 0 : if (top_msg_length>0) then
1435 0 : b_len = csw*top_msg_length
1436 0 : b_off = (my_thread-1)*b_len
1437 : aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
1438 0 : reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
1439 : endif
1440 :
1441 : call compute_hh_trafo_&
1442 : &MATH_DATATYPE&
1443 : &_openmp_&
1444 : &PRECISION &
1445 : (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, &
1446 : l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
1447 : #if REALCASE == 1
1448 : hh_dot_dev, &
1449 : #endif
1450 : hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, &
1451 0 : i, my_thread, thread_width, kernel)
1452 : enddo
1453 : !$omp end parallel do
1454 0 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
1455 :
1456 : #else /* WITH_OPENMP */
1457 :
1458 : call compute_hh_trafo_&
1459 : &MATH_DATATYPE&
1460 : &_&
1461 : &PRECISION&
1462 : & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
1463 : a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
1464 : #if REALCASE == 1
1465 : hh_dot_dev, &
1466 : #endif
1467 : hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, &
1468 0 : last_stripe_width, kernel)
1469 : #endif /* WITH_OPENMP */
1470 :
1471 : !send_b 1
1472 : #ifdef WITH_MPI
1473 0 : if (wantDebug) call obj%timer%start("mpi_communication")
1474 0 : call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
1475 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
1476 : #endif
1477 :
1478 0 : if (bottom_msg_length>0) then
1479 0 : n_off = current_local_n+nbw-bottom_msg_length+a_off
1480 : #ifdef WITH_OPENMP
1481 0 : b_len = csw*bottom_msg_length*max_threads
1482 : bottom_border_send_buffer(1:b_len,i) = &
1483 0 : reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
1484 : #ifdef WITH_MPI
1485 0 : if (wantDebug) call obj%timer%start("mpi_communication")
1486 : call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1487 0 : my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
1488 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
1489 : #else /* WITH_MPI */
1490 0 : if (next_top_msg_length > 0) then
1491 : top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* &
1492 0 : next_top_msg_length*max_threads,i)
1493 : endif
1494 :
1495 : #endif /* WITH_MPI */
1496 :
1497 : !#if REALCASE == 1
1498 : endif ! this endif is for bottom_msg_length; it is not present in the complex case
1499 : !#endif
1500 :
1501 : #else /* WITH_OPENMP */
1502 :
1503 0 : if (useGPU) then
1504 0 : dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
1505 : successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
1506 : stripe_width * bottom_msg_length * size_of_datatype, &
1507 0 : cudaMemcpyDeviceToHost)
1508 0 : if (.not.(successCUDA)) then
1509 : print *,"trans_ev_tridi_to_band_&
1510 : &MATH_DATATYPE&
1511 0 : &: error in cudaMemcpy"
1512 0 : stop 1
1513 : endif
1514 : else
1515 0 : bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
1516 : endif
1517 : #ifdef WITH_MPI
1518 0 : if (wantDebug) call obj%timer%start("mpi_communication")
1519 : call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, &
1520 0 : MPI_MATH_DATATYPE_PRECISION_EXPL, my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
1521 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
1522 :
1523 : #else /* WITH_MPI */
1524 0 : if (next_top_msg_length > 0) then
1525 : top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
1526 0 : bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
1527 : endif
1528 :
1529 : #endif /* WITH_MPI */
1530 : endif
1531 : #endif /* WITH_OPENMP */
1532 :
1533 : else ! current_local_n <= bottom_msg_length + top_msg_length
1534 :
1535 : !compute
1536 : #ifdef WITH_OPENMP
1537 581952 : if (useGPU) then
1538 0 : print *,"trans_ev_tridi_to_band_real: not yet implemented"
1539 0 : stop 1
1540 : endif
1541 581952 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
1542 :
1543 1163904 : !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
1544 : do my_thread = 1, max_threads
1545 :
1546 : call compute_hh_trafo_&
1547 : &MATH_DATATYPE&
1548 : &_openmp_&
1549 : &PRECISION&
1550 : & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
1551 : nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
1552 : #if REALCASE == 1
1553 : hh_dot_dev, &
1554 : #endif
1555 : hh_tau_dev, kernel_flops, kernel_time, n_times, current_local_n - bottom_msg_length, &
1556 581952 : bottom_msg_length, i, my_thread, thread_width, kernel)
1557 : enddo
1558 : !$omp end parallel do
1559 581952 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
1560 :
1561 : !send_b
1562 : #ifdef WITH_MPI
1563 377688 : if (wantDebug) call obj%timer%start("mpi_communication")
1564 377688 : call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
1565 377688 : if (wantDebug) call obj%timer%stop("mpi_communication")
1566 : #endif
1567 581952 : if (bottom_msg_length > 0) then
1568 142584 : n_off = current_local_n+nbw-bottom_msg_length+a_off
1569 142584 : b_len = csw*bottom_msg_length*max_threads
1570 : bottom_border_send_buffer(1:b_len,i) = &
1571 142584 : reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
1572 : #ifdef WITH_MPI
1573 142584 : if (wantDebug) call obj%timer%start("mpi_communication")
1574 : call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1575 142584 : my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
1576 142584 : if (wantDebug) call obj%timer%stop("mpi_communication")
1577 :
1578 : #else /* WITH_MPI */
1579 0 : if (next_top_msg_length > 0) then
1580 : top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* &
1581 : next_top_msg_length*&
1582 0 : max_threads,i)
1583 : endif
1584 : #endif /* WITH_MPI */
1585 : endif
1586 :
1587 : #else /* WITH_OPENMP */
1588 :
1589 : call compute_hh_trafo_&
1590 : &MATH_DATATYPE&
1591 : &_&
1592 : &PRECISION&
1593 : & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
1594 : a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
1595 : #if REALCASE == 1
1596 : hh_dot_dev, &
1597 : #endif
1598 : hh_tau_dev, kernel_flops, kernel_time, n_times, &
1599 : current_local_n - bottom_msg_length, bottom_msg_length, i, &
1600 581952 : last_stripe_width, kernel)
1601 :
1602 :
1603 :
1604 : !send_b
1605 : #ifdef WITH_MPI
1606 377688 : if (wantDebug) call obj%timer%start("mpi_communication")
1607 :
1608 377688 : call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
1609 377688 : if (wantDebug) call obj%timer%stop("mpi_communication")
1610 : #endif
1611 581952 : if (bottom_msg_length > 0) then
1612 142584 : n_off = current_local_n+nbw-bottom_msg_length+a_off
1613 :
1614 142584 : if (useGPU) then
1615 0 : dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
1616 : successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
1617 : stripe_width*bottom_msg_length* size_of_datatype, &
1618 0 : cudaMemcpyDeviceToHost)
1619 0 : if (.not.(successCUDA)) then
1620 : print *,"trans_ev_tridi_to_band_&
1621 : &MATH_DATATYPE&
 1622           0 :                     &: error in cudaMemcpy"
1623 0 : stop 1
1624 : endif
1625 : else
1626 142584 : bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
1627 : endif
1628 :
1629 : #ifdef WITH_MPI
1630 142584 : if (wantDebug) call obj%timer%start("mpi_communication")
1631 : call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, &
1632 142584 : MPI_MATH_DATATYPE_PRECISION_EXPL, my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
1633 142584 : if (wantDebug) call obj%timer%stop("mpi_communication")
1634 : #else /* WITH_MPI */
1635 0 : if (next_top_msg_length > 0) then
1636 : top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
1637 0 : bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
1638 : endif
1639 :
1640 : #endif /* WITH_MPI */
1641 :
1642 : #if REALCASE == 1
1643 : endif
1644 : #endif
1645 :
1646 : #endif /* WITH_OPENMP */
1647 :
1648 : #ifndef WITH_OPENMP
1649 : #if COMPLEXCASE == 1
1650 : endif
1651 : #endif
1652 : #endif
1653 : !compute
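                  :       ! Update the middle part of the stripe, i.e. the rows between the
                  :       ! top and bottom border regions.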
1654 : #ifdef WITH_OPENMP
1655 :
1656 581952 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
1657 :
1658 1163904 : !$omp parallel do private(my_thread), schedule(static, 1)
1659 : do my_thread = 1, max_threads
1660 : call compute_hh_trafo_&
1661 : &MATH_DATATYPE&
1662 : &_openmp_&
1663 : &PRECISION&
1664 : & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, &
1665 : nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
1666 : #if REALCASE == 1
1667 : hh_dot_dev, &
1668 : #endif
1669 : hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,&
1670 : current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, &
1671 581952 : kernel)
1672 : enddo
1673 : !$omp end parallel do
1674 581952 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
1675 :
1676 : #else /* WITH_OPENMP */
1677 :
1678 : call compute_hh_trafo_&
1679 : &MATH_DATATYPE&
1680 : &_&
1681 : &PRECISION&
1682 : & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
1683 : a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
1684 : #if REALCASE == 1
1685 : hh_dot_dev, &
1686 : #endif
1687 : hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length, &
1688 : current_local_n-top_msg_length-bottom_msg_length, i, &
1689 581952 : last_stripe_width, kernel)
1690 :
1691 : #endif /* WITH_OPENMP */
1692 :
1693 : !wait_t
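                  :       ! Wait for the top border rows sent by the process row above and copy
                  :       ! them into aIntern before the top part of the stripe is updated.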
1694 959640 : if (top_msg_length>0) then
1695 : #ifdef WITH_OPENMP
1696 :
1697 : #ifdef WITH_MPI
1698 142584 : if (wantDebug) call obj%timer%start("mpi_communication")
1699 142584 : call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
1700 142584 : if (wantDebug) call obj%timer%stop("mpi_communication")
1701 : #endif
1702 :
1703 : #else /* WITH_OPENMP */
1704 :
1705 : #ifdef WITH_MPI
1706 142584 : if (wantDebug) call obj%timer%start("mpi_communication")
1707 142584 : call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
1708 142584 : if (wantDebug) call obj%timer%stop("mpi_communication")
1709 : #endif
1710 142584 : if (useGPU) then
1711 0 : dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
1712 : successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), &
1713 : stripe_width * top_msg_length * size_of_datatype, &
1714 0 : cudaMemcpyHostToDevice)
1715 0 : if (.not.(successCUDA)) then
1716 : print *,"trans_ev_tridi_to_band_&
1717 : &MATH_DATATYPE&
1718 0 : &: error in cudaMemcpy"
1719 0 : stop 1
1720 : endif
1721 : else
1722 142584 : aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
1723 : endif
1724 : #endif /* WITH_OPENMP */
1725 : endif
1726 :
1727 : !compute
1728 : #ifdef WITH_OPENMP
1729 :
1730 581952 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
1731 :
1732 1163904 : !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
1733 : do my_thread = 1, max_threads
1734 581952 : if (top_msg_length>0) then
1735 142584 : b_len = csw*top_msg_length
1736 142584 : b_off = (my_thread-1)*b_len
1737 : aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
1738 142584 : reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
1739 : endif
1740 : call compute_hh_trafo_&
1741 : &MATH_DATATYPE&
1742 : &_openmp_&
1743 : &PRECISION&
1744 : & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
1745 : nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
1746 : #if REALCASE == 1
1747 : hh_dot_dev, &
1748 : #endif
1749 : hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, my_thread, &
1750 1163904 : thread_width, kernel)
1751 : enddo
1752 : !$omp end parallel do
1753 581952 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
1754 :
1755 : #else /* WITH_OPENMP */
1756 :
1757 : call compute_hh_trafo_&
1758 : &MATH_DATATYPE&
1759 : &_&
1760 : &PRECISION&
1761 : & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
1762 : a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
1763 : #if REALCASE == 1
1764 : hh_dot_dev, &
1765 : #endif
1766 : hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, &
1767 581952 : last_stripe_width, kernel)
1768 :
1769 : #endif /* WITH_OPENMP */
1770 : endif
1771 :
1772 959640 : if (next_top_msg_length > 0) then
1773 : !request top_border data
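                  :       ! Post the nonblocking receive for the top border rows that will be
                  :       ! needed in the next sweep.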
1774 : #ifdef WITH_OPENMP
1775 :
1776 142584 : b_len = csw*next_top_msg_length*max_threads
1777 : #ifdef WITH_MPI
1778 142584 : if (wantDebug) call obj%timer%start("mpi_communication")
1779 : call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1780 142584 : my_prow-1, top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
1781 142584 : if (wantDebug) call obj%timer%stop("mpi_communication")
1782 : #else /* WITH_MPI */
 1783             : !             careful: the "receive" has to be done at the corresponding wait or send
1784 : ! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = &
1785 : ! bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i)
1786 : #endif /* WITH_MPI */
1787 :
1788 : #else /* WITH_OPENMP */
1789 :
1790 : #ifdef WITH_MPI
1791 142584 : if (wantDebug) call obj%timer%start("mpi_communication")
1792 : call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, &
1793 142584 : MPI_MATH_DATATYPE_PRECISION_EXPL, my_prow-1, top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
1794 142584 : if (wantDebug) call obj%timer%stop("mpi_communication")
1795 : #else /* WITH_MPI */
 1796             : !          careful: the "receive" has to be done at the corresponding wait or send
1797 : ! top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
1798 : ! bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
1799 : #endif /* WITH_MPI */
1800 :
1801 : #endif /* WITH_OPENMP */
1802 :
1803 : endif
1804 :
1805 : !send_t
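                  :       ! All process rows except the first send their top nbw rows to the
                  :       ! process row above, where they arrive as that row's bottom border.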
1806 1163904 : if (my_prow > 0) then
1807 : #ifdef WITH_OPENMP
1808 :
1809 : #ifdef WITH_MPI
1810 173424 : if (wantDebug) call obj%timer%start("mpi_communication")
1811 173424 : call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
1812 173424 : if (wantDebug) call obj%timer%stop("mpi_communication")
1813 : #endif
1814 173424 : b_len = csw*nbw*max_threads
1815 173424 : top_border_send_buffer(1:b_len,i) = reshape(aIntern(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /))
1816 :
1817 : #ifdef WITH_MPI
1818 173424 : if (wantDebug) call obj%timer%start("mpi_communication")
1819 : call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1820 173424 : my_prow-1, bottom_recv_tag, mpi_comm_rows, top_send_request(i), mpierr)
1821 173424 : if (wantDebug) call obj%timer%stop("mpi_communication")
1822 : #else /* WITH_MPI */
1823 0 : if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
1824 0 : bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
1825 : endif
1826 0 : if (next_n_end < next_n) then
1827 0 : bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
1828 : endif
1829 : #endif /* WITH_MPI */
1830 :
1831 : #else /* WITH_OPENMP */
1832 :
1833 : #ifdef WITH_MPI
1834 173424 : if (wantDebug) call obj%timer%start("mpi_communication")
1835 173424 : call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
1836 173424 : if (wantDebug) call obj%timer%stop("mpi_communication")
1837 : #endif
1838 173424 : if (useGPU) then
1839 0 : dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
1840 : successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
1841 : stripe_width*nbw * size_of_datatype, &
1842 0 : cudaMemcpyDeviceToHost)
1843 0 : if (.not.(successCUDA)) then
1844 : print *,"trans_ev_tridi_to_band_&
1845 : &MATH_DATATYPE&
1846 0 : &: error in cudaMemcpy"
1847 0 : stop 1
1848 : endif
1849 :
1850 : else
1851 173424 : top_border_send_buffer(:,1:nbw,i) = aIntern(:,a_off+1:a_off+nbw,i)
1852 : endif
1853 : #ifdef WITH_MPI
1854 173424 : if (wantDebug) call obj%timer%start("mpi_communication")
1855 : call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1856 173424 : my_prow-1, bottom_recv_tag, mpi_comm_rows, top_send_request(i), mpierr)
1857 173424 : if (wantDebug) call obj%timer%stop("mpi_communication")
1858 : #else /* WITH_MPI */
1859 0 : if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
1860 0 : bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
1861 : endif
1862 0 : if (next_n_end < next_n) then
1863 0 : bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i)
1864 : endif
1865 : #endif /* WITH_MPI */
1866 :
1867 : #endif /* WITH_OPENMP */
1868 : endif
1869 :
 1870             :       ! Make sure that there are not too many outstanding top_recv_requests
1871 1163904 : if (stripe_count > 1) then
1872 755376 : if (i>1) then
1873 :
1874 : #ifdef WITH_MPI
1875 619464 : if (wantDebug) call obj%timer%start("mpi_communication")
1876 619464 : call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr)
1877 619464 : if (wantDebug) call obj%timer%stop("mpi_communication")
1878 : #endif
1879 : else
1880 :
1881 : #ifdef WITH_MPI
1882 135912 : if (wantDebug) call obj%timer%start("mpi_communication")
1883 135912 : call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr)
1884 135912 : if (wantDebug) call obj%timer%stop("mpi_communication")
1885 : #endif
1886 :
1887 : endif
1888 : endif
1889 :
1890 : enddo
1891 :
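                  :       ! The top border length requested for the next sweep becomes the current one.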
1892 211584 : top_msg_length = next_top_msg_length
1893 :
1894 : else
1895 : ! wait for last top_send_request
1896 :
1897 : #ifdef WITH_MPI
1898 77112 : do i = 1, stripe_count
1899 61680 : if (wantDebug) call obj%timer%start("mpi_communication")
1900 61680 : call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
1901 61680 : if (wantDebug) call obj%timer%stop("mpi_communication")
1902 : enddo
1903 : #endif
1904 : endif
1905 :
 1906             :     ! Take care of the result
1907 :
1908 227016 : if (my_prow == 0) then
1909 :
1910 : ! topmost process sends nbw rows to destination processes
1911 :
1912 541392 : do j=0, nfact-1
1913 409104 : num_blk = sweep*nfact+j ! global number of destination block, 0 based
1914 409104 : if (num_blk*nblk >= na) exit
1915 :
1916 390048 : nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block
1917 :
1918 : #ifdef WITH_MPI
1919 195024 : if (wantDebug) call obj%timer%start("mpi_communication")
1920 195024 : call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr)
1921 195024 : if (wantDebug) call obj%timer%stop("mpi_communication")
1922 :
1923 : #endif
1924 390048 : dst = mod(num_blk, np_rows)
1925 :
1926 390048 : if (dst == 0) then
1927 292920 : if (useGPU) then
1928 0 : row_group_size = min(na - num_blk*nblk, nblk)
1929 : call pack_row_group_&
1930 : &MATH_DATATYPE&
1931 : &_gpu_&
1932 : &PRECISION&
1933 : &(row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
1934 0 : row_group(:, :), j * nblk + a_off, row_group_size)
1935 :
1936 0 : do i = 1, row_group_size
1937 0 : q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i)
1938 : enddo
1939 : else ! useGPU
1940 :
1941 4820712 : do i = 1, min(na - num_blk*nblk, nblk)
1942 : #ifdef WITH_OPENMP
1943 : call pack_row_&
1944 : &MATH_DATATYPE&
1945 : &_cpu_openmp_&
1946 : &PRECISION&
1947 2263896 : &(obj,aIntern, row, j*nblk+i+a_off, stripe_width, stripe_count, max_threads, thread_width, l_nev)
1948 : #else /* WITH_OPENMP */
1949 :
1950 : call pack_row_&
1951 : &MATH_DATATYPE&
1952 : &_cpu_&
1953 : &PRECISION&
1954 2263896 : &(obj,aIntern, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
1955 : #endif /* WITH_OPENMP */
1956 4527792 : q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
1957 : enddo
1958 : endif ! useGPU
1959 :
1960 : else ! (dst == 0)
1961 :
1962 97128 : if (useGPU) then
1963 : call pack_row_group_&
1964 : &MATH_DATATYPE&
1965 : &_gpu_&
1966 : &PRECISION&
1967 : &(row_group_dev, aIntern_dev, stripe_count, stripe_width, &
1968 : last_stripe_width, a_dim2, l_nev, &
1969 0 : result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
1970 :
1971 : else ! useGPU
1972 1651176 : do i = 1, nblk
 1973             : #ifdef WITH_OPENMP
1974 : call pack_row_&
1975 : &MATH_DATATYPE&
1976 : &_cpu_openmp_&
1977 : &PRECISION&
1978 : &(obj,aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, stripe_width, stripe_count, &
1979 777024 : max_threads, thread_width, l_nev)
1980 : #else /* WITH_OPENMP */
1981 : call pack_row_&
1982 : &MATH_DATATYPE&
1983 : &_cpu_&
1984 : &PRECISION&
1985 777024 : &(obj, aIntern, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
1986 : #endif /* WITH_OPENMP */
1987 : enddo
1988 : endif ! useGPU
1989 : #ifdef WITH_MPI
1990 97128 : if (wantDebug) call obj%timer%start("mpi_communication")
1991 : call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL, &
1992 97128 : dst, result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr)
1993 97128 : if (wantDebug) call obj%timer%stop("mpi_communication")
1994 :
1995 : #else /* WITH_MPI */
1996 0 : if (j+num_result_buffers < num_result_blocks) &
1997 0 : result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf)
1998 0 : if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
1999 0 : do j1 = 1, min(num_result_buffers, num_result_blocks)
2000 0 : result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf)
2001 : enddo
2002 : endif
2003 :
2004 : #endif /* WITH_MPI */
2005 : endif ! (dst == 0)
2006 : enddo !j=0, nfact-1
2007 :
2008 : else ! (my_prow == 0)
2009 :
2010 : ! receive and store final result
2011 :
2012 172800 : do j = num_bufs_recvd, num_result_blocks-1
2013 :
2014 141936 : nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block
2015 :
2016 : ! If there is still work to do, just test for the next result request
2017 : ! and leave the loop if it is not ready, otherwise wait for all
2018 : ! outstanding requests
2019 :
2020 141936 : if (next_local_n > 0) then
2021 :
2022 : #ifdef WITH_MPI
2023 78024 : if (wantDebug) call obj%timer%start("mpi_communication")
2024 78024 : call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr)
2025 78024 : if (wantDebug) call obj%timer%stop("mpi_communication")
2026 :
2027 : #else /* WITH_MPI */
2028 0 : flag = .true.
2029 : #endif
2030 :
2031 78024 : if (.not.flag) exit
2032 :
2033 : else ! (next_local_n > 0)
2034 : #ifdef WITH_MPI
2035 63912 : if (wantDebug) call obj%timer%start("mpi_communication")
2036 63912 : call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr)
2037 63912 : if (wantDebug) call obj%timer%stop("mpi_communication")
2038 : #endif
2039 : endif ! (next_local_n > 0)
2040 :
2041 : ! Fill result buffer into q
2042 97128 : num_blk = j*np_rows + my_prow ! global number of current block, 0 based
2043 1504536 : do i = 1, min(na - num_blk*nblk, nblk)
2044 1407408 : q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf)
2045 : enddo
2046 :
2047 : ! Queue result buffer again if there are outstanding blocks left
2048 : #ifdef WITH_MPI
2049 97128 : if (wantDebug) call obj%timer%start("mpi_communication")
2050 :
2051 97128 : if (j+num_result_buffers < num_result_blocks) &
2052 : call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL, &
2053 14880 : 0, result_recv_tag, mpi_comm_rows, result_recv_request(nbuf), mpierr)
2054 :
2055 : ! carefull the "recieve" has to be done at the corresponding wait or send
 2056             : !          careful: the "receive" has to be done at the corresponding wait or send
2057 : ! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf)
2058 97128 : if (wantDebug) call obj%timer%stop("mpi_communication")
2059 :
2060 : #else /* WITH_MPI */
2061 :
2062 : #endif /* WITH_MPI */
2063 :
2064 : enddo ! j = num_bufs_recvd, num_result_blocks-1
2065 75672 : num_bufs_recvd = j
2066 :
2067 : endif ! (my_prow == 0)
2068 :
2069 : ! Shift the remaining rows to the front of aIntern (if necessary)
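                  :     ! a_off grows by nbw - top_msg_length in every sweep; once the rows that are
                  :     ! still needed would no longer fit into aIntern, they are copied to the front
                  :     ! and a_off is reset to zero.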
2070 :
2071 227016 : offset = nbw - top_msg_length
2072 227016 : if (offset<0) then
2073 0 : if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
2074 : &MATH_DATATYPE&
2075 0 : &: internal error, offset for shifting = ',offset
2076 0 : success = .false.
2077 0 : return
2078 : endif
2079 :
2080 227016 : a_off = a_off + offset
2081 227016 : if (a_off + next_local_n + nbw > a_dim2) then
2082 : #ifdef WITH_OPENMP
2083 31968 : if (useGPU) then
2084 0 : print *,"trans_ev_tridi_to_band_real: not yet implemented"
 2085           0 :         print *,"trans_ev_tridi_to_band: not yet implemented"
2086 : endif
2087 :
2088 31968 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
2089 :
2090 63936 : !$omp parallel do private(my_thread, i, j), schedule(static, 1)
2091 : do my_thread = 1, max_threads
2092 165936 : do i = 1, stripe_count
2093 8208528 : do j = top_msg_length+1, top_msg_length+next_local_n
2094 400471872 : aIntern(:,j,i,my_thread) = aIntern(:,j+a_off,i,my_thread)
2095 : enddo
2096 : enddo
2097 : enddo
2098 : !$omp end parallel do
2099 31968 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
2100 :
2101 : #else /* WITH_OPENMP */
2102 165936 : do i = 1, stripe_count
2103 133968 : if (useGPU) then
2104 0 : chunk = min(next_local_n - 1, a_off)
2105 0 : do j = top_msg_length + 1, top_msg_length + next_local_n, chunk
2106 0 : top = min(j + chunk, top_msg_length + next_local_n)
2107 0 : this_chunk = top - j + 1
2108 0 : dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
2109 0 : dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
 2110             :             ! it is wasteful to set the value of the parameter "cudaMemcpyDeviceToDevice"
 2111             :             ! on every call here; this should be done ONCE at startup
2112 : ! tmp = cuda_d2d(1)
2113 : successCUDA = cuda_memcpy( aIntern_dev + dev_offset , aIntern_dev +dev_offset_1, &
2114 0 : stripe_width*this_chunk* size_of_datatype, cudaMemcpyDeviceToDevice)
2115 0 : if (.not.(successCUDA)) then
2116 : print *,"trans_ev_tridi_to_band_&
2117 : &MATH_DATATYPE&
 2118           0 :                 &: error in cudaMemcpy"
2119 0 : stop 1
2120 : endif
2121 : enddo
2122 : else ! not useGPU
2123 8208528 : do j = top_msg_length+1, top_msg_length+next_local_n
2124 8074560 : aIntern(:,j,i) = aIntern(:,j+a_off,i)
2125 : enddo
2126 : endif
2127 : enddo ! stripe_count
2128 : #endif /* WITH_OPENMP */
2129 :
2130 63936 : a_off = 0
2131 : endif
2132 : enddo
2133 :
2134 : ! Just for safety:
2135 : #ifdef WITH_MPI
2136 30864 : if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol
2137 30864 : if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol
2138 30864 : if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol
2139 30864 : if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol
2140 : #endif
2141 :
2142 30864 : if (my_prow == 0) then
2143 :
2144 : #ifdef WITH_MPI
2145 15432 : if (wantDebug) call obj%timer%start("mpi_communication")
2146 15432 : call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr)
2147 15432 : if (wantDebug) call obj%timer%stop("mpi_communication")
2148 : #endif
2149 : endif
2150 :
2151 : #ifdef WITH_MPI
2152 30864 : if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol
2153 30864 : if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol
2154 :
2155 : #ifdef HAVE_DETAILED_TIMINGS
2156 30864 : call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_ROWS, mpierr)
2157 30864 : kernel_flops = kernel_flops_recv
2158 30864 : call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_COLS, mpierr)
2159 30864 : kernel_flops = kernel_flops_recv
2160 :
2161 30864 : call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_ROWS, mpierr)
 2162       30864 :       kernel_time = kernel_time_recv
2163 30864 : call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_COLS, mpierr)
 2164       30864 :       kernel_time = kernel_time_recv
2165 : #endif
2166 :
2167 : #else /* WITH_MPI */
2168 :
2169 15432 : call obj%get("print_flops",print_flops)
2170 15432 : if (my_prow==0 .and. my_pcol==0 .and.print_flops == 1) &
2171 2208 : write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6
2172 :
2173 : #endif /* WITH_MPI */
2174 :
2175 46296 : if (useGPU) then
2176 : ! copy q to q_dev needed in trans_ev_band_to_full
2177 0 : successCUDA = cuda_malloc(q_dev, ldq*matrixCols* size_of_datatype)
2178 0 : if (.not.(successCUDA)) then
2179 : print *,"trans_ev_tridi_to_band_&
2180 : &MATH_DATATYPE&
2181 0 : &: error in cudaMalloc"
2182 0 : stop 1
2183 : endif
2184 :
 2185             :       ! copy q to the device; maybe this can be avoided if q_dev can be kept on the device in trans_ev_tridi_to_band
2186 : successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
2187 0 : cudaMemcpyHostToDevice)
2188 0 : if (.not.(successCUDA)) then
2189 : print *,"trans_ev_tridi_to_band_&
2190 : &MATH_DATATYPE&
 2191           0 :         &: error in cudaMemcpy"
2192 0 : stop 1
2193 : endif
2194 : ! endif
2195 : endif !use GPU
2196 :
2197 : ! deallocate all working space
2198 :
2199 46296 : if (.not.(useGPU)) then
2200 46296 : nullify(aIntern)
2201 46296 : call free(aIntern_ptr)
2202 : endif
2203 :
2204 46296 : deallocate(row, stat=istat, errmsg=errorMessage)
2205 46296 : if (istat .ne. 0) then
2206 : print *,"trans_ev_tridi_to_band_&
2207 : &MATH_DATATYPE&
2208 0 : &: error when deallocating row "//errorMessage
2209 0 : stop 1
2210 : endif
2211 :
2212 46296 : deallocate(limits, stat=istat, errmsg=errorMessage)
2213 46296 : if (istat .ne. 0) then
2214 : print *,"trans_ev_tridi_to_band_&
2215 : &MATH_DATATYPE&
2216 0 : &: error when deallocating limits"//errorMessage
2217 0 : stop 1
2218 : endif
2219 :
2220 46296 : deallocate(result_send_request, stat=istat, errmsg=errorMessage)
2221 46296 : if (istat .ne. 0) then
2222 : print *,"trans_ev_tridi_to_band_&
2223 : &MATH_DATATYPE&
2224 0 : &: error when deallocating result_send_request "//errorMessage
2225 0 : stop 1
2226 : endif
2227 :
2228 46296 : deallocate(result_recv_request, stat=istat, errmsg=errorMessage)
2229 46296 : if (istat .ne. 0) then
2230 : print *,"trans_ev_tridi_to_band_&
2231 : &MATH_DATATYPE&
2232 0 : &: error when deallocating result_recv_request "//errorMessage
2233 0 : stop 1
2234 : endif
2235 :
2236 46296 : deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage)
2237 46296 : if (istat .ne. 0) then
2238 : print *,"trans_ev_tridi_to_band_&
2239 : &MATH_DATATYPE&
2240 0 : &: error when deallocating top_border_send_buffer "//errorMessage
2241 0 : stop 1
2242 : endif
2243 :
2244 46296 : deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage)
2245 46296 : if (istat .ne. 0) then
2246 : print *,"trans_ev_tridi_to_band_&
2247 : &MATH_DATATYPE&
2248 0 : &: error when deallocating top_border_recv_buffer "//errorMessage
2249 0 : stop 1
2250 : endif
2251 :
2252 46296 : deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage)
2253 46296 : if (istat .ne. 0) then
2254 : print *,"trans_ev_tridi_to_band_&
2255 : &MATH_DATATYPE&
2256 0 : &: error when deallocating bottom_border_send_buffer "//errorMessage
2257 0 : stop 1
2258 : endif
2259 :
2260 46296 : deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage)
2261 46296 : if (istat .ne. 0) then
2262 : print *,"trans_ev_tridi_to_band_&
2263 : &MATH_DATATYPE&
2264 0 : &: error when deallocating bottom_border_recv_buffer "//errorMessage
2265 0 : stop 1
2266 : endif
2267 :
2268 46296 : deallocate(result_buffer, stat=istat, errmsg=errorMessage)
2269 46296 : if (istat .ne. 0) then
2270 : print *,"trans_ev_tridi_to_band_&
2271 : &MATH_DATATYPE&
2272 0 : &: error when deallocating result_buffer "//errorMessage
2273 0 : stop 1
2274 : endif
2275 :
2276 46296 : deallocate(bcast_buffer, stat=istat, errmsg=errorMessage)
2277 46296 : if (istat .ne. 0) then
2278 : print *,"trans_ev_tridi_to_band_&
2279 : &MATH_DATATYPE&
2280 0 : &: error when deallocating bcast_buffer "//errorMessage
2281 0 : stop 1
2282 : endif
2283 :
2284 46296 : deallocate(top_send_request, stat=istat, errmsg=errorMessage)
2285 46296 : if (istat .ne. 0) then
2286 : print *,"trans_ev_tridi_to_band_&
2287 : &MATH_DATATYPE&
2288 0 : &: error when deallocating top_send_request "//errorMessage
2289 0 : stop 1
2290 : endif
2291 :
2292 46296 : deallocate(top_recv_request, stat=istat, errmsg=errorMessage)
2293 46296 : if (istat .ne. 0) then
2294 : print *,"trans_ev_tridi_to_band_&
2295 : &MATH_DATATYPE&
2296 0 : &: error when deallocating top_recv_request "//errorMessage
2297 0 : stop 1
2298 : endif
2299 :
2300 46296 : deallocate(bottom_send_request, stat=istat, errmsg=errorMessage)
2301 46296 : if (istat .ne. 0) then
2302 : print *,"trans_ev_tridi_to_band_&
2303 : &MATH_DATATYPE&
2304 0 : &: error when deallocating bottom_send_request "//errorMessage
2305 0 : stop 1
2306 : endif
2307 :
2308 46296 : deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage)
2309 46296 : if (istat .ne. 0) then
2310 : print *,"trans_ev_tridi_to_band_&
2311 : &MATH_DATATYPE&
2312 0 : &: error when deallocating bottom_recv_request "//errorMessage
2313 0 : stop 1
2314 : endif
2315 :
2316 46296 : if (useGPU) then
2317 : #if COMPLEXCASE == 1
 2318             :       ! should this not be done always?
2319 0 : successCUDA = cuda_free(aIntern_dev)
2320 0 : if (.not.(successCUDA)) then
2321 0 : print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
2322 0 : stop 1
2323 : endif
2324 : #endif
2325 0 : successCUDA = cuda_free(hh_dot_dev)
2326 0 : if (.not.(successCUDA)) then
2327 : print *,"trans_ev_tridi_to_band_&
2328 : &MATH_DATATYPE&
 2329           0 :       &: error in cudaFree "//errorMessage
2330 0 : stop 1
2331 : endif
2332 :
2333 0 : successCUDA = cuda_free(hh_tau_dev)
2334 0 : if (.not.(successCUDA)) then
2335 : print *,"trans_ev_tridi_to_band_&
2336 : &MATH_DATATYPE&
2337 0 : &: error in cudaFree "//errorMessage
2338 0 : stop 1
2339 : endif
2340 :
2341 0 : successCUDA = cuda_free(row_dev)
2342 0 : if (.not.(successCUDA)) then
2343 : print *,"trans_ev_tridi_to_band_&
2344 : &MATH_DATATYPE&
2345 0 : &: error in cudaFree "//errorMessage
2346 0 : stop 1
2347 : endif
2348 :
2349 0 : deallocate(row_group, stat=istat, errmsg=errorMessage)
2350 0 : if (istat .ne. 0) then
2351 : print *,"trans_ev_tridi_to_band_&
2352 : &MATH_DATATYPE&
2353 0 : &: error when deallocating row_group "//errorMessage
2354 0 : stop 1
2355 : endif
2356 :
2357 0 : successCUDA = cuda_free(row_group_dev)
2358 0 : if (.not.(successCUDA)) then
2359 : print *,"trans_ev_tridi_to_band_&
2360 : &MATH_DATATYPE&
2361 0 : &: error in cudaFree "//errorMessage
2362 0 : stop 1
2363 : endif
2364 :
2365 0 : successCUDA = cuda_free(bcast_buffer_dev)
2366 0 : if (.not.(successCUDA)) then
2367 : print *,"trans_ev_tridi_to_band_&
2368 : &MATH_DATATYPE&
2369 0 : &: error in cudaFree "//errorMessage
2370 0 : stop 1
2371 : endif
2372 : endif ! useGPU
2373 :
2374 :
2375 : call obj%timer%stop("trans_ev_tridi_to_band_&
2376 : &MATH_DATATYPE&
2377 : &" // &
2378 : &PRECISION_SUFFIX&
2379 46296 : )
2380 :
2381 46296 : return
2382 : !#if COMPLEXCASE == 1
2383 : ! contains
2384 : ! ! The host wrapper for extracting "tau" from the HH reflectors (see the
2385 : ! ! kernel below)
2386 : ! subroutine extract_hh_tau_complex_gpu_&
2387 : ! &PRECISION&
2388 : ! &(nbw, n, is_zero)
2389 : ! use cuda_c_kernel
2390 : ! use pack_unpack_gpu
2391 : ! use precision
2392 : ! implicit none
2393 : ! integer(kind=ik), value :: nbw, n
2394 : ! logical, value :: is_zero
2395 : ! integer(kind=ik) :: val_is_zero
2396 : !
2397 : ! if (is_zero) then
2398 : ! val_is_zero = 1
2399 : ! else
2400 : ! val_is_zero = 0
2401 : ! endif
2402 : ! call launch_extract_hh_tau_c_kernel_complex_&
2403 : ! &PRECISION&
2404 : ! &(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
2405 : ! end subroutine
2406 : !#endif /* COMPLEXCASE */
2407 :
2408 46296 : end subroutine
2409 :
2410 : ! vim: syntax=fortran
|