Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : ! This particular source code file contains additions, changes and
20 : ! enhancements authored by Intel Corporation which is not part of
21 : ! the ELPA consortium.
22 : !
23 : ! More information can be found here:
24 : ! http://elpa.mpcdf.mpg.de/
25 : !
26 : ! ELPA is free software: you can redistribute it and/or modify
27 : ! it under the terms of the version 3 of the license of the
28 : ! GNU Lesser General Public License as published by the Free
29 : ! Software Foundation.
30 : !
31 : ! ELPA is distributed in the hope that it will be useful,
32 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
33 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 : ! GNU Lesser General Public License for more details.
35 : !
36 : ! You should have received a copy of the GNU Lesser General Public License
37 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38 : !
39 : ! ELPA reflects a substantial effort on the part of the original
40 : ! ELPA consortium, and we ask you to respect the spirit of the
41 : ! license that we chose: i.e., please contribute any changes you
42 : ! may have back to the original ELPA library distribution, and keep
43 : ! any derivatives of ELPA under the same license that we chose for
44 : ! the original distribution, the GNU Lesser General Public License.
45 : !
46 : ! Copyright of the original code rests with the authors inside the ELPA
47 : ! consortium. The copyright of any additional modifications shall rest
48 : ! with their original authors, but shall adhere to the licensing terms
49 : ! distributed along with the original code in the file "COPYING".
50 : #endif
51 :
52 : #include "../general/sanity.F90"
53 :
54 : #if REALCASE == 1
55 : subroutine tridiag_band_real_&
56 : #endif
57 : #if COMPLEXCASE == 1
58 : subroutine tridiag_band_complex_&
59 : #endif
60 47448 : &PRECISION &
61 47448 : (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
62 : hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug)
63 : !-------------------------------------------------------------------------------
64 : ! tridiag_band_real/complex:
65 : ! Reduces a real symmetric (or complex Hermitian) band matrix to tridiagonal form
66 : !
67 : ! na Order of matrix a
68 : !
69 : ! nb Semi bandwidth
70 : !
71 : ! nblk blocksize of cyclic distribution, must be the same in both directions!
72 : !
73 : ! aMatrix(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal
74 : !
75 : ! lda Leading dimension of a
76 : ! matrixCols local columns of matrix a
77 : !
78 : ! hh_trans : Householder vectors
79 : !
80 : ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output)
81 : !
82 : ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
83 : !
84 : ! mpi_comm_rows
85 : ! mpi_comm_cols
86 : ! MPI-Communicators for rows/columns
87 : ! communicator
88 : ! MPI-Communicator for the total processor set
89 : !-------------------------------------------------------------------------------
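 : !
 : ! Hedged usage sketch (illustrative only, not part of the library source):
 : ! assuming the PRECISION suffix expands to "double" for the real case, a
 : ! call of this routine could look like
 : !
 : !    call tridiag_band_real_double(obj, na, nb, nblk, aMatrix, a_dev, lda, &
 : !                                  d, e, matrixCols, hh_trans, &
 : !                                  mpi_comm_rows, mpi_comm_cols, &
 : !                                  communicator, .false., .false.)
 : !
 : ! with useGPU = .false. and wantDebug = .false.; the expanded name and the
 : ! argument kinds depend on the build configuration.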
90 : use elpa_abstract_impl
91 : use elpa2_workload
92 : use precision
93 : use iso_c_binding
94 : use redist
95 : implicit none
96 : #include "../general/precision_kinds.F90"
97 : class(elpa_abstract_impl_t), intent(inout) :: obj
98 : logical, intent(in) :: useGPU, wantDebug
99 : integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
100 : #ifdef USE_ASSUMED_SIZE
101 : MATH_DATATYPE(kind=rck), intent(in) :: aMatrix(lda,*)
102 : #else
103 : MATH_DATATYPE(kind=rck), intent(in) :: aMatrix(lda,matrixCols)
104 : #endif
105 : integer(kind=c_intptr_t) :: a_dev
106 : real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0
107 : MATH_DATATYPE(kind=rck), intent(out), allocatable :: hh_trans(:,:)
108 :
109 : real(kind=rk) :: vnorm2
110 284688 : MATH_DATATYPE(kind=rck) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
111 189792 : MATH_DATATYPE(kind=rck) :: hd(nb), hs(nb)
112 :
113 : integer(kind=ik) :: i, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
114 : integer(kind=ik) :: my_pe, n_pes, mpierr
115 : integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols
116 : integer(kind=ik) :: ireq_ab, ireq_hv
117 : integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
118 : #ifdef WITH_OPENMP
119 : integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter
120 : #ifdef WITH_MPI
121 : ! integer(kind=ik) :: my_mpi_status(MPI_STATUS_SIZE)
122 : #endif
123 : ! integer(kind=ik), allocatable :: mpi_statuses(:,:), global_id_tmp(:,:)
124 23724 : integer(kind=ik), allocatable :: global_id_tmp(:,:)
125 23724 : integer(kind=ik), allocatable :: omp_block_limits(:)
126 47448 : MATH_DATATYPE(kind=rck), allocatable :: hv_t(:,:), tau_t(:)
127 : integer(kind=ik) :: omp_get_max_threads
128 : #endif /* WITH_OPENMP */
129 142344 : integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
130 71172 : integer(kind=ik), allocatable :: limits(:), snd_limits(:,:)
131 47448 : integer(kind=ik), allocatable :: block_limits(:)
132 94896 : MATH_DATATYPE(kind=rck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
133 : integer :: istat
134 : character(200) :: errorMessage
135 :
136 : #ifndef WITH_MPI
137 : integer(kind=ik) :: startAddr
138 : #endif
139 : call obj%timer%start("tridiag_band_&
140 : &MATH_DATATYPE&
141 : &" // &
142 : &PRECISION_SUFFIX &
143 47448 : )
144 47448 : if (wantDebug) call obj%timer%start("mpi_communication")
145 47448 : call mpi_comm_rank(communicator,my_pe,mpierr)
146 47448 : call mpi_comm_size(communicator,n_pes,mpierr)
147 :
148 47448 : call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
149 47448 : call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
150 47448 : call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
151 47448 : call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
152 47448 : if (wantDebug) call obj%timer%stop("mpi_communication")
153 :
154 : ! Get global_id mapping 2D processor coordinates to global id
155 :
156 47448 : allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
157 47448 : if (istat .ne. 0) then
158 : print *,"tridiag_band_&
159 : &MATH_DATATYPE&
160 0 : &: error when allocating global_id "//errorMessage
161 0 : stop 1
162 : endif
163 :
164 47448 : global_id(:,:) = 0
165 47448 : global_id(my_prow, my_pcol) = my_pe
166 :
167 : #ifdef WITH_OPENMP
168 23724 : allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
169 23724 : if (istat .ne. 0) then
170 : print *,"tridiag_band_&
171 : &MATH_DATATYPE&
172 0 : &: error when allocating global_id_tmp "//errorMessage
173 0 : stop 1
174 : endif
175 : #endif
176 :
177 : #ifdef WITH_MPI
178 31632 : if (wantDebug) call obj%timer%start("mpi_communication")
179 : #ifndef WITH_OPENMP
180 15816 : call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
181 : #else
182 15816 : global_id_tmp(:,:) = global_id(:,:)
183 15816 : call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
184 15816 : deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
185 15816 : if (istat .ne. 0) then
186 : print *,"tridiag_band_&
187 : &MATH_DATATYPE&
188 0 : &: error when deallocating global_id_tmp "//errorMessage
189 0 : stop 1
190 : endif
191 : #endif /* WITH_OPENMP */
192 31632 : if (wantDebug) call obj%timer%stop("mpi_communication")
193 : #endif /* WITH_MPI */
194 :
195 : ! Total number of blocks in the band:
196 :
197 47448 : nblocks_total = (na-1)/nb + 1
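 : ! Worked example (illustrative numbers only): for na = 1000 and nb = 32 the
 : ! integer division gives nblocks_total = 999/32 + 1 = 31 + 1 = 32, i.e. the
 : ! band is covered by 32 blocks of nb columns, the last one only partly used.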
198 :
199 : ! Set work distribution
200 :
201 47448 : allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
202 47448 : if (istat .ne. 0) then
203 : print *,"tridiag_band_&
204 : &MATH_DATATYPE&
205 0 : &: error when allocating block_limits"//errorMessage
206 0 : stop 1
207 : endif
208 :
209 47448 : call divide_band(obj,nblocks_total, n_pes, block_limits)
210 :
211 : ! nblocks: the number of blocks for my task
212 47448 : nblocks = block_limits(my_pe+1) - block_limits(my_pe)
213 :
214 : ! allocate the part of the band matrix which is needed by this PE
215 : ! The size is 1 block larger than needed to avoid extensive shifts
216 47448 : allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
217 47448 : if (istat .ne. 0) then
218 : print *,"tridiag_band_&
219 : &MATH_DATATYPE&
220 0 : &: error when allocating ab"//errorMessage
221 0 : stop 1
222 : endif
223 :
224 47448 : ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety
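 : ! Assumed storage convention (consistent with the accesses further below):
 : ! column j of ab holds band column j+n_off of the matrix in lower band
 : ! storage, i.e. ab(1,j) is the diagonal element, ab(2,j) the subdiagonal
 : ! element and ab(i,j) the element i-1 rows below the diagonal; the first
 : ! dimension of 2*nb leaves room for the fill-in (bulge) created during the
 : ! band reduction sweeps.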
225 :
226 : ! n_off: Offset of ab within band
227 47448 : n_off = block_limits(my_pe)*nb
228 :
229 : ! Redistribute band in a to ab
230 : call redist_band_&
231 : &MATH_DATATYPE&
232 : &_&
233 : &PRECISION&
234 47448 : &(obj,aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
235 :
236 : ! Calculate the workload for each sweep in the back transformation
237 : ! and the space requirements to hold the HH vectors
238 :
239 47448 : allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
240 47448 : if (istat .ne. 0) then
241 : print *,"tridiag_band_&
242 : &MATH_DATATYPE&
243 0 : &: error when allocating limits"//errorMessage
244 0 : stop 1
245 : endif
246 :
247 47448 : call determine_workload(obj,na, nb, np_rows, limits)
248 47448 : max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
249 :
250 47448 : num_hh_vecs = 0
251 47448 : num_chunks = 0
252 47448 : nx = na
253 278784 : do n = 1, nblocks_total
254 231336 : call determine_workload(obj, nx, nb, np_rows, limits)
255 231336 : local_size = limits(my_prow+1) - limits(my_prow)
256 : ! add to number of Householder vectors
257 : ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor sent below!
258 231336 : if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
259 215520 : num_hh_vecs = num_hh_vecs + local_size
260 215520 : num_chunks = num_chunks+1
261 : endif
262 231336 : nx = nx - nb
263 : enddo
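 : ! Distribution note (this follows from the mod test above): the sweeps'
 : ! blocks are assigned round-robin over the process columns, e.g. with
 : ! np_cols = 2 the HH vectors of blocks 1,3,5,... are received on pcol 0 and
 : ! those of blocks 2,4,6,... on pcol 1; num_hh_vecs and num_chunks size the
 : ! receiving side accordingly.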
264 :
265 : ! Allocate space for HH vectors
266 :
267 47448 : allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
268 47448 : if (istat .ne. 0) then
269 : #if REALCASE == 1
270 0 : print *,"tridiag_band_real: error when allocating hh_trans"//errorMessage
271 : #endif
272 : #if COMPLEXCASE == 1
273 0 : print *,"tridiag_band_complex: error when allocating hh_trans "//errorMessage
274 : #endif
275 0 : stop 1
276 : endif
277 :
278 : ! Allocate and init MPI requests
279 :
280 47448 : allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
281 47448 : if (istat .ne. 0) then
282 : print *,"tridiag_band_&
283 : &MATH_DATATYPE&
284 0 : &: error when allocating ireq_hhr"//errorMessage
285 0 : stop 1
286 : endif
287 47448 : allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests
288 47448 : if (istat .ne. 0) then
289 : print *,"tridiag_band_&
290 : &MATH_DATATYPE&
291 0 : &: error when allocating ireq_hhs"//errorMessage
292 0 : stop 1
293 : endif
294 :
295 47448 : num_hh_vecs = 0
296 47448 : num_chunks = 0
297 47448 : nx = na
298 47448 : nt = 0
299 278784 : do n = 1, nblocks_total
300 231336 : call determine_workload(obj,nx, nb, np_rows, limits)
301 231336 : local_size = limits(my_prow+1) - limits(my_prow)
302 231336 : if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
303 215520 : num_chunks = num_chunks+1
304 : #ifdef WITH_MPI
305 138408 : if (wantDebug) call obj%timer%start("mpi_communication")
306 : call mpi_irecv(hh_trans(1,num_hh_vecs+1), &
307 : nb*local_size, &
308 : #if REALCASE == 1
309 : MPI_REAL_PRECISION, &
310 : #endif
311 : #if COMPLEXCASE == 1
312 : MPI_COMPLEX_EXPLICIT_PRECISION, &
313 : #endif
314 138408 : nt, 10+n-block_limits(nt), communicator, ireq_hhr(num_chunks), mpierr)
315 138408 : if (wantDebug) call obj%timer%stop("mpi_communication")
316 :
317 : #else /* WITH_MPI */
318 : ! careful: non-blocking recv data copy must be done at wait or send
319 : ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
320 :
321 : #endif /* WITH_MPI */
322 215520 : num_hh_vecs = num_hh_vecs + local_size
323 : endif
324 231336 : nx = nx - nb
325 231336 : if (n == block_limits(nt+1)) then
326 79080 : nt = nt + 1
327 : endif
328 : enddo
329 : #ifdef WITH_MPI
330 31632 : ireq_hhs(:) = MPI_REQUEST_NULL
331 : #endif
332 : ! Buffers for gathering/sending the HH vectors
333 :
334 47448 : allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
335 47448 : if (istat .ne. 0) then
336 : print *,"tridiag_band_&
337 : &MATH_DATATYPE&
338 0 : &: error when allocating hh_gath"//errorMessage
339 0 : stop 1
340 : endif
341 :
342 47448 : allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
343 47448 : if (istat .ne. 0) then
344 : print *,"tridiag_band_&
345 : &MATH_DATATYPE&
346 0 : &: error when allocating hh_send"//errorMessage
347 0 : stop 1
348 : endif
349 :
350 47448 : hh_gath(:,:,:) = 0.0_rck
351 47448 : hh_send(:,:,:) = 0.0_rck
352 :
353 : ! Some counters
354 :
355 47448 : allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
356 47448 : if (istat .ne. 0) then
357 : print *,"tridiag_band_&
358 : &MATH_DATATYPE&
359 0 : &: error when allocating hh_cnt"//errorMessage
360 0 : stop 1
361 : endif
362 :
363 47448 : allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
364 47448 : if (istat .ne. 0) then
365 : print *,"tridiag_band_&
366 : &MATH_DATATYPE&
367 0 : &: error when allocating hh_dst"//errorMessage
368 0 : stop 1
369 : endif
370 :
371 47448 : hh_cnt(:) = 1 ! The first transformation Vector is always 0 and not calculated at all
372 47448 : hh_dst(:) = 0 ! PE number for receive
373 : #ifdef WITH_MPI
374 31632 : ireq_ab = MPI_REQUEST_NULL
375 31632 : ireq_hv = MPI_REQUEST_NULL
376 : #endif
377 : ! Limits for sending
378 :
379 47448 : allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
380 47448 : if (istat .ne. 0) then
381 : print *,"tridiag_band_&
382 : &MATH_DATATYPE&
383 0 : &: error when allocating snd_limits"//errorMessage
384 0 : stop 1
385 : endif
386 201672 : do iblk=1,nblocks
387 154224 : call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
388 : enddo
389 :
390 : #ifdef WITH_OPENMP
391 : ! OpenMP work distribution:
392 :
393 23724 : max_threads = 1
394 : #if REALCASE == 1
395 14292 : max_threads = omp_get_max_threads()
396 : #endif
397 : #if COMPLEXCASE == 1
398 9432 : !$ max_threads = omp_get_max_threads()
399 : #endif
400 : ! For OpenMP we need at least 2 blocks for every thread
401 23724 : max_threads = MIN(max_threads, nblocks/2)
402 23724 : if (max_threads==0) max_threads = 1
403 :
404 23724 : allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
405 23724 : if (istat .ne. 0) then
406 : print *,"tridiag_band_&
407 : &MATH_DATATYPE&
408 0 : &: error when allocating omp_block_limits"//errorMessage
409 0 : stop 1
410 : endif
411 :
412 : ! Get the OpenMP block limits
413 23724 : call divide_band(obj,nblocks, max_threads, omp_block_limits)
414 :
415 23724 : allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
416 23724 : if (istat .ne. 0) then
417 : print *,"tridiag_band_&
418 : &MATH_DATATYPE&
419 0 : &: error when allocating hv_t, tau_t"//errorMessage
420 0 : stop 1
421 : endif
422 :
423 23724 : hv_t = 0.0_rck
424 23724 : tau_t = 0.0_rck
425 : #endif /* WITH_OPENMP */
426 :
427 : ! ---------------------------------------------------------------------------
428 : ! Start of calculations
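 : !
 : ! High-level picture (summary of the loop below, no change to the code):
 : ! each sweep istep eliminates one column of the band with a Householder
 : ! reflector; applying the reflector creates a bulge that is chased down the
 : ! band block by block (loop over iblk). The reflector leaving the last local
 : ! block is sent to the next PE, which continues the chase, and all reflectors
 : ! are gathered in hh_trans for the later back transformation.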
429 :
430 47448 : na_s = block_limits(my_pe)*nb + 1
431 :
432 47448 : if (my_pe>0 .and. na_s<=na) then
433 : ! send first column to previous PE
434 : ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
435 15816 : ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
436 : #ifdef WITH_MPI
437 15816 : if (wantDebug) call obj%timer%start("mpi_communication")
438 : call mpi_isend(ab_s, nb+1, &
439 : #if REALCASE == 1
440 : MPI_REAL_PRECISION, &
441 : #endif
442 : #if COMPLEXCASE == 1
443 : MPI_COMPLEX_EXPLICIT_PRECISION, &
444 : #endif
445 15816 : my_pe-1, 1, communicator, ireq_ab, mpierr)
446 15816 : if (wantDebug) call obj%timer%stop("mpi_communication")
447 : #endif /* WITH_MPI */
448 : endif
449 :
450 : #ifndef WITH_MPI
451 15816 : startAddr = ubound(hh_trans,dim=2)
452 : #endif /* WITH_MPI */
453 :
454 : #ifdef WITH_OPENMP
455 3857352 : do istep=1,na-1-block_limits(my_pe)*nb
456 : #else
457 4537800 : do istep=1,na-1
458 : #endif
459 :
460 8347704 : if (my_pe==0) then
461 6018768 : n = MIN(na-na_s,nb) ! number of rows to be reduced
462 6018768 : hv(:) = 0.0_rck
463 6018768 : tau = 0.0_rck
464 :
465 : ! Transform first column of remaining matrix
466 : #if REALCASE == 1
467 : ! The last step (istep=na-1) is only needed for sending the last HH vectors.
468 : ! We don't want the sign of the last element flipped (analogous to the other sweeps)
469 : #endif
470 : #if COMPLEXCASE == 1
471 : ! Opposed to the real case, the last step (istep=na-1) is needed here for making
472 : ! the last subdiagonal element a real number
473 : #endif
474 :
475 : #if REALCASE == 1
476 3492144 : if (istep < na-1) then
477 : ! Transform first column of remaining matrix
478 3473088 : vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
479 : #endif
480 : #if COMPLEXCASE == 1
481 : #ifdef DOUBLE_PRECISION_COMPLEX
482 1748352 : vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
483 : #else
484 778272 : vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
485 : #endif
486 2526624 : if (n<2) vnorm2 = 0. ! Safety only
487 : #endif /* COMPLEXCASE */
488 :
489 : #if REALCASE == 1
490 : call hh_transform_real_&
491 : #endif
492 : #if COMPLEXCASE == 1
493 : call hh_transform_complex_&
494 : #endif
495 : &PRECISION &
496 5999712 : (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)
497 :
498 5999712 : hv(1) = 1.0_rck
499 5999712 : hv(2:n) = ab(3:n+1,na_s-n_off)*hf
500 : #if REALCASE == 1
501 : endif
502 : #endif
503 :
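 : ! Reminder of the underlying math (standard Householder reflection, stated
 : ! here only as a comment): hh_transform returns tau and the scaling hf such
 : ! that the reflector H = I - tau * hv * hv^H (hv^T in the real case), with
 : ! hv(1) = 1 and hv(2:n) = ab(3:n+1,.)*hf, annihilates the entries below the
 : ! subdiagonal of the current column; ab(2,.) is overwritten with the new
 : ! subdiagonal value.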
504 : #if REALCASE == 1
505 3492144 : d(istep) = ab(1,na_s-n_off)
506 3492144 : e(istep) = ab(2,na_s-n_off)
507 : #endif
508 : #if COMPLEXCASE == 1
509 2526624 : d(istep) = real(ab(1,na_s-n_off), kind=rk)
510 2526624 : e(istep) = real(ab(2,na_s-n_off), kind=rk)
511 : #endif
512 :
513 6018768 : if (istep == na-1) then
514 : #if REALCASE == 1
515 19056 : d(na) = ab(1,na_s+1-n_off)
516 : #endif
517 : #if COMPLEXCASE == 1
518 12576 : d(na) = real(ab(1,na_s+1-n_off),kind=rk)
519 : #endif
520 31632 : e(na) = 0.0_rck
521 : endif
522 : else
523 2328936 : if (na>na_s) then
524 : ! Receive Householder Vector from previous task, from PE owning subdiagonal
525 :
526 : #ifdef WITH_OPENMP
527 :
528 : #ifdef WITH_MPI
529 824244 : if (wantDebug) call obj%timer%start("mpi_communication")
530 : call mpi_recv(hv, nb, &
531 : #if REALCASE == 1
532 : MPI_REAL_PRECISION, &
533 : #endif
534 : #if COMPLEXCASE == 1
535 : MPI_COMPLEX_EXPLICIT_PRECISION, &
536 : #endif
537 824244 : my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
538 824244 : if (wantDebug) call obj%timer%stop("mpi_communication")
539 :
540 : #else /* WITH_MPI */
541 :
542 0 : hv(1:nb) = hv_s(1:nb)
543 :
544 : #endif /* WITH_MPI */
545 :
546 : #else /* WITH_OPENMP */
547 :
548 : #ifdef WITH_MPI
549 824244 : if (wantDebug) call obj%timer%start("mpi_communication")
550 :
551 : call mpi_recv(hv, nb, &
552 : #if REALCASE == 1
553 : MPI_REAL_PRECISION, &
554 : #endif
555 : #if COMPLEXCASE == 1
556 : MPI_COMPLEX_EXPLICIT_PRECISION, &
557 : #endif
558 824244 : my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
559 824244 : if (wantDebug) call obj%timer%stop("mpi_communication")
560 :
561 : #else /* WITH_MPI */
562 0 : hv(1:nb) = hv_s(1:nb)
563 : #endif /* WITH_MPI */
564 :
565 : #endif /* WITH_OPENMP */
566 1648488 : tau = hv(1)
567 1648488 : hv(1) = 1.0_rck
568 : endif
569 : endif
570 :
571 8347704 : na_s = na_s+1
572 8347704 : if (na_s-n_off > nb) then
573 168300 : ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
574 168300 : ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rck
575 168300 : n_off = n_off + nb
576 : endif
577 :
578 : #ifdef WITH_OPENMP
579 3833628 : if (max_threads > 1) then
580 :
581 : ! Codepath for OpenMP
582 :
583 : ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
584 : ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
585 : ! This simulates the behaviour of the MPI tasks which also work after each other.
586 : ! The code would be considerably easier, if the MPI communication would be made within
587 : ! the parallel region - this is avoided here since this would require
588 : ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
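 : !
 : ! Pipeline sketch (illustrative): thread t only starts working in sweep
 : ! istep = t, so e.g. with 4 threads the occupancy over the first sweeps is
 : !
 : !   istep   : 1 2 3 4 5 ...
 : !   thread 1: x x x x x
 : !   thread 2: . x x x x
 : !   thread 3: . . x x x
 : !   thread 4: . . . x x
 : !
 : ! which matches the "istep < my_thread" exit test further below.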
589 :
590 0 : hv_t(:,1) = hv
591 0 : tau_t(1) = tau
592 :
593 0 : do iter = 1, 2
594 :
595 : ! iter=1 : work on first block
596 : ! iter=2 : work on remaining blocks
597 : ! This is done in 2 iterations so that we have a barrier in between:
598 : ! After the first iteration, it is guaranteed that the last row of the last block
599 : ! is completed by the next thread.
600 : ! After the first iteration it is also the place to exchange the last row
601 : ! with MPI calls
602 0 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
603 :
604 : !$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
605 0 : !$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
606 : do my_thread = 1, max_threads
607 :
608 0 : if (iter == 1) then
609 0 : my_block_s = omp_block_limits(my_thread-1) + 1
610 0 : my_block_e = my_block_s
611 : else
612 0 : my_block_s = omp_block_limits(my_thread-1) + 2
613 0 : my_block_e = omp_block_limits(my_thread)
614 : endif
615 :
616 0 : do iblk = my_block_s, my_block_e
617 :
618 0 : ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
619 0 : ne = ns+nb-1 ! last column in block
620 :
621 0 : if (istep<my_thread .or. ns+n_off>na) exit
622 :
623 0 : hv = hv_t(:,my_thread)
624 0 : tau = tau_t(my_thread)
625 :
626 : ! Store Householder Vector for back transformation
627 :
628 0 : hh_cnt(iblk) = hh_cnt(iblk) + 1
629 :
630 0 : hh_gath(1 ,hh_cnt(iblk),iblk) = tau
631 0 : hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
632 :
633 0 : nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
634 0 : nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
635 : ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
636 :
637 : ! Transform diagonal block
638 0 : if (wantDebug) call obj%timer%start("blas")
639 : #if REALCASE == 1
640 0 : call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
641 : #endif
642 : #if COMPLEXCASE == 1
643 0 : call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
644 : #endif
645 0 : if (wantDebug) call obj%timer%stop("blas")
646 : #if REALCASE == 1
647 0 : x = dot_product(hv(1:nc),hd(1:nc))*tau
648 : #endif
649 : #if COMPLEXCASE == 1
650 0 : x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
651 : #endif
652 0 : hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc)
653 0 : if (wantDebug) call obj%timer%start("blas")
654 : #if REALCASE == 1
655 0 : call PRECISION_SYR2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
656 : #endif
657 : #if COMPLEXCASE == 1
658 0 : call PRECISION_HER2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
659 : #endif
660 0 : if (wantDebug) call obj%timer%stop("blas")
661 0 : hv_t(:,my_thread) = 0.0_rck
662 0 : tau_t(my_thread) = 0.0_rck
663 0 : if (nr<=0) cycle ! No subdiagonal block present any more
664 :
665 : ! Transform subdiagonal block
666 0 : if (wantDebug) call obj%timer%start("blas")
667 0 : call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
668 0 : if (wantDebug) call obj%timer%stop("blas")
669 0 : if (nr>1) then
670 :
671 : ! complete (old) Householder transformation for first column
672 :
673 0 : ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
674 :
675 : ! calculate new Householder transformation for first column
676 : ! (stored in hv_t(:,my_thread) and tau_t(my_thread))
677 :
678 : #if REALCASE == 1
679 0 : vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
680 : #endif
681 : #if COMPLEXCASE == 1
682 : #ifdef DOUBLE_PRECISION_COMPLEX
683 0 : vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
684 : #else
685 0 : vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
686 : #endif
687 : #endif /* COMPLEXCASE */
688 :
689 : #if REALCASE == 1
690 : call hh_transform_real_&
691 : #endif
692 : #if COMPLEXCASE == 1
693 : call hh_transform_complex_&
694 : #endif
695 : &PRECISION &
696 0 : (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)
697 :
698 0 : hv_t(1 ,my_thread) = 1.0_rck
699 0 : hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
700 0 : ab(nb+2:,ns) = 0.0_rck
701 : ! update subdiagonal block for old and new Householder transformation
702 : ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
703 0 : if (wantDebug) call obj%timer%start("blas")
704 : call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
705 0 : nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, ZERO, h(2), 1)
706 0 : if (wantDebug) call obj%timer%stop("blas")
707 :
708 0 : x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
709 0 : h(2:nb) = h(2:nb) - x*hv(2:nb)
710 : ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
711 0 : do i=2,nb
712 : ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)* &
713 : #if REALCASE == 1
714 0 : h(i) - hs(1:nr)*hv(i)
715 : #endif
716 : #if COMPLEXCASE == 1
717 0 : conjg(h(i)) - hs(1:nr)*conjg(hv(i))
718 : #endif
719 : enddo
720 :
721 : else
722 :
723 : ! No new Householder transformation for nr=1, just complete the old one
724 0 : ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
725 0 : do i=2,nb
726 : #if REALCASE == 1
727 0 : ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
728 : #endif
729 : #if COMPLEXCASE == 1
730 0 : ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
731 : #endif
732 : enddo
733 : ! For safety: there is one remaining dummy transformation (but tau is 0 anyway)
734 0 : hv_t(1,my_thread) = 1.0_rck
735 : endif
736 :
737 : enddo
738 :
739 : enddo ! my_thread
740 : !$omp end parallel do
741 :
742 0 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
743 :
744 0 : if (iter==1) then
745 : ! We are at the end of the first block
746 :
747 : ! Send our first column to previous PE
748 0 : if (my_pe>0 .and. na_s <= na) then
749 : #ifdef WITH_MPI
750 0 : if (wantDebug) call obj%timer%start("mpi_communication")
751 0 : call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
752 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
753 :
754 : #endif
755 0 : ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
756 : #ifdef WITH_MPI
757 0 : if (wantDebug) call obj%timer%start("mpi_communication")
758 : call mpi_isend(ab_s, nb+1, &
759 : #if REALCASE == 1
760 : MPI_REAL_PRECISION, &
761 : #endif
762 : #if COMPLEXCASE == 1
763 : MPI_COMPLEX_EXPLICIT_PRECISION, &
764 : #endif
765 0 : my_pe-1, 1, communicator, ireq_ab, mpierr)
766 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
767 :
768 : #endif /* WITH_MPI */
769 : endif
770 :
771 : ! Request last column from next PE
772 0 : ne = na_s + nblocks*nb - (max_threads-1) - 1
773 : #ifdef WITH_MPI
774 0 : if (wantDebug) call obj%timer%start("mpi_communication")
775 :
776 0 : if (istep>=max_threads .and. ne <= na) then
777 : call mpi_recv(ab(1,ne-n_off), nb+1, &
778 : #if REALCASE == 1
779 : MPI_REAL_PRECISION, &
780 : #endif
781 : #if COMPLEXCASE == 1
782 : MPI_COMPLEX_EXPLICIT_PRECISION, &
783 : #endif
784 0 : my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
785 : endif
786 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
787 : #else /* WITH_MPI */
788 0 : if (istep>=max_threads .and. ne <= na) then
789 0 : ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
790 : endif
791 : #endif /* WITH_MPI */
792 : else
793 : ! We are at the end of all blocks
794 :
795 : ! Send last HH Vector and TAU to next PE if it has been calculated above
796 0 : ne = na_s + nblocks*nb - (max_threads-1) - 1
797 0 : if (istep>=max_threads .and. ne < na) then
798 : #ifdef WITH_MPI
799 0 : if (wantDebug) call obj%timer%start("mpi_communication")
800 0 : call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
801 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
802 : #endif
803 0 : hv_s(1) = tau_t(max_threads)
804 0 : hv_s(2:) = hv_t(2:,max_threads)
805 :
806 : #ifdef WITH_MPI
807 0 : if (wantDebug) call obj%timer%start("mpi_communication")
808 : call mpi_isend(hv_s, nb, &
809 : #if REALCASE == 1
810 : MPI_REAL_PRECISION, &
811 : #endif
812 : #if COMPLEXCASE == 1
813 : MPI_COMPLEX_EXPLICIT_PRECISION, &
814 : #endif
815 0 : my_pe+1, 2, communicator, ireq_hv, mpierr)
816 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
817 :
818 : #endif /* WITH_MPI */
819 : endif
820 :
821 : ! "Send" HH Vector and TAU to next OpenMP thread
822 0 : do my_thread = max_threads, 2, -1
823 0 : hv_t(:,my_thread) = hv_t(:,my_thread-1)
824 0 : tau_t(my_thread) = tau_t(my_thread-1)
825 : enddo
826 :
827 : endif
828 : enddo ! iter
829 :
830 : else
831 :
832 : ! Codepath for 1 thread without OpenMP
833 :
834 : ! The following code is structured in a way to keep waiting times for
835 : ! other PEs at a minimum, especially if there is only one block.
836 : ! For this reason, it requests the last column as late as possible
837 : ! and sends the Householder Vector and the first column as early
838 : ! as possible.
839 :
840 : #endif /* WITH_OPENMP */
841 :
842 37016424 : do iblk=1,nblocks
843 33908016 : ns = na_s + (iblk-1)*nb - n_off ! first column in block
844 33908016 : ne = ns+nb-1 ! last column in block
845 :
846 33908016 : if (ns+n_off>na) exit
847 :
848 : ! Store Householder Vector for back transformation
849 :
850 28668720 : hh_cnt(iblk) = hh_cnt(iblk) + 1
851 :
852 28668720 : hh_gath(1 ,hh_cnt(iblk),iblk) = tau
853 28668720 : hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
854 :
855 : #ifndef WITH_OPENMP
856 14334360 : if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
857 : ! Wait for last transfer to finish
858 : #ifdef WITH_MPI
859 69204 : if (wantDebug) call obj%timer%start("mpi_communication")
860 :
861 69204 : call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
862 69204 : if (wantDebug) call obj%timer%stop("mpi_communication")
863 : #endif
864 : ! Copy vectors into send buffer
865 107760 : hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
866 : ! Send to destination
867 :
868 : #ifdef WITH_MPI
869 69204 : if (wantDebug) call obj%timer%start("mpi_communication")
870 : call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), &
871 : #if REALCASE == 1
872 : MPI_REAL_PRECISION, &
873 : #endif
874 : #if COMPLEXCASE == 1
875 : MPI_COMPLEX_EXPLICIT_PRECISION, &
876 : #endif
877 : global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
878 69204 : 10+iblk, communicator, ireq_hhs(iblk), mpierr)
879 69204 : if (wantDebug) call obj%timer%stop("mpi_communication")
880 : #else /* WITH_MPI */
881 : ! do the postponed irecv here
882 38556 : startAddr = startAddr - hh_cnt(iblk)
883 38556 : hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
884 : #endif /* WITH_MPI */
885 :
886 : ! Reset counter and increase destination row
887 107760 : hh_cnt(iblk) = 0
888 107760 : hh_dst(iblk) = hh_dst(iblk)+1
889 : endif
890 :
891 : ! The following code is structured in a way to keep waiting times for
892 : ! other PEs at a minimum, especially if there is only one block.
893 : ! For this reason, it requests the last column as late as possible
894 : ! and sends the Householder Vector and the first column as early
895 : ! as possible.
896 : #endif /* WITH_OPENMP */
897 28668720 : nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
898 28668720 : nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
899 : ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
900 :
901 : ! Multiply diagonal block and subdiagonal block with Householder Vector
902 :
903 28668720 : if (iblk==nblocks .and. nc==nb) then
904 :
905 : ! We need the last column from the next PE.
906 : ! First do the matrix multiplications without last column ...
907 :
908 : ! Diagonal block, the contribution of the last element is added below!
909 1664304 : ab(1,ne) = 0.0_rck
910 1664304 : if (wantDebug) call obj%timer%start("blas")
911 :
912 : #if REALCASE == 1
913 971472 : call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
914 : #endif
915 : #if COMPLEXCASE == 1
916 692832 : call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd,1)
917 : #endif
918 : ! Subdiagonal block
919 1664304 : if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
920 1664304 : if (wantDebug) call obj%timer%stop("blas")
921 :
922 : ! ... then request last column ...
923 : #ifdef WITH_MPI
924 1664304 : if (wantDebug) call obj%timer%start("mpi_communication")
925 : #ifdef WITH_OPENMP
926 : call mpi_recv(ab(1,ne), nb+1, &
927 : #if REALCASE == 1
928 : MPI_REAL_PRECISION, &
929 : #endif
930 : #if COMPLEXCASE == 1
931 : MPI_COMPLEX_EXPLICIT_PRECISION, &
932 : #endif
933 832152 : my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
934 : #else /* WITH_OPENMP */
935 : call mpi_recv(ab(1,ne), nb+1, &
936 : #if REALCASE == 1
937 : MPI_REAL_PRECISION, &
938 : #endif
939 : #if COMPLEXCASE == 1
940 : MPI_COMPLEX_EXPLICIT_PRECISION, &
941 : #endif
942 832152 : my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
943 : #endif /* WITH_OPENMP */
944 1664304 : if (wantDebug) call obj%timer%stop("mpi_communication")
945 : #else /* WITH_MPI */
946 :
947 0 : ab(1:nb+1,ne) = ab_s(1:nb+1)
948 :
949 : #endif /* WITH_MPI */
950 :
951 : ! ... and complete the result
952 1664304 : hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
953 1664304 : hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
954 :
955 : else
956 :
957 : ! Normal matrix multiply
958 27004416 : if (wantDebug) call obj%timer%start("blas")
959 : #if REALCASE == 1
960 10217760 : call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
961 : #endif
962 : #if COMPLEXCASE == 1
963 16786656 : call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
964 : #endif
965 27004416 : if (nr>0) call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
966 27004416 : if (wantDebug) call obj%timer%stop("blas")
967 : endif
968 :
969 : ! Calculate first column of subdiagonal block and calculate new
970 : ! Householder transformation for this column
971 28668720 : hv_new(:) = 0.0_rck ! Needed, last rows must be 0 for nr < nb
972 28668720 : tau_new = 0.0_rck
973 28668720 : if (nr>0) then
974 :
975 : ! complete (old) Householder transformation for first column
976 :
977 22649952 : ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
978 :
979 : ! calculate new Householder transformation ...
980 22649952 : if (nr>1) then
981 : #if REALCASE == 1
982 7646976 : vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
983 : #endif
984 : #if COMPLEXCASE == 1
985 : #ifdef DOUBLE_PRECISION_COMPLEX
986 10884480 : vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2)
987 : #else
988 3995904 : vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2)
989 : #endif
990 : #endif /* COMPLEXCASE */
991 :
992 : #if REALCASE == 1
993 : call hh_transform_real_&
994 : #endif
995 : #if COMPLEXCASE == 1
996 : call hh_transform_complex_&
997 : #endif
998 : &PRECISION &
999 22527360 : (obj, ab(nb+1,ns), vnorm2, hf, tau_new, wantDebug)
1000 22527360 : hv_new(1) = 1.0_rck
1001 22527360 : hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
1002 22527360 : ab(nb+2:,ns) = 0.0_rck
1003 : endif ! nr > 1
1004 :
1005 : ! ... and send it away immediately if this is the last block
1006 :
1007 22649952 : if (iblk==nblocks) then
1008 : #ifdef WITH_MPI
1009 1648488 : if (wantDebug) call obj%timer%start("mpi_communication")
1010 : #ifdef WITH_OPENMP
1011 824244 : call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
1012 : #else
1013 824244 : call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
1014 : #endif
1015 1648488 : if (wantDebug) call obj%timer%stop("mpi_communication")
1016 :
1017 : #endif /* WITH_MPI */
1018 1648488 : hv_s(1) = tau_new
1019 1648488 : hv_s(2:) = hv_new(2:)
1020 :
1021 : #ifdef WITH_MPI
1022 1648488 : if (wantDebug) call obj%timer%start("mpi_communication")
1023 : call mpi_isend(hv_s, nb, &
1024 : #if REALCASE == 1
1025 : MPI_REAL_PRECISION, &
1026 : #endif
1027 : #if COMPLEXCASE == 1
1028 : MPI_COMPLEX_EXPLICIT_PRECISION, &
1029 : #endif
1030 1648488 : my_pe+1, 2, communicator, ireq_hv, mpierr)
1031 1648488 : if (wantDebug) call obj%timer%stop("mpi_communication")
1032 :
1033 : #endif /* WITH_MPI */
1034 : endif
1035 :
1036 : endif
1037 :
1038 : ! Transform diagonal block
1039 : #if REALCASE == 1
1040 11189232 : x = dot_product(hv(1:nc),hd(1:nc))*tau
1041 : #endif
1042 : #if COMPLEXCASE == 1
1043 17479488 : x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
1044 : #endif
1045 28668720 : hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc)
1046 28668720 : if (my_pe>0 .and. iblk==1) then
1047 :
1048 : ! The first column of the diagonal block has to be sent to the previous PE
1049 : ! Calculate first column only ...
1050 : #if REALCASE == 1
1051 961944 : ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1)
1052 : #endif
1053 : #if COMPLEXCASE == 1
1054 686544 : ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1))
1055 : #endif
1056 : ! ... send it away ...
1057 : #ifdef WITH_MPI
1058 1648488 : if (wantDebug) call obj%timer%start("mpi_communication")
1059 :
1060 : #ifdef WITH_OPENMP
1061 824244 : call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
1062 : #else
1063 824244 : call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
1064 : #endif
1065 1648488 : if (wantDebug) call obj%timer%stop("mpi_communication")
1066 :
1067 : #endif /* WITH_MPI */
1068 1648488 : ab_s(1:nb+1) = ab(1:nb+1,ns)
1069 :
1070 : #ifdef WITH_MPI
1071 1648488 : if (wantDebug) call obj%timer%start("mpi_communication")
1072 :
1073 : call mpi_isend(ab_s, nb+1, &
1074 : #if REALCASE == 1
1075 : MPI_REAL_PRECISION, &
1076 : #endif
1077 : #if COMPLEXCASE == 1
1078 : MPI_COMPLEX_EXPLICIT_PRECISION, &
1079 : #endif
1080 1648488 : my_pe-1, 1, communicator, ireq_ab, mpierr)
1081 1648488 : if (wantDebug) call obj%timer%stop("mpi_communication")
1082 :
1083 : #endif /* WITH_MPI */
1084 : ! ... and calculate remaining columns with rank-2 update
1085 1648488 : if (wantDebug) call obj%timer%start("blas")
1086 : #if REALCASE == 1
1087 961944 : if (nc>1) call PRECISION_SYR2('L', nc-1, -ONE, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
1088 : #endif
1089 : #if COMPLEXCASE == 1
1090 686544 : if (nc>1) call PRECISION_HER2('L', nc-1, -ONE, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
1091 : #endif
1092 1648488 : if (wantDebug) call obj%timer%stop("blas")
1093 :
1094 : else
1095 : ! No need to send, just a rank-2 update
1096 27020232 : if (wantDebug) call obj%timer%start("blas")
1097 : #if REALCASE == 1
1098 10227288 : call PRECISION_SYR2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
1099 : #endif
1100 : #if COMPLEXCASE == 1
1101 16792944 : call PRECISION_HER2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
1102 : #endif
1103 27020232 : if (wantDebug) call obj%timer%stop("blas")
1104 :
1105 : endif
1106 :
1107 : ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb
1108 :
1109 28668720 : if (nr>0) then
1110 22649952 : if (nr>1) then
1111 22527360 : if (wantDebug) call obj%timer%start("blas")
1112 : call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, &
1113 22527360 : hv_new, 1, ZERO, h(2), 1)
1114 22527360 : if (wantDebug) call obj%timer%stop("blas")
1115 :
1116 22527360 : x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
1117 22527360 : h(2:nb) = h(2:nb) - x*hv(2:nb)
1118 : ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update
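 : ! so the update is written out explicitly: for each column i the loop below
 : ! computes ab(.,i) := ab(.,i) - hv_new*h(i) - hs*hv(i) (with conjugated
 : ! factors in the complex case), i.e. the combined nonsymmetric rank-2 update
 : ! that applies the old reflector from the right and the new one from the left
 : ! to the subdiagonal block in a single pass.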
1119 954298368 : do i=2,nb
1120 : #if REALCASE == 1
1121 473299200 : ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i)
1122 : #endif
1123 : #if COMPLEXCASE == 1
1124 458471808 : ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
1125 : #endif
1126 : enddo
1127 : else
1128 : ! No double Householder transformation for nr=1, just complete the row
1129 5360640 : do i=2,nb
1130 : #if REALCASE == 1
1131 3032640 : ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
1132 : #endif
1133 : #if COMPLEXCASE == 1
1134 2205408 : ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
1135 : #endif
1136 : enddo
1137 : endif
1138 : endif
1139 :
1140 : ! Use new HH Vector for the next block
1141 28668720 : hv(:) = hv_new(:)
1142 28668720 : tau = tau_new
1143 :
1144 : enddo
1145 :
1146 : #ifdef WITH_OPENMP
1147 : endif
1148 : #endif
1149 :
1150 : #ifdef WITH_OPENMP
1151 18167988 : do iblk = 1, nblocks
1152 :
1153 16613784 : if (hh_dst(iblk) >= np_rows) exit
1154 14735256 : if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit
1155 :
1156 14334360 : if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
1157 : ! Wait for last transfer to finish
1158 : #ifdef WITH_MPI
1159 69204 : if (wantDebug) call obj%timer%start("mpi_communication")
1160 69204 : call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
1161 69204 : if (wantDebug) call obj%timer%stop("mpi_communication")
1162 : #endif
1163 : ! Copy vectors into send buffer
1164 107760 : hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
1165 : ! Send to destination
1166 :
1167 : #ifdef WITH_MPI
1168 69204 : if (wantDebug) call obj%timer%start("mpi_communication")
1169 : call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), &
1170 : #if REALCASE == 1
1171 : MPI_REAL_PRECISION, &
1172 : #endif
1173 : #if COMPLEXCASE == 1
1174 : MPI_COMPLEX_EXPLICIT_PRECISION, &
1175 : #endif
1176 : global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
1177 69204 : 10+iblk, communicator, ireq_hhs(iblk), mpierr)
1178 69204 : if (wantDebug) call obj%timer%stop("mpi_communication")
1179 : #else /* WITH_MPI */
1180 : ! do the postponed irecv here
1181 38556 : startAddr = startAddr - hh_cnt(iblk)
1182 38556 : hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
1183 : #endif /* WITH_MPI */
1184 :
1185 : ! Reset counter and increase destination row
1186 107760 : hh_cnt(iblk) = 0
1187 107760 : hh_dst(iblk) = hh_dst(iblk)+1
1188 : endif
1189 :
1190 : enddo
1191 : #endif /* WITH_OPENMP */
1192 : enddo ! istep
1193 :
1194 : ! Finish the last outstanding requests
1195 :
1196 : #ifdef WITH_OPENMP
1197 :
1198 : #ifdef WITH_MPI
1199 15816 : if (wantDebug) call obj%timer%start("mpi_communication")
1200 15816 : call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
1201 15816 : call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
1202 :
1203 : ! allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
1204 : ! if (istat .ne. 0) then
1205 : ! print *,"tridiag_band_real: error when allocating mpi_statuses"//errorMessage
1206 : ! stop 1
1207 : ! endif
1208 :
1209 15816 : call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
1210 15816 : call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
1211 : ! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
1212 : ! if (istat .ne. 0) then
1213 : ! print *,"tridiag_band_real: error when deallocating mpi_statuses"//errorMessage
1214 : ! stop 1
1215 : ! endif
1216 15816 : if (wantDebug) call obj%timer%stop("mpi_communication")
1217 : #endif /* WITH_MPI */
1218 :
1219 : #else /* WITH_OPENMP */
1220 :
1221 : #ifdef WITH_MPI
1222 15816 : if (wantDebug) call obj%timer%start("mpi_communication")
1223 15816 : call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
1224 15816 : call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
1225 :
1226 15816 : call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
1227 15816 : call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
1228 15816 : if (wantDebug) call obj%timer%stop("mpi_communication")
1229 : #endif
1230 :
1231 : #endif /* WITH_OPENMP */
1232 :
1233 : #ifdef WITH_MPI
1234 31632 : if (wantDebug) call obj%timer%start("mpi_communication")
1235 31632 : call mpi_barrier(communicator,mpierr)
1236 31632 : if (wantDebug) call obj%timer%stop("mpi_communication")
1237 : #endif
1238 47448 : deallocate(ab, stat=istat, errmsg=errorMessage)
1239 47448 : if (istat .ne. 0) then
1240 : print *,"tridiag_band_&
1241 : &MATH_DATATYPE&
1242 0 : &: error when deallocating ab"//errorMessage
1243 0 : stop 1
1244 : endif
1245 :
1246 47448 : deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
1247 47448 : if (istat .ne. 0) then
1248 : print *,"tridiag_band_&
1249 : &MATH_DATATYPE&
1250 0 : &: error when deallocating ireq_hhr, ireq_hhs"//errorMessage
1251 0 : stop 1
1252 : endif
1253 :
1254 47448 : deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
1255 47448 : if (istat .ne. 0) then
1256 : print *,"tridiag_band_&
1257 : &MATH_DATATYPE&
1258 0 : &: error when deallocating hh_cnt, hh_dst"//errorMessage
1259 0 : stop 1
1260 : endif
1261 :
1262 47448 : deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
1263 47448 : if (istat .ne. 0) then
1264 : print *,"tridiag_band_&
1265 : &MATH_DATATYPE&
1266 0 : &: error when deallocating hh_gath, hh_send"//errorMessage
1267 0 : stop 1
1268 : endif
1269 :
1270 47448 : deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
1271 47448 : if (istat .ne. 0) then
1272 : print *,"tridiag_band_&
1273 : &MATH_DATATYPE&
1274 0 : &: error when deallocating limits, snd_limits"//errorMessage
1275 0 : stop 1
1276 : endif
1277 :
1278 47448 : deallocate(block_limits, stat=istat, errmsg=errorMessage)
1279 47448 : if (istat .ne. 0) then
1280 : print *,"tridiag_band_&
1281 : &MATH_DATATYPE&
1282 0 : &: error when deallocating block_limits"//errorMessage
1283 0 : stop 1
1284 : endif
1285 :
1286 47448 : deallocate(global_id, stat=istat, errmsg=errorMessage)
1287 47448 : if (istat .ne. 0) then
1288 : print *,"tridiag_band_&
1289 : &MATH_DATATYPE&
1290 0 : &: error when deallocating global_id"//errorMessage
1291 0 : stop 1
1292 : endif
1293 :
1294 : call obj%timer%stop("tridiag_band_&
1295 : &MATH_DATATYPE&
1296 : &" // &
1297 : &PRECISION_SUFFIX&
1298 47448 : )
1299 :
1300 : ! Intel compiler bug makes these ifdefs necessary
1301 : #if REALCASE == 1
1302 : end subroutine tridiag_band_real_&
1303 : #endif
1304 : #if COMPLEXCASE == 1
1305 : end subroutine tridiag_band_complex_&
1306 : #endif
1307 189792 : &PRECISION
1308 :