Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : ! This particular source code file contains additions, changes and
20 : ! enhancements authored by Intel Corporation which is not part of
21 : ! the ELPA consortium.
22 : !
23 : ! More information can be found here:
24 : ! http://elpa.mpcdf.mpg.de/
25 : !
26 : ! ELPA is free software: you can redistribute it and/or modify
27 : ! it under the terms of the version 3 of the license of the
28 : ! GNU Lesser General Public License as published by the Free
29 : ! Software Foundation.
30 : !
31 : ! ELPA is distributed in the hope that it will be useful,
32 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
33 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 : ! GNU Lesser General Public License for more details.
35 : !
36 : ! You should have received a copy of the GNU Lesser General Public License
37 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38 : !
39 : ! ELPA reflects a substantial effort on the part of the original
40 : ! ELPA consortium, and we ask you to respect the spirit of the
41 : ! license that we chose: i.e., please contribute any changes you
42 : ! may have back to the original ELPA library distribution, and keep
43 : ! any derivatives of ELPA under the same license that we chose for
44 : ! the original distribution, the GNU Lesser General Public License.
45 : !
46 : !
47 : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48 : !
49 : ! Copyright of the original code rests with the authors inside the ELPA
50 : ! consortium. The copyright of any additional modifications shall rest
51 : ! with their original authors, but shall adhere to the licensing terms
52 : ! distributed along with the original code in the file "COPYING".
53 : #endif
54 :
55 : #include "../general/sanity.F90"
56 :
57 : subroutine merge_systems_&
58 39504 : &PRECISION &
59 79008 : (obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
60 79008 : l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success)
61 : use cuda_functions
62 : use iso_c_binding
63 : use precision
64 : use elpa_abstract_impl
65 : implicit none
66 : #include "../general/precision_kinds.F90"
67 : class(elpa_abstract_impl_t), intent(inout) :: obj
68 : integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
69 : mpi_comm_cols, npc_0, npc_n
70 : integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
71 : real(kind=REAL_DATATYPE), intent(inout) :: d(na), e
72 : #ifdef USE_ASSUMED_SIZE
73 : real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*)
74 : #else
75 : real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols)
76 : #endif
77 : logical, intent(in) :: useGPU, wantDebug
78 : logical, intent(out) :: success
79 :
80 : ! TODO: play with max_strip. If it were larger, the matrices being
81 : ! multiplied might be larger as well!
82 : integer(kind=ik), parameter :: max_strip=128
83 :
84 : real(kind=REAL_DATATYPE) :: PRECISION_LAMCH, PRECISION_LAPY2
85 : real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, &
86 : qtrans(2,2), dmax, zmax, d1new, d2new
87 316032 : real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
88 237024 : dbase(na), ddiff(na), ev_scale(na), tmp(na)
89 158016 : real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
90 79008 : real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
91 : #ifdef WITH_OPENMP
92 19752 : real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
93 : #endif
94 :
95 : integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
96 : l_rqm, ns, info
97 : integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
98 : l_cols_qreorg, np, l_idx, nqcols1, nqcols2
99 : integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
100 : np_cols, mpierr
101 : integer(kind=ik) :: np_next, np_prev, np_rem
102 79008 : integer(kind=ik) :: idx(na), idx1(na), idx2(na)
103 158016 : integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
104 :
105 : integer(kind=ik) :: istat
106 : character(200) :: errorMessage
107 : integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
108 :
109 : integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
110 : logical :: successCUDA
111 : integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
112 : &PRECISION&
113 : &_real
114 : #ifdef WITH_OPENMP
115 : integer(kind=ik) :: max_threads, my_thread
116 : integer(kind=ik) :: omp_get_max_threads, omp_get_thread_num
117 :
118 :
119 19752 : max_threads = omp_get_max_threads()
120 :
121 19752 : allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
122 19752 : if (istat .ne. 0) then
123 0 : print *,"merge_systems: error when allocating z_p "//errorMessage
124 0 : stop 1
125 : endif
126 : #endif
127 :
128 39504 : call obj%timer%start("merge_systems" // PRECISION_SUFFIX)
129 39504 : success = .true.
130 39504 : call obj%timer%start("mpi_communication")
131 39504 : call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
132 39504 : call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
133 39504 : call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
134 39504 : call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
135 39504 : call obj%timer%stop("mpi_communication")
136 :
137 : ! If my processor column isn't in the requested set, do nothing
138 :
139 39504 : if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
140 0 : call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
141 0 : return
142 : endif
143 : ! Determine the "next" and "prev" processor columns for the ring sends
144 :
145 39504 : if (my_pcol == npc_0+npc_n-1) then
146 39504 : np_next = npc_0
147 : else
148 0 : np_next = my_pcol + 1
149 : endif
150 :
151 39504 : if (my_pcol == npc_0) then
152 39504 : np_prev = npc_0+npc_n-1
153 : else
154 0 : np_prev = my_pcol - 1
155 : endif
156 : call check_monotony_&
157 : &PRECISION&
158 39504 : &(obj, nm,d,'Input1',wantDebug, success)
159 39504 : if (.not.(success)) then
160 0 : call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
161 0 : return
162 : endif
163 : call check_monotony_&
164 : &PRECISION&
165 39504 : &(obj,na-nm,d(nm+1),'Input2',wantDebug, success)
166 39504 : if (.not.(success)) then
167 0 : call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
168 0 : return
169 : endif
170 : ! Get global number of processors and my processor number.
171 : ! Please note that my_proc does not need to match any real processor number;
172 : ! it is just used for load balancing some loops.
173 :
174 39504 : n_procs = np_rows*npc_n
175 39504 : my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
176 :
177 :
178 : ! Local limits of the rows of Q
179 :
180 39504 : l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
181 39504 : l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
182 39504 : l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
183 :
184 39504 : l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm
185 39504 : l_rows = l_rqe-l_rqs+1 ! Total number of local rows
186 :
187 :
188 : ! My number of local columns
189 :
190 39504 : l_cols = COUNT(p_col(1:na)==my_pcol)
191 :
192 : ! Get max number of local columns
193 :
194 39504 : max_local_cols = 0
195 79008 : do np = npc_0, npc_0+npc_n-1
196 39504 : max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np))
197 : enddo
198 :
199 : ! Calculations start here
200 :
201 39504 : beta = abs(e)
202 39504 : sig = sign(1.0_rk,e)
203 :
204 : ! Calculate rank-1 modifier z
205 :
206 39504 : z(:) = 0
207 :
208 39504 : if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then
209 : ! nm is local on my row
210 5198376 : do i = 1, na
211 5178000 : if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
212 : enddo
213 : endif
214 :
215 39504 : if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then
216 : ! nm+1 is local on my row
217 5198376 : do i = 1, na
218 5178000 : if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
219 : enddo
220 : endif
221 :
222 : call global_gather_&
223 : &PRECISION&
224 39504 : &(obj, z, na)
225 : ! Normalize z so that norm(z) = 1. Since z is the concatenation of
226 : ! two normalized vectors, norm2(z) = sqrt(2).
227 39504 : z = z/sqrt(2.0_rk)
228 39504 : rho = 2.0_rk*beta
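! Note: the unnormalized rank-1 vector v stacks one row from each of two
! orthonormal eigenvector blocks, so norm(v) = sqrt(2). Writing
! v = sqrt(2)*z with norm(z) = 1 gives
!   beta*v*v^T = beta*(sqrt(2)*z)*(sqrt(2)*z)^T = (2*beta)*z*z^T = rho*z*z^T,
! which is why z is scaled by 1/sqrt(2) and rho is set to 2*beta together.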
229 : ! Calculate index for merging both systems by ascending eigenvalues
230 39504 : call obj%timer%start("blas")
231 39504 : call PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx )
232 39504 : call obj%timer%stop("blas")
233 :
234 : ! Calculate the allowable deflation tolerance
235 :
236 39504 : zmax = maxval(abs(z))
237 39504 : dmax = maxval(abs(d))
238 39504 : EPS = PRECISION_LAMCH( 'Epsilon' )
239 39504 : TOL = 8.0_rk*EPS*MAX(dmax,zmax)
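! Note: this is the same deflation tolerance as in LAPACK's DLAED2;
! a component with rho*|z(i)| <= tol perturbs the spectrum below the
! rounding level of the largest entries and can be deflated safely.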
240 :
241 : ! If the rank-1 modifier is small enough, nothing more needs to be done
242 : ! except to reorganize D and Q
243 :
244 39504 : IF ( RHO*zmax <= TOL ) THEN
245 :
246 : ! Rearrange eigenvalues
247 :
248 0 : tmp = d
249 0 : do i=1,na
250 0 : d(i) = tmp(idx(i))
251 : enddo
252 :
253 : ! Rearrange eigenvectors
254 : call resort_ev_&
255 : &PRECISION &
256 0 : (obj, idx, na)
257 :
258 0 : call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
259 :
260 0 : return
261 : ENDIF
262 :
263 : ! Merge and deflate system
264 :
265 39504 : na1 = 0
266 39504 : na2 = 0
267 :
268 : ! COLTYP:
269 : ! 1 : non-zero in the upper half only;
270 : ! 2 : dense;
271 : ! 3 : non-zero in the lower half only;
272 : ! 4 : deflated.
273 :
274 39504 : coltyp(1:nm) = 1
275 39504 : coltyp(nm+1:na) = 3
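! Note: before the merge Q is block diagonal, Q = diag(Q1,Q2): columns
! 1..nm are nonzero only in the upper half (type 1), columns nm+1..na
! only in the lower half (type 3). A column only becomes dense (type 2)
! when a rotation below mixes one column of each kind.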
276 :
277 9147504 : do i=1,na
278 :
279 9108000 : if (rho*abs(z(idx(i))) <= tol) then
280 :
281 : ! Deflate due to small z component.
282 :
283 919536 : na2 = na2+1
284 919536 : d2(na2) = d(idx(i))
285 919536 : idx2(na2) = idx(i)
286 919536 : coltyp(idx(i)) = 4
287 :
288 8188464 : else if (na1>0) then
289 :
290 : ! Check if eigenvalues are close enough to allow deflation.
291 :
292 8148960 : S = Z(idx(i))
293 8148960 : C = Z1(na1)
294 :
295 : ! Find sqrt(a**2+b**2) without overflow or
296 : ! destructive underflow.
297 8148960 : TAU = PRECISION_LAPY2( C, S )
298 8148960 : T = D1(na1) - D(idx(i))
299 8148960 : C = C / TAU
300 8148960 : S = -S / TAU
301 8148960 : IF ( ABS( T*C*S ) <= TOL ) THEN
302 :
303 : ! Deflation is possible.
304 :
305 45024 : na2 = na2+1
306 :
307 45024 : Z1(na1) = TAU
308 :
309 45024 : d2new = D(idx(i))*C**2 + D1(na1)*S**2
310 45024 : d1new = D(idx(i))*S**2 + D1(na1)*C**2
311 :
312 : ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0
313 : ! This means that after the above transformation it must be
314 : ! D1(na1) <= d1new <= D(idx(i))
315 : ! D1(na1) <= d2new <= D(idx(i))
316 : !
317 : ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))
318 : ! so there is no problem with sorting here.
319 : ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)
320 : ! which makes a check (and possibly a resort) necessary.
321 : !
322 : ! The above relations may not hold exactly due to numeric differences
323 : ! so they have to be enforced to avoid problems with the sorting.
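! Note also that the rotation preserves the trace: since C**2 + S**2 == 1,
! d1new + d2new = D1(na1) + D(idx(i)), so the clipping below merely pulls
! the two new values back into [D1(na1), D(idx(i))] if rounding pushed
! them out.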
324 :
325 :
326 45024 : if (d1new<D1(na1) ) d1new = D1(na1)
327 45024 : if (d1new>D(idx(i))) d1new = D(idx(i))
328 :
329 45024 : if (d2new<D1(na1) ) d2new = D1(na1)
330 45024 : if (d2new>D(idx(i))) d2new = D(idx(i))
331 :
332 45024 : D1(na1) = d1new
333 :
334 45024 : do j=na2-1,1,-1
335 45024 : if (d2new<d2(j)) then
336 0 : d2(j+1) = d2(j)
337 0 : idx2(j+1) = idx2(j)
338 : else
339 45024 : exit ! Loop
340 : endif
341 : enddo
342 :
343 45024 : d2(j+1) = d2new
344 45024 : idx2(j+1) = idx(i)
345 :
346 45024 : qtrans(1,1) = C; qtrans(1,2) =-S
347 45024 : qtrans(2,1) = S; qtrans(2,2) = C
348 : call transform_columns_&
349 : &PRECISION &
350 45024 : (obj, idx(i), idx1(na1))
351 45024 : if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
352 45024 : if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
353 :
354 45024 : coltyp(idx(i)) = 4
355 :
356 : else
357 8103936 : na1 = na1+1
358 8103936 : d1(na1) = d(idx(i))
359 8103936 : z1(na1) = z(idx(i))
360 8103936 : idx1(na1) = idx(i)
361 : endif
362 : else
363 39504 : na1 = na1+1
364 39504 : d1(na1) = d(idx(i))
365 39504 : z1(na1) = z(idx(i))
366 39504 : idx1(na1) = idx(i)
367 : endif
368 :
369 : enddo
370 : call check_monotony_&
371 : &PRECISION&
372 39504 : &(obj, na1,d1,'Sorted1', wantDebug, success)
373 39504 : if (.not.(success)) then
374 0 : call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
375 0 : return
376 : endif
377 : call check_monotony_&
378 : &PRECISION&
379 39504 : &(obj, na2,d2,'Sorted2', wantDebug, success)
380 39504 : if (.not.(success)) then
381 0 : call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
382 0 : return
383 : endif
384 :
385 39504 : if (na1==1 .or. na1==2) then
386 : ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid
387 :
388 0 : if (na1==1) then
389 0 : d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
390 : else ! na1==2
391 0 : call obj%timer%start("blas")
392 0 : call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
393 0 : call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
394 0 : call obj%timer%stop("blas")
395 : call transform_columns_&
396 : &PRECISION&
397 0 : &(obj, idx1(1), idx1(2))
398 : endif
399 :
400 : ! Add the deflated eigenvalues
401 0 : d(na1+1:na) = d2(1:na2)
402 :
403 : ! Calculate arrangement of all eigenvalues in output
404 0 : call obj%timer%start("blas")
405 0 : call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
406 0 : call obj%timer%stop("blas")
407 : ! Rearrange eigenvalues
408 :
409 0 : tmp = d
410 0 : do i=1,na
411 0 : d(i) = tmp(idx(i))
412 : enddo
413 :
414 : ! Rearrange eigenvectors
415 :
416 0 : do i=1,na
417 0 : if (idx(i)<=na1) then
418 0 : idxq1(i) = idx1(idx(i))
419 : else
420 0 : idxq1(i) = idx2(idx(i)-na1)
421 : endif
422 : enddo
423 : call resort_ev_&
424 : &PRECISION&
425 0 : &(obj, idxq1, na)
426 39504 : else if (na1>2) then
427 :
428 : ! Solve secular equation
429 :
430 39504 : z(1:na1) = 1
431 : #ifdef WITH_OPENMP
432 19752 : z_p(1:na1,:) = 1
433 : #endif
434 39504 : dbase(1:na1) = 0
435 39504 : ddiff(1:na1) = 0
436 :
437 39504 : info = 0
438 : #ifdef WITH_OPENMP
439 :
440 19752 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
441 :
442 39504 : !$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
443 19752 : my_thread = omp_get_thread_num()
444 19752 : !$OMP DO
445 : #endif
446 2325804 : DO i = my_proc+1, na1, n_procs ! work distributed over all processors
447 4612104 : call obj%timer%start("blas")
448 4612104 : call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
449 4612104 : call obj%timer%stop("blas")
450 4612104 : if (info/=0) then
451 : ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
452 : ! use the more stable bisection algorithm in solve_secular_equation
453 : ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
454 : call solve_secular_equation_&
455 : &PRECISION&
456 0 : &(obj, na1, i, d1, z1, delta, rho, s)
457 : endif
458 :
459 : ! Compute updated z
460 :
461 : #ifdef WITH_OPENMP
462 1109017344 : do j=1,na1
463 2211116532 : if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
464 : enddo
465 2306052 : z_p(i,my_thread) = z_p(i,my_thread)*delta(i)
466 : #else
467 1109017344 : do j=1,na1
468 1106711292 : if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
469 : enddo
470 2306052 : z(i) = z(i)*delta(i)
471 : #endif
472 : ! store dbase/ddiff
473 :
474 4612104 : if (i<na1) then
475 4591728 : if (abs(delta(i+1)) < abs(delta(i))) then
476 2147352 : dbase(i) = d1(i+1)
477 2147352 : ddiff(i) = delta(i+1)
478 : else
479 2444376 : dbase(i) = d1(i)
480 2444376 : ddiff(i) = delta(i)
481 : endif
482 : else
483 20376 : dbase(i) = d1(i)
484 20376 : ddiff(i) = delta(i)
485 : endif
486 : enddo
487 : #ifdef WITH_OPENMP
488 : !$OMP END PARALLEL
489 :
490 19752 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
491 :
492 39504 : do i = 0, max_threads-1
493 19752 : z(1:na1) = z(1:na1)*z_p(1:na1,i)
494 : enddo
495 : #endif
496 :
497 : call global_product_&
498 : &PRECISION&
499 39504 : (obj, z, na1)
500 39504 : z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) )
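! Background (cf. Gu & Eisenstat, LAPACK xLAED3): by Loewner's formula the
! exact eigenvalues lambda_k of the deflated system satisfy
!   z(j)**2 = prod_k (lambda_k - d1(j)) / prod_{k/=j} (d1(k) - d1(j)).
! The loop above accumulates prod_k (d1(j) - lambda_k) / prod_{k/=j} (d1(j) - d1(k))
! from the delta(:) returned by xLAED4. The numerator has one factor more
! than the denominator, so this product equals -z(j)**2, whence SQRT(-z)
! with the sign copied from the original z1.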
501 :
502 : call global_gather_&
503 : &PRECISION&
504 39504 : &(obj, dbase, na1)
505 : call global_gather_&
506 : &PRECISION&
507 39504 : &(obj, ddiff, na1)
508 39504 : d(1:na1) = dbase(1:na1) - ddiff(1:na1)
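! Note: every eigenvalue is carried in the split form
! lambda_i = dbase(i) - ddiff(i), where dbase(i) is the pole of the
! secular equation closest to lambda_i and ddiff(i) the small distance to
! it, which xLAED4 delivers to high relative accuracy. The differences
! d1(j) - lambda_i needed for the eigenvectors are later formed as
! (d1(j) - dbase(i)) + ddiff(i), avoiding the cancellation that computing
! d1(j) - d(i) directly would incur (see add_tmp below).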
509 :
510 : ! Calculate scale factors for eigenvectors
511 39504 : ev_scale(:) = 0.0_rk
512 :
513 : #ifdef WITH_OPENMP
514 :
515 19752 : call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
516 :
517 : !$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs, &
518 : !$OMP d1,dbase, ddiff, z, ev_scale, obj) &
519 2325804 : !$OMP DEFAULT(NONE)
520 :
521 : #endif
522 2325804 : DO i = my_proc+1, na1, n_procs ! work distributed over all processors
523 :
524 : ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code
525 : ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
526 :
527 : ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
528 : ! in exactly this order, but we want to prevent compiler optimization
529 : ! ev_scale_val = ev_scale(i)
530 : call add_tmp_&
531 : &PRECISION&
532 4651608 : &(obj, d1, dbase, ddiff, z, ev_scale(i), na1,i)
533 : ! ev_scale(i) = ev_scale_val
534 : enddo
535 : #ifdef WITH_OPENMP
536 : !$OMP END PARALLEL DO
537 :
538 19752 : call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
539 :
540 : #endif
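! Note: for each local eigenvalue i, add_tmp evaluates the unnormalized
! eigenvector components z(j)/(d1(j) - lambda_i), j = 1..na1, and stores
! the reciprocal of their Euclidean norm in ev_scale(i); these scale
! factors are applied when the eigenvector strips are assembled below.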
541 :
542 : call global_gather_&
543 : &PRECISION&
544 39504 : &(obj, ev_scale, na1)
545 : ! Add the deflated eigenvalues
546 39504 : d(na1+1:na) = d2(1:na2)
547 :
548 39504 : call obj%timer%start("blas")
549 : ! Calculate arrangement of all eigenvalues in output
550 39504 : call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
551 39504 : call obj%timer%stop("blas")
552 : ! Rearrange eigenvalues
553 39504 : tmp = d
554 9147504 : do i=1,na
555 9108000 : d(i) = tmp(idx(i))
556 : enddo
557 : call check_monotony_&
558 : &PRECISION&
559 39504 : &(obj, na,d,'Output', wantDebug, success)
560 :
561 39504 : if (.not.(success)) then
562 0 : call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
563 0 : return
564 : endif
565 : ! Eigenvector calculations
566 :
567 :
568 : ! Calculate the number of columns in the new local matrix Q
569 : ! which are updated from non-deflated/deflated eigenvectors.
570 : ! idxq1/2 stores the global column numbers.
571 :
572 39504 : nqcols1 = 0 ! number of non-deflated eigenvectors
573 39504 : nqcols2 = 0 ! number of deflated eigenvectors
574 9147504 : DO i = 1, na
575 9108000 : if (p_col_out(i)==my_pcol) then
576 7236000 : if (idx(i)<=na1) then
577 6526704 : nqcols1 = nqcols1+1
578 6526704 : idxq1(nqcols1) = i
579 : else
580 709296 : nqcols2 = nqcols2+1
581 709296 : idxq2(nqcols2) = i
582 : endif
583 : endif
584 : enddo
585 :
586 39504 : gemm_dim_k = MAX(1,l_rows)
587 39504 : gemm_dim_l = max_local_cols
588 39504 : gemm_dim_m = MIN(max_strip,MAX(1,nqcols1))
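! Note: the nqcols1 non-deflated output columns are processed in strips of
! at most max_strip columns (see the loop over ns below), so the work
! arrays ev (gemm_dim_l x gemm_dim_m) and qtmp2 (gemm_dim_k x gemm_dim_m)
! stay small while the GEMMs still operate on sizable blocks.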
589 :
590 39504 : allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errorMessage)
591 39504 : if (istat .ne. 0) then
592 0 : print *,"merge_systems: error when allocating qtmp1 "//errorMessage
593 0 : stop 1
594 : endif
595 :
596 39504 : allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errorMessage)
597 39504 : if (istat .ne. 0) then
598 0 : print *,"merge_systems: error when allocating ev "//errorMessage
599 0 : stop 1
600 : endif
601 :
602 39504 : allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errorMessage)
603 39504 : if (istat .ne. 0) then
604 0 : print *,"merge_systems: error when allocating qtmp2 "//errorMessage
605 0 : stop 1
606 : endif
607 :
608 39504 : qtmp1 = 0 ! May contain empty (unset) parts
609 39504 : qtmp2 = 0 ! Not really needed
610 :
611 39504 : if (useGPU) then
612 0 : successCUDA = cuda_malloc(qtmp1_dev, gemm_dim_k * gemm_dim_l * size_of_datatype)
613 0 : check_alloc_cuda("merge_systems: qtmp1_dev", successCUDA)
614 :
615 0 : successCUDA = cuda_malloc(ev_dev, gemm_dim_l * gemm_dim_m * size_of_datatype)
616 0 : check_alloc_cuda("merge_systems: ev_dev", successCUDA)
617 :
618 0 : successCUDA = cuda_malloc(qtmp2_dev, gemm_dim_k * gemm_dim_m * size_of_datatype)
619 0 : check_alloc_cuda("merge_systems: qtmp2_dev", successCUDA)
620 :
621 0 : successCUDA = cuda_memset(qtmp1_dev, 0, gemm_dim_k * gemm_dim_l * size_of_datatype)
622 0 : check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
623 :
624 0 : successCUDA = cuda_memset(qtmp2_dev, 0, gemm_dim_k * gemm_dim_m * size_of_datatype)
625 0 : check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
626 : endif
627 :
628 : ! Gather nonzero upper/lower components of old matrix Q
629 : ! which are needed for multiplication with new eigenvectors
630 :
631 39504 : nnzu = 0
632 39504 : nnzl = 0
633 8182944 : do i = 1, na1
634 8143440 : l_idx = l_col(idx1(i))
635 8143440 : if (p_col(idx1(i))==my_pcol) then
636 8143440 : if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
637 3914160 : nnzu = nnzu+1
638 3914160 : qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
639 : endif
640 8143440 : if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
641 4261776 : nnzl = nnzl+1
642 4261776 : qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
643 : endif
644 : endif
645 : enddo
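! qtmp1 now holds the nonzero upper parts of the non-deflated eigenvectors
! in rows 1..l_rnm of columns 1..nnzu and the nonzero lower parts in rows
! l_rnm+1..l_rows of columns 1..nnzl; these are the left-hand factors of
! the upper/lower GEMMs below.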
646 :
647 : ! Gather deflated eigenvectors behind the nonzero components
648 :
649 39504 : ndef = max(nnzu,nnzl)
650 1004064 : do i = 1, na2
651 964560 : l_idx = l_col(idx2(i))
652 964560 : if (p_col(idx2(i))==my_pcol) then
653 964560 : ndef = ndef+1
654 964560 : qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
655 : endif
656 : enddo
657 :
658 39504 : l_cols_qreorg = ndef ! Number of columns in reorganized matrix
659 :
660 : ! Set (output) Q to 0; it will accumulate the new Q
661 :
662 9147504 : DO i = 1, na
663 9108000 : if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
664 : enddo
665 :
666 39504 : np_rem = my_pcol
667 :
668 79008 : do np = 1, npc_n
669 : ! Do a ring send of qtmp1
670 :
671 39504 : if (np>1) then
672 :
673 0 : if (np_rem==npc_0) then
674 0 : np_rem = npc_0+npc_n-1
675 : else
676 0 : np_rem = np_rem-1
677 : endif
678 : #ifdef WITH_MPI
679 0 : call obj%timer%start("mpi_communication")
680 : call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL_PRECISION, &
681 : np_next, 1111, np_prev, 1111, &
682 0 : mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
683 0 : call obj%timer%stop("mpi_communication")
684 : #endif /* WITH_MPI */
685 : endif
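! Note: at pass np, np-1 cyclic shifts have been applied, so qtmp1 holds
! the reorganized columns originally owned by processor column np_rem;
! over the npc_n passes every processor column thus sees the data of each
! column of the group exactly once.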
686 :
687 39504 : if (useGPU) then
688 : successCUDA = cuda_memcpy(qtmp1_dev, loc(qtmp1(1,1)), &
689 0 : gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice)
690 0 : check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
691 : endif
692 :
693 : ! Gather the parts of d1 and z which correspond to the columns in qtmp1.
694 : ! This also delivers nnzu/nnzl for proc np_rem
695 :
696 39504 : nnzu = 0
697 39504 : nnzl = 0
698 8182944 : do i=1,na1
699 8143440 : if (p_col(idx1(i))==np_rem) then
700 8143440 : if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
701 3914160 : nnzu = nnzu+1
702 3914160 : d1u(nnzu) = d1(i)
703 3914160 : zu (nnzu) = z (i)
704 : endif
705 8143440 : if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
706 4261776 : nnzl = nnzl+1
707 4261776 : d1l(nnzl) = d1(i)
708 4261776 : zl (nnzl) = z (i)
709 : endif
710 : endif
711 : enddo
712 :
713 : ! Set the deflated eigenvectors in Q (coming from proc np_rem)
714 :
715 39504 : ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix
716 9147504 : do i = 1, na
717 9108000 : j = idx(i)
718 9108000 : if (j>na1) then
719 964560 : if (p_col(idx2(j-na1))==np_rem) then
720 964560 : ndef = ndef+1
721 964560 : if (p_col_out(i)==my_pcol) &
722 709296 : q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
723 : endif
724 : endif
725 : enddo
726 :
727 114624 : do ns = 0, nqcols1-1, max_strip ! strimining loop
728 :
729 75120 : ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip
730 :
731 : ! Get partial result from (output) Q
732 :
733 6601824 : do i = 1, ncnt
734 6526704 : qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns)))
735 : enddo
736 :
737 : ! Compute eigenvectors of the rank-1 modified matrix.
738 : ! Parts for multiplying with upper half of Q:
739 :
740 75120 : call obj%timer%start("strange_loop")
741 6601824 : do i = 1, ncnt
742 6526704 : j = idx(idxq1(i+ns))
743 : ! Calculate the j-th eigenvector of the deflated system
744 : ! See above for why it is done this way!
745 6526704 : tmp(1:nnzu) = d1u(1:nnzu)-dbase(j)
746 : call v_add_s_&
747 : &PRECISION&
748 6526704 : &(obj,tmp,nnzu,ddiff(j))
749 6526704 : ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j)
750 : enddo
751 75120 : call obj%timer%stop("strange_loop")
752 :
753 75120 : if(useGPU) then
754 : !TODO: it should be enough to copy l_rows x ncnt
755 : successCUDA = cuda_memcpy(qtmp2_dev, loc(qtmp2(1,1)), &
756 0 : gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
757 0 : check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
758 :
759 : !TODO the previous loop could possibly be done on the device, which
760 : !would reduce the amount of data copied
761 : successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
762 0 : gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
763 0 : check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
764 : endif
765 :
766 : ! Multiply old Q with eigenvectors (upper half)
767 :
768 75120 : if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
769 75120 : if (useGPU) then
770 0 : call obj%timer%start("cublas")
771 : call cublas_PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, &
772 : 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
773 : ev_dev, ubound(ev,dim=1), &
774 0 : 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1))
775 0 : call obj%timer%stop("cublas")
776 : else
777 75120 : call obj%timer%start("blas")
778 75120 : call obj%timer%start("gemm")
779 : call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, &
780 : 1.0_rk, qtmp1, ubound(qtmp1,dim=1), &
781 : ev, ubound(ev,dim=1), &
782 75120 : 1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1))
783 75120 : call obj%timer%stop("gemm")
784 75120 : call obj%timer%stop("blas")
785 : endif ! useGPU
786 : endif
787 :
788 : if(useGPU) then
789 : !TODO: it should be enough to copy l_rows x ncnt
790 : !TODO: actually this will be done after the second multiplication
791 :
792 : !TODO or actually maybe I should copy the half of the qtmp2 array
793 : !here and the rest after the next gemm
794 : !TODO either copy only half of the matrix here, and half after the
795 : !second gemm, or copy whole array after the next gemm
796 :
797 : ! successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
798 : ! gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
799 : ! check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
800 : endif
801 :
802 : ! Compute eigenvectors of the rank-1 modified matrix.
803 : ! Parts for multiplying with lower half of Q:
804 :
805 75120 : call obj%timer%start("strange_loop")
806 6601824 : do i = 1, ncnt
807 6526704 : j = idx(idxq1(i+ns))
808 : ! Calculate the j-th eigenvector of the deflated system
809 : ! See above for why it is done this way!
810 6526704 : tmp(1:nnzl) = d1l(1:nnzl)-dbase(j)
811 : call v_add_s_&
812 : &PRECISION&
813 6526704 : &(obj,tmp,nnzl,ddiff(j))
814 6526704 : ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j)
815 : enddo
816 75120 : call obj%timer%stop("strange_loop")
817 :
818 75120 : if(useGPU) then
819 : !TODO the previous loop could possibly be done on the device, which
820 : !would reduce the amount of data copied
821 : successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
822 0 : gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
823 0 : check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
824 : endif
825 :
826 : ! Multiply old Q with eigenvectors (lower half)
827 :
828 75120 : if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then
829 75120 : if (useGPU) then
830 0 : call obj%timer%start("cublas")
831 : call cublas_PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
832 : 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
833 : ev_dev, ubound(ev,dim=1), &
834 0 : 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1))
835 0 : call obj%timer%stop("cublas")
836 : else
837 75120 : call obj%timer%start("blas")
838 75120 : call obj%timer%start("gemm")
839 : call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
840 : 1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), &
841 : ev, ubound(ev,dim=1), &
842 75120 : 1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
843 75120 : call obj%timer%stop("gemm")
844 75120 : call obj%timer%stop("blas")
845 : endif ! useGPU
846 : endif
847 :
848 75120 : if(useGPU) then
849 : !TODO either copy only half of the matrix here and get rid of the
850 : !previous copy, or copy the whole array here
851 : successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
852 0 : gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
853 0 : check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
854 : endif
855 :
856 : ! Put partial result into (output) Q
857 :
858 6601824 : do i = 1, ncnt
859 6526704 : q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
860 : enddo
861 :
862 : enddo !ns = 0, nqcols1-1, max_strip ! strip mining loop
863 : enddo !do np = 1, npc_n
864 :
865 39504 : deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage)
866 39504 : if (istat .ne. 0) then
867 0 : print *,"merge_systems: error when deallocating ev "//errorMessage
868 0 : stop 1
869 : endif
870 :
871 39504 : if(useGPU) then
872 0 : successCUDA = cuda_free(qtmp1_dev)
873 0 : check_dealloc_cuda("merge_systems: qtmp1_dev", successCUDA)
874 0 : successCUDA = cuda_free(qtmp2_dev)
875 0 : check_dealloc_cuda("merge_systems: qtmp2_dev", successCUDA)
876 0 : successCUDA = cuda_free(ev_dev)
877 0 : check_dealloc_cuda("merge_systems: ev_dev", successCUDA)
878 : endif
879 :
880 : endif !very outer test (na1==1 .or. na1==2)
881 : #ifdef WITH_OPENMP
882 19752 : deallocate(z_p, stat=istat, errmsg=errorMessage)
883 19752 : if (istat .ne. 0) then
884 0 : print *,"merge_systems: error when deallocating z_p "//errorMessage
885 0 : stop 1
886 : endif
887 : #endif
888 :
889 39504 : call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
890 :
891 138264 : return
892 :
893 : contains
894 : subroutine add_tmp_&
895 4612104 : &PRECISION&
896 4612104 : &(obj, d1, dbase, ddiff, z, ev_scale_value, na1,i)
897 : use precision
898 : use elpa_abstract_impl
899 : implicit none
900 : class(elpa_abstract_impl_t), intent(inout) :: obj
901 : integer(kind=ik), intent(in) :: na1, i
902 :
903 : real(kind=REAL_DATATYPE), intent(in) :: d1(:), dbase(:), ddiff(:), z(:)
904 : real(kind=REAL_DATATYPE), intent(inout) :: ev_scale_value
905 9224208 : real(kind=REAL_DATATYPE) :: tmp(1:na1)
906 :
907 : ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code
908 : ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
909 :
910 : ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
911 : ! in exactly this order, but we want to prevent compiler optimization
912 :
913 4612104 : tmp(1:na1) = d1(1:na1) -dbase(i)
914 : call v_add_s_&
915 : &PRECISION&
916 4612104 : &(obj, tmp(1:na1),na1,ddiff(i))
917 4612104 : tmp(1:na1) = z(1:na1) / tmp(1:na1)
918 4612104 : ev_scale_value = 1.0_rk/sqrt(dot_product(tmp(1:na1),tmp(1:na1)))
919 :
920 : end subroutine add_tmp_&
921 4612104 : &PRECISION
922 :
923 : subroutine resort_ev_&
924 0 : &PRECISION&
925 0 : &(obj, idx_ev, nLength)
926 : use precision
927 : use elpa_abstract_impl
928 : implicit none
929 : class(elpa_abstract_impl_t), intent(inout) :: obj
930 : integer(kind=ik), intent(in) :: nLength
931 : integer(kind=ik) :: idx_ev(nLength)
932 : integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out
933 :
934 0 : real(kind=REAL_DATATYPE), allocatable :: qtmp(:,:)
935 : integer(kind=ik) :: istat
936 : character(200) :: errorMessage
937 :
938 0 : if (l_rows==0) return ! My processor column has no work to do
939 :
940 : ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i))
941 :
942 0 : l_cols_out = COUNT(p_col_out(1:na)==my_pcol)
943 0 : allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage)
944 0 : if (istat .ne. 0) then
945 0 : print *,"resort_ev: error when allocating qtmp "//errorMessage
946 0 : stop 1
947 : endif
948 :
949 0 : nc = 0
950 :
951 0 : do i=1,na
952 :
953 0 : pc1 = p_col(idx_ev(i))
954 0 : lc1 = l_col(idx_ev(i))
955 0 : pc2 = p_col_out(i)
956 :
957 0 : if (pc2<0) cycle ! This column is not needed in output
958 :
959 0 : if (pc2==my_pcol) nc = nc+1 ! Counter for output columns
960 :
961 0 : if (pc1==my_pcol) then
962 0 : if (pc2==my_pcol) then
963 : ! send and receive columns are local
964 0 : qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1)
965 : else
966 : #ifdef WITH_MPI
967 0 : call obj%timer%start("mpi_communication")
968 0 : call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, mod(i,4096), mpi_comm_cols, mpierr)
969 0 : call obj%timer%stop("mpi_communication")
970 : #endif /* WITH_MPI */
971 : endif
972 0 : else if (pc2==my_pcol) then
973 : #ifdef WITH_MPI
974 0 : call obj%timer%start("mpi_communication")
975 0 : call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL_PRECISION, pc1, mod(i,4096), mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
976 0 : call obj%timer%stop("mpi_communication")
977 : #else /* WITH_MPI */
978 0 : qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1)
979 : #endif /* WITH_MPI */
980 : endif
981 : enddo
982 :
983 : ! Insert qtmp into (output) q
984 :
985 0 : nc = 0
986 :
987 0 : do i=1,na
988 :
989 0 : pc2 = p_col_out(i)
990 0 : lc2 = l_col_out(i)
991 :
992 0 : if (pc2==my_pcol) then
993 0 : nc = nc+1
994 0 : q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc)
995 : endif
996 : enddo
997 :
998 0 : deallocate(qtmp, stat=istat, errmsg=errorMessage)
999 0 : if (istat .ne. 0) then
1000 0 : print *,"resort_ev: error when deallocating qtmp "//errorMessage
1001 0 : stop 1
1002 : endif
1003 : end subroutine resort_ev_&
1004 0 : &PRECISION
1005 :
1006 : subroutine transform_columns_&
1007 45024 : &PRECISION&
1008 : &(obj, col1, col2)
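! Applies the 2x2 transformation qtrans to the global columns col1/col2
! of Q:
!   (q_col1, q_col2) <- (q_col1, q_col2) * qtrans
! If the two columns live on different processor columns, each side
! obtains the other column via mpi_sendrecv and updates only its own.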
1009 : use precision
1010 : use elpa_abstract_impl
1011 : implicit none
1012 : class(elpa_abstract_impl_t), intent(inout) :: obj
1013 :
1014 : integer(kind=ik) :: col1, col2
1015 : integer(kind=ik) :: pc1, pc2, lc1, lc2
1016 :
1017 45024 : if (l_rows==0) return ! My processor column has no work to do
1018 :
1019 45024 : pc1 = p_col(col1)
1020 45024 : lc1 = l_col(col1)
1021 45024 : pc2 = p_col(col2)
1022 45024 : lc2 = l_col(col2)
1023 :
1024 45024 : if (pc1==my_pcol) then
1025 45024 : if (pc2==my_pcol) then
1026 : ! both columns are local
1027 45024 : tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1)
1028 45024 : q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
1029 45024 : q(l_rqs:l_rqe,lc1) = tmp(1:l_rows)
1030 : else
1031 : #ifdef WITH_MPI
1032 0 : call obj%timer%start("mpi_communication")
1033 : call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, 1, &
1034 : tmp, l_rows, MPI_REAL_PRECISION, pc2, 1, &
1035 0 : mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
1036 0 : call obj%timer%stop("mpi_communication")
1037 : #else /* WITH_MPI */
1038 0 : tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)
1039 : #endif /* WITH_MPI */
1040 0 : q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1)
1041 : endif
1042 0 : else if (pc2==my_pcol) then
1043 : #ifdef WITH_MPI
1044 0 : call obj%timer%start("mpi_communication")
1045 : call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL_PRECISION, pc1, 1, &
1046 : tmp, l_rows, MPI_REAL_PRECISION, pc1, 1, &
1047 0 : mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
1048 0 : call obj%timer%stop("mpi_communication")
1049 : #else /* WITH_MPI */
1050 0 : tmp(1:l_rows) = q(l_rqs:l_rqe,lc2)
1051 : #endif /* WITH_MPI */
1052 :
1053 0 : q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
1054 : endif
1055 : end subroutine transform_columns_&
1056 : &PRECISION
1057 :
1058 : subroutine global_gather_&
1059 158016 : &PRECISION&
1060 158016 : &(obj, z, n)
1061 : ! This routine sums up z over all processors.
1062 : ! It should only be used for gathering distributed results,
1063 : ! i.e. z(i) should be nonzero on exactly 1 processor column,
1064 : ! otherwise the results may be numerically different on different columns
1065 : use precision
1066 : use elpa_abstract_impl
1067 : implicit none
1068 : class(elpa_abstract_impl_t), intent(inout) :: obj
1069 : integer(kind=ik) :: n
1070 : real(kind=REAL_DATATYPE) :: z(n)
1071 316032 : real(kind=REAL_DATATYPE) :: tmp(n)
1072 :
1073 158016 : if (npc_n==1 .and. np_rows==1) return ! nothing to do
1074 :
1075 : ! Do an mpi_allreduce over processor rows
1076 : #ifdef WITH_MPI
1077 153024 : call obj%timer%start("mpi_communication")
1078 153024 : call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
1079 153024 : call obj%timer%stop("mpi_communication")
1080 : #else /* WITH_MPI */
1081 0 : tmp = z
1082 : #endif /* WITH_MPI */
1083 : ! If only 1 processor column, we are done
1084 153024 : if (npc_n==1) then
1085 153024 : z(:) = tmp(:)
1086 153024 : return
1087 : endif
1088 :
1089 : ! If all processor columns are involved, we can use mpi_allreduce
1090 0 : if (npc_n==np_cols) then
1091 : #ifdef WITH_MPI
1092 0 : call obj%timer%start("mpi_communication")
1093 0 : call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
1094 0 : call obj%timer%stop("mpi_communication")
1095 : #else /* WITH_MPI */
1096 0 : z = tmp
1097 : #endif /* WITH_MPI */
1098 :
1099 0 : return
1100 : endif
1101 :
1102 : ! Do a ring send over processor columns
1103 0 : z(:) = 0
1104 0 : do np = 1, npc_n
1105 0 : z(:) = z(:) + tmp(:)
1106 : #ifdef WITH_MPI
1107 0 : call obj%timer%start("mpi_communication")
1108 : call MPI_Sendrecv_replace(z, n, MPI_REAL_PRECISION, np_next, 1111, np_prev, 1111, &
1109 0 : mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
1110 0 : call obj%timer%stop("mpi_communication")
1111 : #endif /* WITH_MPI */
1112 : enddo
1113 : end subroutine global_gather_&
1114 0 : &PRECISION
1115 :
1116 : subroutine global_product_&
1117 39504 : &PRECISION&
1118 39504 : &(obj, z, n)
1119 : ! This routine calculates the elementwise global product of z over all processors.
1120 : use precision
1121 : use elpa_abstract_impl
1122 : implicit none
1123 : class(elpa_abstract_impl_t), intent(inout) :: obj
1124 :
1125 :
1126 : integer(kind=ik) :: n
1127 : real(kind=REAL_DATATYPE) :: z(n)
1128 :
1129 79008 : real(kind=REAL_DATATYPE) :: tmp(n)
1130 :
1131 39504 : if (npc_n==1 .and. np_rows==1) return ! nothing to do
1132 :
1133 : ! Do an mpi_allreduce over processor rows
1134 : #ifdef WITH_MPI
1135 38256 : call obj%timer%start("mpi_communication")
1136 38256 : call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_rows, mpierr)
1137 38256 : call obj%timer%stop("mpi_communication")
1138 : #else /* WITH_MPI */
1139 0 : tmp = z
1140 : #endif /* WITH_MPI */
1141 : ! If only 1 processor column, we are done
1142 38256 : if (npc_n==1) then
1143 38256 : z(:) = tmp(:)
1144 38256 : return
1145 : endif
1146 :
1147 : ! If all processor columns are involved, we can use mpi_allreduce
1148 0 : if (npc_n==np_cols) then
1149 : #ifdef WITH_MPI
1150 0 : call obj%timer%start("mpi_communication")
1151 0 : call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_cols, mpierr)
1152 0 : call obj%timer%stop("mpi_communication")
1153 : #else /* WITH_MPI */
1154 0 : z = tmp
1155 : #endif /* WITH_MPI */
1156 0 : return
1157 : endif
1158 :
1159 : ! We send all vectors to the first proc, do the product there
1160 : ! and redistribute the result.
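! (An mpi_allreduce over mpi_comm_cols is not applicable here, since only
! the subset npc_0 .. npc_0+npc_n-1 of the processor columns participates
! and no separate communicator exists for that subset.)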
1161 :
1162 0 : if (my_pcol == npc_0) then
1163 0 : z(1:n) = tmp(1:n)
1164 0 : do np = npc_0+1, npc_0+npc_n-1
1165 : #ifdef WITH_MPI
1166 0 : call obj%timer%start("mpi_communication")
1167 0 : call mpi_recv(tmp, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
1168 0 : call obj%timer%stop("mpi_communication")
1169 : #else /* WITH_MPI */
1170 0 : tmp(1:n) = z(1:n)
1171 : #endif /* WITH_MPI */
1172 0 : z(1:n) = z(1:n)*tmp(1:n)
1173 : enddo
1174 0 : do np = npc_0+1, npc_0+npc_n-1
1175 : #ifdef WITH_MPI
1176 0 : call obj%timer%start("mpi_communication")
1177 0 : call mpi_send(z, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, mpierr)
1178 0 : call obj%timer%stop("mpi_communication")
1179 : #endif /* WITH_MPI */
1180 : enddo
1181 : else
1182 : #ifdef WITH_MPI
1183 0 : call obj%timer%start("mpi_communication")
1184 0 : call mpi_send(tmp, n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, mpierr)
1185 0 : call mpi_recv(z ,n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
1186 0 : call obj%timer%stop("mpi_communication")
1187 : #else /* WITH_MPI */
1188 0 : z(1:n) = tmp(1:n)
1189 : #endif /* WITH_MPI */
1190 :
1191 : endif
1192 : end subroutine global_product_&
1193 0 : &PRECISION
1194 :
1195 : subroutine check_monotony_&
1196 197520 : &PRECISION&
1197 197520 : &(obj, n,d,text, wantDebug, success)
1198 : ! This is a test routine for checking whether the eigenvalues are monotonically increasing.
1199 : ! It is for debugging purposes only; an error should never be triggered!
1200 : use precision
1201 : use elpa_abstract_impl
1202 : implicit none
1203 :
1204 : class(elpa_abstract_impl_t), intent(inout) :: obj
1205 : integer(kind=ik) :: n
1206 : real(kind=REAL_DATATYPE) :: d(n)
1207 : character*(*) :: text
1208 :
1209 : integer(kind=ik) :: i
1210 : logical, intent(in) :: wantDebug
1211 : logical, intent(out) :: success
1212 :
1213 197520 : success = .true.
1214 27333552 : do i=1,n-1
1215 27136032 : if (d(i+1)<d(i)) then
1216 0 : if (wantDebug) write(error_unit,'(a,a,i8,2g25.17)') 'ELPA1_check_monotony: Monotony error on ',text,i,d(i),d(i+1)
1217 0 : success = .false.
1218 0 : return
1219 : endif
1220 : enddo
1221 : end subroutine check_monotony_&
1222 197520 : &PRECISION
1223 :
1224 : end subroutine merge_systems_&
1225 : &PRECISION
|