#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

subroutine solve_tridi_&
&PRECISION_AND_SUFFIX &
  ( obj, na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
    mpi_comm_cols, useGPU, wantDebug, success )

  use precision
  use elpa_abstract_impl
  implicit none
#include "../../src/general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik), intent(in)               :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
  real(kind=REAL_DATATYPE), intent(inout)    :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=REAL_DATATYPE), intent(inout)    :: q(ldq,*)
#else
  real(kind=REAL_DATATYPE), intent(inout)    :: q(ldq,matrixCols)
#endif
  logical, intent(in)                        :: useGPU, wantDebug
  logical, intent(out)                       :: success

  integer(kind=ik)                           :: i, j, n, np, nc, nev1, l_cols, l_rows
  integer(kind=ik)                           :: my_prow, my_pcol, np_rows, np_cols, mpierr

  integer(kind=ik), allocatable              :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)

  integer(kind=ik)                           :: istat
  character(200)                             :: errorMessage

  call obj%timer%start("solve_tridi" // PRECISION_SUFFIX)

  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(mpi_comm_rows, my_prow, mpierr)
  call mpi_comm_size(mpi_comm_rows, np_rows, mpierr)
  call mpi_comm_rank(mpi_comm_cols, my_pcol, mpierr)
  call mpi_comm_size(mpi_comm_cols, np_cols, mpierr)
  call obj%timer%stop("mpi_communication")

  success = .true.

  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

  ! Set Q to 0
  q(1:l_rows, 1:l_cols) = 0.0_rk

  ! Get the limits of the subdivisions; each subdivision has as many columns
  ! as fit on the respective processor column.

  allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi: error when allocating limits "//errorMessage
    stop 1
  endif

  limits(0) = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

    ! Check for the case that a column has zero width.
    ! This is not supported!
    ! ScaLAPACK supports it but delivers no results for these columns,
    ! which is rather annoying.
    if (nc==0) then
      call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
      if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
      success = .false.
      return
    endif
    limits(np+1) = limits(np) + nc
  enddo

  ! Subdivide matrix by subtracting rank 1 modifications
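  !
  ! This is the divide step of Cuppen's divide-and-conquer method: with
  ! s = sign(e(n)) and z = e_n + s*e_(n+1) (unit vectors), subtracting
  ! |e(n)| from d(n) and d(n+1) expresses the tridiagonal matrix T as
  !    T = blockdiag(T1, T2) + |e(n)| * z * z**T ,
  ! so the two halves can be solved independently and the rank-1
  ! correction is restored later in merge_systems.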

  do i=1,np_cols-1
    n = limits(i)
    d(n) = d(n) - abs(e(n))
    d(n+1) = d(n+1) - abs(e(n))
  enddo

  ! Solve sub problems on processor columns

  nc = limits(my_pcol) ! column after which my problem starts

  if (np_cols>1) then
    nev1 = l_cols ! all eigenvectors are needed
  else
    nev1 = MIN(nev,l_cols)
  endif
  call solve_tridi_col_&
  &PRECISION_AND_SUFFIX &
    (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
     matrixCols, mpi_comm_rows, useGPU, wantDebug, success)
  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
    return
  endif
  ! If there is only 1 processor column, we are done

  if (np_cols==1) then
    deallocate(limits, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_tridi: error when deallocating limits "//errorMessage
      stop 1
    endif

    call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
    return
  endif

  ! Set index arrays for Q columns

  ! Dense distribution scheme:
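  ! (each processor column owns one contiguous range of global columns;
  ! p_col(i) and l_col(i) record the owning processor column and the
  ! local column index of global column i under this layout)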

  allocate(l_col(na), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi: error when allocating l_col "//errorMessage
    stop 1
  endif

  allocate(p_col(na), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi: error when allocating p_col "//errorMessage
    stop 1
  endif

  n = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1)
    do i=1,nc
      n = n+1
      l_col(n) = i
      p_col(n) = np
    enddo
  enddo

  ! Block cyclic distribution scheme, only nev columns are set:
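  ! (global column g = i + j*nblk + n, with i a multiple of nblk*np_cols,
  ! 0 <= j < np_cols and 1 <= n <= nblk, lives on processor column j as
  ! local column i/np_cols + n; columns beyond nev keep p_col_bc = -1 and
  ! l_col_bc = -1 so they are not computed in the final merge)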

  allocate(l_col_bc(na), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi: error when allocating l_col_bc "//errorMessage
    stop 1
  endif

  allocate(p_col_bc(na), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi: error when allocating p_col_bc "//errorMessage
    stop 1
  endif

  p_col_bc(:) = -1
  l_col_bc(:) = -1

  do i = 0, na-1, nblk*np_cols
    do j = 0, np_cols-1
      do n = 1, nblk
        if (i+j*nblk+n <= MIN(nev,na)) then
          p_col_bc(i+j*nblk+n) = j
          l_col_bc(i+j*nblk+n) = i/np_cols + n
        endif
      enddo
    enddo
  enddo

  ! Recursively merge sub problems
  call merge_recursive_&
  &PRECISION &
    (obj, 0, np_cols, useGPU, wantDebug, success)
  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
    return
  endif

  deallocate(limits, l_col, p_col, l_col_bc, p_col_bc, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi: error when deallocating limits, l_col, p_col, l_col_bc, p_col_bc "//errorMessage
    stop 1
  endif

  call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
  return

  contains
  recursive subroutine merge_recursive_&
  &PRECISION &
    (obj, np_off, nprocs, useGPU, wantDebug, success)
    use precision
    use elpa_abstract_impl
    implicit none

    ! noff is always a multiple of nblk_ev
    ! nlen-noff is always > nblk_ev

    class(elpa_abstract_impl_t), intent(inout) :: obj
    integer(kind=ik)                           :: np_off, nprocs
    integer(kind=ik)                           :: np1, np2, noff, nlen, nmid, n
#ifdef WITH_MPI
    ! integer(kind=ik)                         :: my_mpi_status(mpi_status_size)
#endif
    logical, intent(in)                        :: useGPU, wantDebug
    logical, intent(out)                       :: success

    success = .true.

    if (nprocs<=1) then
      ! Safety check only
      if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
      success = .false.
      return
    endif
    ! Split problem into 2 subproblems of size np1 / np2

    np1 = nprocs/2
    np2 = nprocs-np1

    if (np1 > 1) call merge_recursive_&
    &PRECISION &
      (obj, np_off, np1, useGPU, wantDebug, success)
    if (.not.(success)) return
    if (np2 > 1) call merge_recursive_&
    &PRECISION &
      (obj, np_off+np1, np2, useGPU, wantDebug, success)
    if (.not.(success)) return

    noff = limits(np_off)
    nmid = limits(np_off+np1) - noff
    nlen = limits(np_off+nprocs) - noff
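    ! After the two recursive merges, processor columns np_off ..
    ! np_off+np1-1 hold the eigenvalues of the first half, d(noff+1 :
    ! noff+nmid), and columns np_off+np1 .. np_off+nprocs-1 hold those of
    ! the second half, d(noff+nmid+1 : noff+nlen). The two exchanges below
    ! give every participating column the complete d of the merged problem
    ! before merge_systems is called.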

#ifdef WITH_MPI
    call obj%timer%start("mpi_communication")
    if (my_pcol==np_off) then
      do n=np_off+np1,np_off+nprocs-1
        call mpi_send(d(noff+1), nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
      enddo
    endif
    call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

    if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call mpi_recv(d(noff+1), nmid, MPI_REAL_PRECISION, np_off, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      ! d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif /* WITH_MPI */
    endif

    if (my_pcol==np_off+np1) then
      do n=np_off,np_off+np1-1
#ifdef WITH_MPI
        call obj%timer%start("mpi_communication")
        call mpi_send(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
        call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
      enddo
    endif
    if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call mpi_recv(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, np_off+np1, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      ! d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif /* WITH_MPI */
    endif
    if (nprocs == np_cols) then

      ! Last merge; the result distribution must be block cyclic, noff==0,
      ! and p_col_bc is set so that only nev eigenvectors are calculated.
      call merge_systems_&
      &PRECISION &
        (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
         nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
         l_col_bc, p_col_bc, np_off, nprocs, useGPU, wantDebug, success )
      if (.not.(success)) return
    else
      ! Not the last merge; leave the dense column distribution in place.
      call merge_systems_&
      &PRECISION &
        (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
         nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
         l_col(noff+1), p_col(noff+1), np_off, nprocs, useGPU, wantDebug, success )
      if (.not.(success)) return
    endif
  end subroutine merge_recursive_&
  &PRECISION

end subroutine solve_tridi_&
&PRECISION_AND_SUFFIX

subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX &
  ( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success )

  ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
  ! with the divide and conquer method.
  ! Works best if the number of processor rows is a power of 2!
  use precision
  use elpa_abstract_impl
  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj

  integer(kind=ik)              :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
  real(kind=REAL_DATATYPE)      :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=REAL_DATATYPE)      :: q(ldq,*)
#else
  real(kind=REAL_DATATYPE)      :: q(ldq,matrixCols)
#endif

  integer(kind=ik), parameter   :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used

  real(kind=REAL_DATATYPE), allocatable :: qmat1(:,:), qmat2(:,:)
  integer(kind=ik)              :: i, n, np
  integer(kind=ik)              :: ndiv, noff, nmid, nlen, max_size
  integer(kind=ik)              :: my_prow, np_rows, mpierr

  integer(kind=ik), allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
  logical, intent(in)           :: useGPU, wantDebug
  logical, intent(out)          :: success
  integer(kind=ik)              :: istat
  character(200)                :: errorMessage

  call obj%timer%start("solve_tridi_col" // PRECISION_SUFFIX)
  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(mpi_comm_rows, my_prow, mpierr)
  call mpi_comm_size(mpi_comm_rows, np_rows, mpierr)
  call obj%timer%stop("mpi_communication")
  success = .true.
  ! Calculate the number of subdivisions needed.

  n = na
  ndiv = 1
  do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
    n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries
    ndiv = ndiv*2
  enddo
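  ! On exit, ndiv is a power of 2 not exceeding np_rows, chosen so that the
  ! subproblems stay larger than min_submatrix_size. The halving formula
  ! keeps the split point even and takes the bigger half, e.g. n=10 splits
  ! into halves of size 6 and 4, n=8 into 4 and 4.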

  ! If there is only 1 processor row and not all eigenvectors are needed
  ! and the matrix size is big enough, then use 2 subdivisions
  ! so that merge_systems is called once and only the needed
  ! eigenvectors are calculated for the final problem.

  if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2

  allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi_col: error when allocating limits "//errorMessage
    stop 1
  endif

  limits(0) = 0
  limits(ndiv) = na

  n = ndiv
  do while(n>1)
    n = n/2 ! n is always a power of 2
    do i=0,ndiv-1,2*n
      ! We want to have even boundaries (for cache line alignments)
      limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
    enddo
  enddo
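  ! limits now partitions the rows 1..na into ndiv intervals by recursive
  ! bisection: subproblem i covers rows limits(i)+1 .. limits(i+1), and
  ! every interior boundary is rounded to an even index by the same
  ! formula that determined ndiv above.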

  ! Calculate the maximum size of a subproblem

  max_size = 0
  do i=1,ndiv
    max_size = MAX(max_size,limits(i)-limits(i-1))
  enddo

  ! Subdivide matrix by subtracting rank 1 modifications

  do i=1,ndiv-1
    n = limits(i)
    d(n) = d(n) - abs(e(n))
    d(n+1) = d(n+1) - abs(e(n))
  enddo

  if (np_rows==1) then

    ! For 1 processor row there may be 1 or 2 subdivisions
    do n=0,ndiv-1
      noff = limits(n)        ! Start of subproblem
      nlen = limits(n+1)-noff ! Size of subproblem

      call solve_tridi_single_problem_&
      &PRECISION_AND_SUFFIX &
        (obj, nlen, d(noff+1), e(noff+1), &
         q(nqoff+noff+1,noff+1), ubound(q,dim=1), wantDebug, success)

      if (.not.(success)) return
    enddo

  else

    ! Solve sub problems in parallel with solve_tridi_single;
    ! there is at most one subproblem per processor row.

    allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_tridi_col: error when allocating qmat1 "//errorMessage
      stop 1
    endif

    allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_tridi_col: error when allocating qmat2 "//errorMessage
      stop 1
    endif

    qmat1 = 0 ! Make sure that all elements are defined

    if (my_prow < ndiv) then

      noff = limits(my_prow)        ! Start of subproblem
      nlen = limits(my_prow+1)-noff ! Size of subproblem
      call solve_tridi_single_problem_&
      &PRECISION_AND_SUFFIX &
        (obj, nlen, d(noff+1), e(noff+1), qmat1, &
         ubound(qmat1,dim=1), wantDebug, success)

      if (.not.(success)) return
    endif

    ! Fill eigenvectors in qmat1 into global matrix q
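    ! Processor row np solved subproblem np, so it acts as the broadcast
    ! root: the subproblem's eigenvalues and its eigenvector matrix are
    ! broadcast along mpi_comm_rows, and every processor row then copies
    ! its block-cyclic share of each eigenvector column into q.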

    do np = 0, ndiv-1

      noff = limits(np)
      nlen = limits(np+1)-noff
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call MPI_Bcast(d(noff+1), nlen, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
      qmat2 = qmat1
      call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      ! qmat2 = qmat1 ! is this correct
#endif /* WITH_MPI */
      do i=1,nlen

#ifdef WITH_MPI
        call distribute_global_column_&
        &PRECISION &
          (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else /* WITH_MPI */
        call distribute_global_column_&
        &PRECISION &
          (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif /* WITH_MPI */
      enddo

    enddo

    deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_tridi_col: error when deallocating qmat1, qmat2 "//errorMessage
      stop 1
    endif

  endif

  ! Allocate and set index arrays l_col and p_col

  allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi_col: error when allocating l_col, p_col_i, p_col_o "//errorMessage
    stop 1
  endif

  do i=1,na
    l_col(i) = i
    p_col_i(i) = 0
    p_col_o(i) = 0
  enddo

  ! Merge subproblems
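  ! Bottom-up sweep: each pass merges adjacent pairs of blocks of n base
  ! intervals, so the number of open subproblems halves per pass and the
  ! full problem of size na is assembled after log2(ndiv) passes.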

  n = 1
  do while(n<ndiv) ! if ndiv==1, the problem was solved by a single call to solve_tridi_single

    do i=0,ndiv-1,2*n

      noff = limits(i)
      nmid = limits(i+n) - noff
      nlen = limits(i+2*n) - noff

      if (nlen == na) then
        ! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
        p_col_o(nev+1:na) = -1
      endif
      call merge_systems_&
      &PRECISION &
        (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
         matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
         l_col(noff+1), p_col_o(noff+1), 0, 1, useGPU, wantDebug, success)
      if (.not.(success)) return

    enddo

    n = 2*n

  enddo

  deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi_col: error when deallocating limits, l_col, p_col_i, p_col_o "//errorMessage
    stop 1
  endif

  call obj%timer%stop("solve_tridi_col" // PRECISION_SUFFIX)

end subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX

recursive subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX &
  (obj, nlen, d, e, q, ldq, wantDebug, success)

  ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
  ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
  use precision
  use elpa_abstract_impl
  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik)              :: nlen, ldq
  real(kind=REAL_DATATYPE)      :: d(nlen), e(nlen), q(ldq,nlen)

  real(kind=REAL_DATATYPE), allocatable :: work(:), qtmp(:), ds(:), es(:)
  real(kind=REAL_DATATYPE)      :: dtmp

  integer(kind=ik)              :: i, j, lwork, liwork, info
  integer(kind=ik), allocatable :: iwork(:)

  logical, intent(in)           :: wantDebug
  logical, intent(out)          :: success
  integer(kind=ik)              :: istat
  character(200)                :: errorMessage

  call obj%timer%start("solve_tridi_single" // PRECISION_SUFFIX)

  success = .true.
  allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi_single: error when allocating ds "//errorMessage
    stop 1
  endif

  ! Save d and e for the case that dstedc fails

  ds(:) = d(:)
  es(:) = e(:)

  ! First try dstedc; this is normally faster, but it may fail sometimes
  ! (the reason is not understood).

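  ! The workspace sizes below are the documented minimum for LAPACK's
  ! ?stedc with COMPZ='I': lwork >= 1 + 4*n + n**2, liwork >= 3 + 5*n.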
  lwork = 1 + 4*nlen + nlen**2
  liwork = 3 + 5*nlen
  allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi_single: error when allocating work "//errorMessage
    stop 1
  endif
  call obj%timer%start("blas")
  call PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
  call obj%timer%stop("blas")

  if (info /= 0) then

    ! DSTEDC failed, try DSTEQR. The workspace is enough for DSTEQR.

    write(error_unit,'(a,i8,a)') 'Warning: Lapack routine DSTEDC failed, info= ',info,', Trying DSTEQR!'

    d(:) = ds(:)
    e(:) = es(:)
    call obj%timer%start("blas")
    call PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info)
    call obj%timer%stop("blas")

    ! If DSTEQR fails as well, we don't know what to do further ...

    if (info /= 0) then
      if (wantDebug) &
        write(error_unit,'(a,i8,a)') 'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,', Aborting!'
      success = .false.
      return
    endif
  end if

  deallocate(work, iwork, ds, es, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_tridi_single: error when deallocating work, iwork, ds, es "//errorMessage
    stop 1
  endif

  ! Check if eigenvalues are monotonically increasing.
  ! This does not always seem to be the case (in the IBM implementation of dstedc ???)
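  ! If a pair is out of order, the misplaced eigenvalue and its eigenvector
  ! are moved to their correct position by one insertion-sort step below.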

  do i=1,nlen-1
    if (d(i+1)<d(i)) then
#ifdef DOUBLE_PRECISION_REAL
      if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then
#else
      if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then
#endif
        write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
      else
        write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
        write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.'
        write(error_unit,'(a)') 'To this extent, this is completely harmless.'
        write(error_unit,'(a)') 'Still, we keep this info message just in case.'
      end if
      allocate(qtmp(nlen), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi_single: error when allocating qtmp "//errorMessage
        stop 1
      endif

      dtmp = d(i+1)
      qtmp(1:nlen) = q(1:nlen,i+1)
      do j=i,1,-1
        if (dtmp<d(j)) then
          d(j+1) = d(j)
          q(1:nlen,j+1) = q(1:nlen,j)
        else
          exit ! Loop
        endif
      enddo
      d(j+1) = dtmp
      q(1:nlen,j+1) = qtmp(1:nlen)
      deallocate(qtmp, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi_single: error when deallocating qtmp "//errorMessage
        stop 1
      endif

    endif
  enddo
  call obj%timer%stop("solve_tridi_single" // PRECISION_SUFFIX)

end subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX