Line data Source code
1 : ! This file is part of ELPA.
2 : !
3 : ! The ELPA library was originally created by the ELPA consortium,
4 : ! consisting of the following organizations:
5 : !
6 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : ! Informatik,
10 : ! - Technische Universität München, Lehrstuhl für Informatik mit
11 : ! Schwerpunkt Wissenschaftliches Rechnen ,
12 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 : ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : ! and
16 : ! - IBM Deutschland GmbH
17 : !
18 : ! This particular source code file contains additions, changes and
19 : ! enhancements authored by Intel Corporation which is not part of
20 : ! the ELPA consortium.
21 : !
22 : ! More information can be found here:
23 : ! http://elpa.mpcdf.mpg.de/
24 : !
25 : ! ELPA is free software: you can redistribute it and/or modify
26 : ! it under the terms of the version 3 of the license of the
27 : ! GNU Lesser General Public License as published by the Free
28 : ! Software Foundation.
29 : !
30 : ! ELPA is distributed in the hope that it will be useful,
31 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : ! GNU Lesser General Public License for more details.
34 : !
35 : ! You should have received a copy of the GNU Lesser General Public License
36 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : !
38 : ! ELPA reflects a substantial effort on the part of the original
39 : ! ELPA consortium, and we ask you to respect the spirit of the
40 : ! license that we chose: i.e., please contribute any changes you
41 : ! may have back to the original ELPA library distribution, and keep
42 : ! any derivatives of ELPA under the same license that we chose for
43 : ! the original distribution, the GNU Lesser General Public License.
44 : !
45 : !
46 : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
47 : !
48 : ! Copyright of the original code rests with the authors inside the ELPA
49 : ! consortium. The copyright of any additional modifications shall rest
50 : ! with their original authors, but shall adhere to the licensing terms
51 : ! distributed along with the original code in the file "COPYING".
52 47448 : function elpa_solve_evp_&
53 : &MATH_DATATYPE&
54 : &_&
55 : &2stage_&
56 : &PRECISION&
57 46296 : &_impl (obj, a, ev, q) result(success)
58 :
59 : use elpa_abstract_impl
60 : use elpa_utilities
61 : use elpa1_compute
62 : use elpa2_compute
63 : use elpa_mpi
64 : use cuda_functions
65 : use mod_check_for_gpu
66 : use iso_c_binding
67 : implicit none
68 : #include "../general/precision_kinds.F90"
69 : class(elpa_abstract_impl_t), intent(inout) :: obj
70 : logical :: useGPU
71 : #if REALCASE == 1
72 : logical :: useQR
73 : logical :: useQRActual
74 : #endif
75 : integer(kind=c_int) :: kernel
76 :
77 : #ifdef USE_ASSUMED_SIZE
78 : MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*)
79 : MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,*)
80 : #else
81 : MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
82 : MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
83 : #endif
84 : real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
85 47448 : MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
86 :
87 : integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
88 : integer(kind=c_int) :: nbw, num_blocks
89 : #if COMPLEXCASE == 1
90 : integer(kind=c_int) :: l_cols_nev, l_rows, l_cols
91 : #endif
92 47448 : MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: tmat(:,:,:)
93 47448 : real(kind=C_DATATYPE_KIND), allocatable :: e(:)
94 : #if COMPLEXCASE == 1
95 18864 : real(kind=C_DATATYPE_KIND), allocatable :: q_real(:,:)
96 : #endif
97 47448 : MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target :: q_dummy(:,:)
98 : MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:)
99 :
100 :
101 : integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev
102 :
103 : integer(kind=c_int) :: i
104 : logical :: success, successCUDA
105 : logical :: wantDebug
106 : integer(kind=c_int) :: istat, gpu, debug, qr
107 : character(200) :: errorMessage
108 : logical :: do_useGPU, do_useGPU_trans_ev_tridi
109 : integer(kind=c_int) :: numberOfGPUDevices
110 : integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
111 : &PRECISION&
112 : &_&
113 : &MATH_DATATYPE
114 : integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, &
115 : mpi_comm_rows, mpi_comm_cols, &
116 : mpi_comm_all, check_pd, error
117 :
118 : logical :: do_bandred, do_tridiag, do_solve_tridi, &
119 : do_trans_to_band, do_trans_to_full
120 :
121 : call obj%timer%start("elpa_solve_evp_&
122 : &MATH_DATATYPE&
123 : &_2stage_&
124 : &PRECISION&
125 47448 : &")
126 :
127 47448 : success = .true.
128 :
129 47448 : if (present(q)) then
130 46296 : obj%eigenvalues_only = .false.
131 : else
132 1152 : obj%eigenvalues_only = .true.
133 : endif
134 :
135 47448 : na = obj%na
136 47448 : nev = obj%nev
137 47448 : lda = obj%local_nrows
138 47448 : ldq = obj%local_nrows
139 47448 : nblk = obj%nblk
140 47448 : matrixCols = obj%local_ncols
141 :
142 : ! special case na = 1
143 47448 : if (na .eq. 1) then
144 : #if REALCASE == 1
145 0 : ev(1) = a(1,1)
146 : #endif
147 : #if COMPLEXCASE == 1
148 0 : ev(1) = real(a(1,1))
149 : #endif
150 0 : if (.not.(obj%eigenvalues_only)) then
151 0 : q(1,1) = ONE
152 : endif
153 : call obj%timer%stop("elpa_solve_evp_&
154 : &MATH_DATATYPE&
155 : &_2stage_&
156 : &PRECISION&
157 0 : &")
158 0 : return
159 : endif
160 :
161 47448 : if (nev == 0) then
162 0 : nev = 1
163 0 : obj%eigenvalues_only = .true.
164 : endif
165 :
166 : #if REALCASE == 1
167 28584 : call obj%get("real_kernel",kernel,error)
168 28584 : if (error .ne. ELPA_OK) then
169 0 : print *,"Problem getting option. Aborting..."
170 0 : stop
171 : endif
172 : ! check consistency between request for GPUs and defined kernel
173 28584 : call obj%get("gpu", gpu,error)
174 28584 : if (error .ne. ELPA_OK) then
175 0 : print *,"Problem getting option. Aborting..."
176 0 : stop
177 : endif
178 28584 : if (gpu == 1) then
179 0 : if (kernel .ne. ELPA_2STAGE_REAL_GPU) then
180 0 : write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
181 0 : write(error_unit,*) "The compute kernel will be executed on CPUs!"
182 0 : else if (nblk .ne. 128) then
183 0 : kernel = ELPA_2STAGE_REAL_GENERIC
184 : endif
185 : endif
186 28584 : if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
187 0 : if (gpu .ne. 1) then
188 0 : write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
189 : endif
190 : endif
191 :
192 : #ifdef SINGLE_PRECISION_REAL
193 : ! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
194 : if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
195 7848 : kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4 .or. &
196 : kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6 ) then
197 0 : write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for POWER8"
198 0 : write(error_unit,*) "The GENERIC kernel will be used at the moment"
199 0 : kernel = ELPA_2STAGE_REAL_GENERIC
200 : endif
201 : ! special case at the moment NO single precision kernels on SPARC64 -> set GENERIC for now
202 : if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
203 7848 : kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. &
204 : kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6 ) then
205 0 : write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
206 0 : write(error_unit,*) "The GENERIC kernel will be used at the moment"
207 0 : kernel = ELPA_2STAGE_REAL_GENERIC
208 : endif
209 : #endif
210 :
211 : #endif
212 :
213 : #if COMPLEXCASE == 1
214 18864 : call obj%get("complex_kernel",kernel,error)
215 18864 : if (error .ne. ELPA_OK) then
216 0 : print *,"Problem getting option. Aborting..."
217 0 : stop
218 : endif
219 : ! check consistency between request for GPUs and defined kernel
220 18864 : call obj%get("gpu", gpu,error)
221 18864 : if (error .ne. ELPA_OK) then
222 0 : print *,"Problem getting option. Aborting..."
223 0 : stop
224 : endif
225 18864 : if (gpu == 1) then
226 0 : if (kernel .ne. ELPA_2STAGE_COMPLEX_GPU) then
227 0 : write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
228 0 : write(error_unit,*) "The compute kernel will be executed on CPUs!"
229 0 : else if (nblk .ne. 128) then
230 0 : kernel = ELPA_2STAGE_COMPLEX_GENERIC
231 : endif
232 : endif
233 18864 : if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
234 0 : if (gpu .ne. 1) then
235 0 : write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
236 : endif
237 : endif
238 :
239 : #endif
240 47448 : call obj%get("mpi_comm_rows",mpi_comm_rows,error)
241 47448 : if (error .ne. ELPA_OK) then
242 0 : print *,"Problem getting option. Aborting..."
243 0 : stop
244 : endif
245 47448 : call obj%get("mpi_comm_cols",mpi_comm_cols,error)
246 47448 : if (error .ne. ELPA_OK) then
247 0 : print *,"Problem getting option. Aborting..."
248 0 : stop
249 : endif
250 47448 : call obj%get("mpi_comm_parent",mpi_comm_all,error)
251 47448 : if (error .ne. ELPA_OK) then
252 0 : print *,"Problem getting option. Aborting..."
253 0 : stop
254 : endif
255 :
256 47448 : if (gpu .eq. 1) then
257 0 : useGPU = .true.
258 : else
259 47448 : useGPU = .false.
260 : endif
261 :
262 : #if REALCASE == 1
263 28584 : call obj%get("qr",qr,error)
264 28584 : if (error .ne. ELPA_OK) then
265 0 : print *,"Problem getting option. Aborting..."
266 0 : stop
267 : endif
268 28584 : if (qr .eq. 1) then
269 0 : useQR = .true.
270 : else
271 28584 : useQR = .false.
272 : endif
273 :
274 : #endif
275 47448 : call obj%timer%start("mpi_communication")
276 47448 : call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
277 47448 : call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
278 :
279 47448 : call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
280 47448 : call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
281 47448 : call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
282 47448 : call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
283 47448 : call obj%timer%stop("mpi_communication")
284 :
285 47448 : call obj%get("debug",debug,error)
286 47448 : if (error .ne. ELPA_OK) then
287 0 : print *,"Problem getting option. Aborting..."
288 0 : stop
289 : endif
290 47448 : wantDebug = debug == 1
291 :
292 47448 : do_useGPU = .false.
293 47448 : do_useGPU_trans_ev_tridi =.false.
294 :
295 :
296 : #if REALCASE == 1
297 28584 : useQRActual = .false.
298 : ! set usage of qr decomposition via API call
299 28584 : if (useQR) useQRActual = .true.
300 28584 : if (.not.(useQR)) useQRACtual = .false.
301 :
302 28584 : if (useQRActual) then
303 0 : if (mod(na,2) .ne. 0) then
304 0 : if (wantDebug) then
305 0 : write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
306 : endif
307 0 : print *, "Do not use QR-decomposition for this matrix and blocksize."
308 0 : success = .false.
309 0 : return
310 : endif
311 : endif
312 : #endif /* REALCASE */
313 :
314 47448 : if (useGPU) then
315 0 : if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
316 :
317 0 : do_useGPU = .true.
318 :
319 : ! set the neccessary parameters
320 0 : cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
321 0 : cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
322 0 : cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
323 0 : cudaHostRegisterPortable = cuda_hostRegisterPortable()
324 0 : cudaHostRegisterMapped = cuda_hostRegisterMapped()
325 : else
326 0 : print *,"GPUs are requested but not detected! Aborting..."
327 0 : success = .false.
328 0 : return
329 : endif
330 : else
331 : ! check whether set by environment variable
332 47448 : call obj%get("gpu",gpu,error)
333 47448 : if (error .ne. ELPA_OK) then
334 0 : print *,"Problem getting option. Aborting..."
335 0 : stop
336 : endif
337 47448 : do_useGPU = gpu == 1
338 47448 : if (do_useGPU) then
339 0 : if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
340 :
341 : ! set the neccessary parameters
342 0 : cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
343 0 : cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
344 0 : cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
345 0 : cudaHostRegisterPortable = cuda_hostRegisterPortable()
346 0 : cudaHostRegisterMapped = cuda_hostRegisterMapped()
347 : else
348 0 : print *,"GPUs are requested but not detected! Aborting..."
349 0 : success = .false.
350 0 : return
351 : endif
352 : endif
353 : endif
354 :
355 : ! check consistency between request for GPUs and defined kernel
356 47448 : if (do_useGPU) then
357 0 : if (nblk .ne. 128) then
358 : ! cannot run on GPU with this blocksize
359 : ! disable GPU usage for trans_ev_tridi
360 0 : do_useGPU_trans_ev_tridi = .false.
361 : else
362 : #if REALCASE == 1
363 0 : if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
364 : #endif
365 : #if COMPLEXCASE == 1
366 0 : if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
367 : #endif
368 0 : do_useGPU_trans_ev_tridi = .true.
369 : else
370 0 : do_useGPU_trans_ev_tridi = .false.
371 : endif
372 : endif
373 : endif
374 :
375 :
376 :
377 47448 : if (.not. obj%eigenvalues_only) then
378 46296 : q_actual => q(1:obj%local_nrows,1:obj%local_ncols)
379 : else
380 1152 : allocate(q_dummy(1:obj%local_nrows,1:obj%local_ncols))
381 1152 : q_actual => q_dummy(1:obj%local_nrows,1:obj%local_ncols)
382 : endif
383 :
384 :
385 : ! set the default values for each of the 5 compute steps
386 47448 : do_bandred = .true.
387 47448 : do_tridiag = .true.
388 47448 : do_solve_tridi = .true.
389 47448 : do_trans_to_band = .true.
390 47448 : do_trans_to_full = .true.
391 :
392 47448 : if (obj%eigenvalues_only) then
393 1152 : do_trans_to_band = .false.
394 1152 : do_trans_to_full = .false.
395 : endif
396 :
397 47448 : if (obj%is_set("bandwidth") == 1) then
398 864 : call obj%get("bandwidth",nbw,error)
399 864 : if (nbw == 0) then
400 0 : if (wantDebug) then
401 0 : write(error_unit,*) "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
402 0 : "for a diagonal matrix! This is too simple"
403 : endif
404 0 : print *, "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
405 0 : "for a diagonal matrix! This is too simple"
406 0 : success = .false.
407 0 : return
408 : endif
409 864 : if (mod(nbw, nblk) .ne. 0) then
410 : ! treat matrix with an effective bandwidth slightly bigger than specified bandwidth
411 : ! such that effective bandwidth is a multiply of nblk. which is a prerequiste for ELPA
412 0 : nbw = nblk * ceiling(real(nbw,kind=c_double)/real(nblk,kind=c_double))
413 :
414 : ! just check that effective bandwidth is NOT larger than matrix size
415 0 : if (nbw .gt. na) then
416 0 : if (wantDebug) then
417 0 : write(error_unit,*) "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
418 0 : "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
419 0 : "solve your problem by not specifing a bandwidth"
420 : endif
421 0 : print *, "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
422 0 : "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
423 0 : "solve your problem by not specifing a bandwidth"
424 0 : success = .false.
425 0 : return
426 : endif
427 : endif
428 864 : do_bandred = .false. ! we already have a banded matrix
429 864 : do_solve_tridi = .true. ! we also have to solve something :-)
430 864 : do_trans_to_band = .true. ! and still we have to backsub to banded
431 864 : do_trans_to_full = .false. ! but not to full since we have a banded matrix
432 : else ! bandwidth is not set
433 :
434 : ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
435 : ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
436 : ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
437 : ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
438 : ! on this and maybe allow a run-time optimization here
439 46584 : if (do_useGPU) then
440 0 : nbw = nblk
441 : else
442 : #if REALCASE == 1
443 28152 : nbw = (63/nblk+1)*nblk
444 : #elif COMPLEXCASE == 1
445 18432 : nbw = (31/nblk+1)*nblk
446 : #endif
447 : endif
448 :
449 46584 : num_blocks = (na-1)/nbw + 1
450 :
451 46584 : allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
452 46584 : if (istat .ne. 0) then
453 : print *,"solve_evp_&
454 : &MATH_DATATYPE&
455 : &_2stage_&
456 : &PRECISION&
457 0 : &" // ": error when allocating tmat "//errorMessage
458 0 : stop 1
459 : endif
460 :
461 46584 : do_bandred = .true.
462 46584 : do_solve_tridi = .true.
463 46584 : do_trans_to_band = .true.
464 46584 : do_trans_to_full = .true.
465 : end if ! matrix not already banded on input
466 :
467 : ! start the computations in 5 steps
468 :
469 47448 : if (do_bandred) then
470 46584 : call obj%timer%start("bandred")
471 : ! Reduction full -> band
472 : call bandred_&
473 : &MATH_DATATYPE&
474 : &_&
475 : &PRECISION &
476 : (obj, na, a, &
477 : a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
478 : tmat_dev, wantDebug, do_useGPU, success &
479 : #if REALCASE == 1
480 : , useQRActual &
481 : #endif
482 46584 : )
483 46584 : call obj%timer%stop("bandred")
484 46584 : if (.not.(success)) return
485 : endif
486 :
487 :
488 : ! Reduction band -> tridiagonal
489 47448 : if (do_tridiag) then
490 47448 : allocate(e(na), stat=istat, errmsg=errorMessage)
491 47448 : if (istat .ne. 0) then
492 : print *,"solve_evp_&
493 : &MATH_DATATYPE&
494 : &_2stage_&
495 0 : &PRECISION " // ": error when allocating e "//errorMessage
496 0 : stop 1
497 : endif
498 :
499 47448 : call obj%timer%start("tridiag")
500 : call tridiag_band_&
501 : &MATH_DATATYPE&
502 : &_&
503 : &PRECISION&
504 : (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
505 47448 : do_useGPU, wantDebug)
506 :
507 : #ifdef WITH_MPI
508 31632 : call obj%timer%start("mpi_communication")
509 31632 : call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
510 31632 : call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
511 31632 : call obj%timer%stop("mpi_communication")
512 : #endif /* WITH_MPI */
513 79080 : call obj%timer%stop("tridiag")
514 : endif ! do_tridiag
515 :
516 : #if COMPLEXCASE == 1
517 18864 : l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
518 18864 : l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
519 18864 : l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
520 :
521 18864 : allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
522 18864 : if (istat .ne. 0) then
523 : print *,"solve_evp_&
524 : &MATH_DATATYPE&
525 0 : &_2stage: error when allocating q_real"//errorMessage
526 0 : stop 1
527 : endif
528 : #endif
529 :
530 : ! Solve tridiagonal system
531 47448 : if (do_solve_tridi) then
532 47448 : call obj%timer%start("solve")
533 : call solve_tridi_&
534 : &PRECISION &
535 : (obj, na, nev, ev, e, &
536 : #if REALCASE == 1
537 : q_actual, ldq, &
538 : #endif
539 : #if COMPLEXCASE == 1
540 : q_real, ubound(q_real,dim=1), &
541 : #endif
542 47448 : nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU, wantDebug, success)
543 47448 : call obj%timer%stop("solve")
544 47448 : if (.not.(success)) return
545 : endif ! do_solve_tridi
546 :
547 47448 : deallocate(e, stat=istat, errmsg=errorMessage)
548 47448 : if (istat .ne. 0) then
549 : print *,"solve_evp_&
550 : &MATH_DATATYPE&
551 0 : &_2stage: error when deallocating e "//errorMessage
552 0 : stop 1
553 : endif
554 :
555 47448 : if (obj%eigenvalues_only) then
556 1152 : do_trans_to_band = .false.
557 1152 : do_trans_to_full = .false.
558 : else
559 :
560 46296 : call obj%get("check_pd",check_pd,error)
561 46296 : if (error .ne. ELPA_OK) then
562 0 : print *,"Problem getting option. Aborting..."
563 0 : stop
564 : endif
565 46296 : if (check_pd .eq. 1) then
566 0 : check_pd = 0
567 0 : do i = 1, na
568 0 : if (ev(i) .gt. THRESHOLD) then
569 0 : check_pd = check_pd + 1
570 : endif
571 : enddo
572 0 : if (check_pd .lt. na) then
573 : ! not positiv definite => eigenvectors needed
574 0 : do_trans_to_band = .true.
575 0 : do_trans_to_full = .true.
576 : else
577 0 : do_trans_to_band = .false.
578 0 : do_trans_to_full = .false.
579 : endif
580 : endif
581 : endif ! eigenvalues only
582 :
583 47448 : if (do_trans_to_band) then
584 : #if COMPLEXCASE == 1
585 : ! q must be given thats why from here on we can use q and not q_actual
586 :
587 18432 : q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
588 :
589 18432 : deallocate(q_real, stat=istat, errmsg=errorMessage)
590 18432 : if (istat .ne. 0) then
591 : print *,"solve_evp_&
592 : &MATH_DATATYPE&
593 0 : &_2stage: error when deallocating q_real"//errorMessage
594 0 : stop 1
595 : endif
596 : #endif
597 :
598 : ! Backtransform stage 1
599 46296 : call obj%timer%start("trans_ev_to_band")
600 :
601 : call trans_ev_tridi_to_band_&
602 : &MATH_DATATYPE&
603 : &_&
604 : &PRECISION &
605 : (obj, na, nev, nblk, nbw, q, &
606 : q_dev, &
607 : ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi, &
608 46296 : success=success, kernel=kernel)
609 46296 : call obj%timer%stop("trans_ev_to_band")
610 :
611 46296 : if (.not.(success)) return
612 :
613 : ! We can now deallocate the stored householder vectors
614 46296 : deallocate(hh_trans, stat=istat, errmsg=errorMessage)
615 46296 : if (istat .ne. 0) then
616 : print *, "solve_evp_&
617 : &MATH_DATATYPE&
618 : &_2stage_&
619 0 : &PRECISION " // ": error when deallocating hh_trans "//errorMessage
620 0 : stop 1
621 : endif
622 : endif ! do_trans_to_band
623 :
624 47448 : if (do_trans_to_full) then
625 45432 : call obj%timer%start("trans_ev_to_full")
626 45432 : if ( (do_useGPU) .and. .not.(do_useGPU_trans_ev_tridi) ) then
627 : ! copy to device if we want to continue on GPU
628 0 : successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
629 :
630 0 : successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
631 : endif
632 :
633 : ! Backtransform stage 2
634 :
635 : call trans_ev_band_to_full_&
636 : &MATH_DATATYPE&
637 : &_&
638 : &PRECISION &
639 : (obj, na, nev, nblk, nbw, a, &
640 : a_dev, lda, tmat, tmat_dev, q, &
641 : q_dev, &
642 : ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU &
643 : #if REALCASE == 1
644 : , useQRActual &
645 : #endif
646 45432 : )
647 :
648 :
649 45432 : deallocate(tmat, stat=istat, errmsg=errorMessage)
650 45432 : if (istat .ne. 0) then
651 : print *,"solve_evp_&
652 : &MATH_DATATYPE&
653 : &_2stage_&
654 0 : &PRECISION " // ": error when deallocating tmat"//errorMessage
655 0 : stop 1
656 : endif
657 45432 : call obj%timer%stop("trans_ev_to_full")
658 : endif ! do_trans_to_full
659 :
660 47448 : if (obj%eigenvalues_only) then
661 1152 : deallocate(q_dummy, stat=istat, errmsg=errorMessage)
662 1152 : if (istat .ne. 0) then
663 : print *,"solve_evp_&
664 : &MATH_DATATYPE&
665 : &_1stage_&
666 : &PRECISION&
667 0 : &" // ": error when deallocating q_dummy "//errorMessage
668 0 : stop 1
669 : endif
670 : endif
671 :
672 : call obj%timer%stop("elpa_solve_evp_&
673 : &MATH_DATATYPE&
674 : &_2stage_&
675 : &PRECISION&
676 47448 : &")
677 : 1 format(a,f10.3)
678 :
679 : end function elpa_solve_evp_&
680 : &MATH_DATATYPE&
681 : &_2stage_&
682 : &PRECISION&
683 94896 : &_impl
684 :
685 : ! vim: syntax=fortran
|