Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : ! This particular source code file contains additions, changes and
20 : ! enhancements authored by Intel Corporation, which is not part of
21 : ! the ELPA consortium.
22 : !
23 : ! More information can be found here:
24 : ! http://elpa.mpcdf.mpg.de/
25 : !
26 : ! ELPA is free software: you can redistribute it and/or modify
27 : ! it under the terms of the version 3 of the license of the
28 : ! GNU Lesser General Public License as published by the Free
29 : ! Software Foundation.
30 : !
31 : ! ELPA is distributed in the hope that it will be useful,
32 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
33 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 : ! GNU Lesser General Public License for more details.
35 : !
36 : ! You should have received a copy of the GNU Lesser General Public License
37 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38 : !
39 : ! ELPA reflects a substantial effort on the part of the original
40 : ! ELPA consortium, and we ask you to respect the spirit of the
41 : ! license that we chose: i.e., please contribute any changes you
42 : ! may have back to the original ELPA library distribution, and keep
43 : ! any derivatives of ELPA under the same license that we chose for
44 : ! the original distribution, the GNU Lesser General Public License.
45 : !
46 : !
47 : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48 : !
49 : ! Copyright of the original code rests with the authors inside the ELPA
50 : ! consortium. The copyright of any additional modifications shall rest
51 : ! with their original authors, but shall adhere to the licensing terms
52 : ! distributed along with the original code in the file "COPYING".
53 :
54 :
55 :
56 : ! ELPA2 -- 2-stage solver for ELPA
57 : !
58 : ! Copyright of the original code rests with the authors inside the ELPA
59 : ! consortium. The copyright of any additional modifications shall rest
60 : ! with their original authors, but shall adhere to the licensing terms
61 : ! distributed along with the original code in the file "COPYING".
62 : #endif
63 : subroutine bandred_&
64 : &MATH_DATATYPE&
65 : &_&
66 46584 : &PRECISION &
67 46584 : (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
68 : tmat_dev, wantDebug, useGPU, success &
69 : #if REALCASE == 1
70 : , useQR)
71 : #endif
72 : #if COMPLEXCASE == 1
73 : )
74 : #endif
75 :
76 : !-------------------------------------------------------------------------------
77 : ! bandred_real/complex: Reduces a distributed symmetric/Hermitian matrix to band form
78 : !
79 : ! Parameters
80 : !
81 : ! na Order of matrix
82 : !
83 : ! a(lda,matrixCols) Distributed matrix which should be reduced.
84 : ! Distribution is like in ScaLAPACK.
85 : ! As opposed to ScaLAPACK, a(:,:) must be set completely (upper and lower half)
86 : ! a(:,:) is overwritten on exit with the band and the Householder vectors
87 : ! in the upper half.
88 : !
89 : ! lda Leading dimension of a
90 : ! matrixCols local columns of matrix a
91 : !
92 : ! nblk blocksize of cyclic distribution, must be the same in both directions!
93 : !
94 : ! nbw semi bandwidth of output matrix
95 : !
96 : ! mpi_comm_rows
97 : ! mpi_comm_cols
98 : ! MPI-Communicators for rows/columns
99 : !
100 : ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1
101 : ! Factors for the Householder vectors (returned), needed for back transformation
102 : !
103 : !-------------------------------------------------------------------------------
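           ! (Editorial sketch, conventions assumed rather than taken from this file:
           !  roughly speaking, on exit  A = Q * B * Q**H  with B banded (semi bandwidth nbw)
           !  and Q a product of block Householder transformations, one per istep,
           !      Q_i = I - V_i * T_i * V_i**H ,
           !  where the columns of V_i are the Householder vectors left in the upper half of
           !  a(:,:) and T_i = tmat(:,:,i) is the triangular factor returned for the back
           !  transformation. Exact transposition/conjugation conventions are those of the
           !  back transformation routine, not of this rough formula.)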
104 :
105 : use cuda_functions
106 : use iso_c_binding
107 : use elpa1_compute
108 : #ifdef WITH_OPENMP
109 : use omp_lib
110 : #endif
111 : use precision
112 : use elpa_abstract_impl
113 : implicit none
114 : #include "../general/precision_kinds.F90"
115 : class(elpa_abstract_impl_t), intent(inout) :: obj
116 : integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
117 :
118 : #ifdef USE_ASSUMED_SIZE
119 : MATH_DATATYPE(kind=rck) :: a(lda,*), tmat(nbw,nbw,*)
120 : #else
121 : MATH_DATATYPE(kind=rck) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122 : #endif
123 :
124 : #if REALCASE == 1
125 : real(kind=rk) :: eps
126 : #endif
127 : logical, intent(in) :: useGPU
128 :
129 : integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
130 : integer(kind=ik) :: l_cols, l_rows
131 : #if REALCASE == 1
132 : integer(kind=ik) :: vmrCols
133 : #endif
134 : #ifdef WITH_OPENMP
135 : integer(kind=ik) :: mynlc, lrs, transformChunkSize
136 : #endif
137 : integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
138 : integer(kind=ik) :: istep, ncol, lch, lcx, nlc
139 : integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
140 :
141 : real(kind=rk) :: vnorm2
142 186336 : MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
143 :
144 : ! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
145 139752 : MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:)
146 139752 : MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
147 46584 : MATH_DATATYPE(kind=rck), allocatable :: vr(:)
148 :
149 : #if REALCASE == 1
150 : ! needed for blocked QR decomposition
151 : integer(kind=ik) :: PQRPARAM(11), work_size
152 : real(kind=rk) :: dwork_size(1)
153 56304 : real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
154 : #endif
155 : ! a_dev is passed from bandred_real to trans_ev_band
156 : integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
157 : #ifdef WITH_MPI
158 : integer(kind=ik), external :: numroc
159 : #endif
160 : integer(kind=ik) :: ierr
161 : integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
162 : integer(kind=c_intptr_t) :: lc_start, lc_end
163 : #if COMPLEXCASE == 1
164 : integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
165 : #endif
166 : integer(kind=ik) :: lr_end
167 : integer(kind=ik) :: na_cols
168 : #if COMPLEXCASE == 1
169 : integer(kind=ik) :: na_rows
170 : #endif
171 :
172 : logical, intent(in) :: wantDebug
173 : logical, intent(out) :: success
174 : logical :: successCUDA
175 : integer(kind=ik) :: istat
176 : character(200) :: errorMessage
177 :
178 : #if REALCASE == 1
179 : logical, intent(in) :: useQR
180 : #endif
181 : integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
182 : ii, pp
183 : integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
184 : &PRECISION&
185 : &_&
186 : &MATH_DATATYPE
187 :
188 : call obj%timer%start("bandred_&
189 : &MATH_DATATYPE&
190 : &" // &
191 : &PRECISION_SUFFIX &
192 46584 : )
193 46584 : if (wantDebug) call obj%timer%start("mpi_communication")
194 :
195 46584 : call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
196 46584 : call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
197 46584 : call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
198 46584 : call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
199 :
200 46584 : if (wantDebug) call obj%timer%stop("mpi_communication")
201 46584 : success = .true.
202 :
203 :
204 : ! Semi bandwidth nbw must be a multiple of blocksize nblk
205 46584 : if (mod(nbw,nblk)/=0) then
206 0 : if (my_prow==0 .and. my_pcol==0) then
207 0 : if (wantDebug) then
208 : write(error_unit,*) 'ELPA2_bandred_&
209 : &MATH_DATATYPE&
210 0 : &: ERROR: nbw=',nbw,', nblk=',nblk
211 : write(error_unit,*) 'ELPA2_bandred_&
212 : &MATH_DATATYPE&
213 0 : &: ELPA2 works only for nbw==n*nblk'
214 : endif
215 0 : success = .false.
216 0 : return
217 : endif
218 : endif
219 :
220 : ! na_rows is used nowhere; only na_cols
221 46584 : if (useGPU) then
222 : #ifdef WITH_MPI
223 : #if COMPLEXCASE == 1
224 0 : na_rows = numroc(na, nblk, my_prow, 0, np_rows)
225 : #endif
226 0 : na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
227 : #else
228 : #if COMPLEXCASE == 1
229 0 : na_rows = na
230 : #endif
231 0 : na_cols = na
232 : #endif /* WITH_MPI */
233 :
234 : ! Allocate device memory for a_dev, tmat_dev and vav_dev
235 0 : successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
236 0 : if (.not.(successCUDA)) then
237 : print *,"bandred_&
238 : &MATH_DATATYPE&
239 0 : &: error in cudaMalloc a_dev 1"
240 0 : stop 1
241 : endif
242 :
243 0 : successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
244 0 : if (.not.(successCUDA)) then
245 : print *,"bandred_&
246 : &MATH_DATATYPE&
247 0 : &: error in cudaMalloc tmat_dev 1"
248 0 : stop 1
249 : endif
250 :
251 0 : successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
252 0 : if (.not.(successCUDA)) then
253 : print *,"bandred_&
254 : &MATH_DATATYPE&
255 0 : &: error in cudaMalloc vav_dev 1"
256 0 : stop 1
257 : endif
258 : endif ! useGPU
259 :
260 : ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
261 :
262 46584 : tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
263 46584 : tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
264 :
265 46584 : l_rows_tile = tile_size/np_rows ! local rows of a tile
266 46584 : l_cols_tile = tile_size/np_cols ! local cols of a tile
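      ! (Illustrative example with assumed values, not taken from the code: for nblk = 32,
      !  np_rows = 4, np_cols = 3 the least common multiple is 12, so the minimum global
      !  tile size is 32*12 = 384; since 128*max(4,3) = 512 exceeds 384, it is rounded up
      !  to ((512-1)/384 + 1)*384 = 768, giving l_rows_tile = 768/4 = 192 and
      !  l_cols_tile = 768/3 = 256 local rows/cols per tile.)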
267 :
268 : #if REALCASE == 1
269 28152 : if (useQR) then
270 :
271 0 : if (useGPU) then
272 0 : print *,"qr decomposition at the moment not supported with GPU"
273 0 : stop 1
274 : endif
275 :
276 0 : if (which_qr_decomposition == 1) then
277 0 : call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
278 0 : allocate(tauvector(na), stat=istat, errmsg=errorMessage)
279 0 : if (istat .ne. 0) then
280 0 : print *,"bandred_real: error when allocating tauvector "//errorMessage
281 0 : stop 1
282 : endif
283 :
284 0 : allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
285 0 : if (istat .ne. 0) then
286 0 : print *,"bandred_real: error when allocating blockheuristic "//errorMessage
287 0 : stop 1
288 : endif
289 :
290 0 : l_rows = local_index(na, my_prow, np_rows, nblk, -1)
291 0 : allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
292 0 : if (istat .ne. 0) then
293 0 : print *,"bandred_real: error when allocating vmrCPU "//errorMessage
294 0 : stop 1
295 : endif
296 :
297 0 : vmrCols = na
298 :
299 : #ifdef USE_ASSUMED_SIZE_QR
300 : call qr_pdgeqrf_2dcomm_&
301 : &PRECISION&
302 : &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
303 : nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
304 : mpi_comm_rows, mpi_comm_cols, blockheuristic)
305 :
306 : #else
307 : call qr_pdgeqrf_2dcomm_&
308 : &PRECISION&
309 : &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
310 : vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
311 : nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
312 0 : mpi_comm_rows, mpi_comm_cols, blockheuristic)
313 : #endif
314 :
315 0 : work_size = int(dwork_size(1))
316 0 : allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
317 0 : if (istat .ne. 0) then
318 0 : print *,"bandred_real: error when allocating work_blocked "//errorMessage
319 0 : stop 1
320 : endif
321 0 : work_blocked = 0.0_rk
322 0 : deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
323 0 : if (istat .ne. 0) then
324 0 : print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
325 0 : stop 1
326 : endif
327 :
328 : endif ! which_qr_decomposition
329 :
330 : endif ! useQr
331 : #endif /* REALCASE */
332 :
333 46584 : if (useGPU) then
334 :
335 0 : cur_l_rows = 0
336 0 : cur_l_cols = 0
337 :
338 0 : successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
339 0 : if (.not.(successCUDA)) then
340 : print *,"bandred_&
341 : &MATH_DATATYPE&
342 0 : &: error in cudaMemcpy a_dev 2"
343 0 : stop 1
344 : endif
345 : endif ! useGPU
346 :
347 :
348 222696 : do istep = (na-1)/nbw, 1, -1
349 :
350 176112 : n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
351 :
352 : ! Number of local columns/rows of remaining matrix
353 176112 : l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
354 176112 : l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
355 :
356 : ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
357 :
358 176112 : if (useGPU) then
359 0 : cur_l_rows = max(l_rows, 1)
360 0 : cur_l_cols = max(l_cols, 1)
361 :
362 0 : vmr_size = cur_l_rows * 2 * n_cols
363 0 : umc_size = cur_l_cols * 2 * n_cols
364 :
365 : ! Allocate vmr and umc only if the new size exceeds their current capacity
366 : ! Added for FORTRAN CALLS
367 0 : if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
368 0 : if (allocated(vr)) then
369 0 : deallocate(vr, stat=istat, errmsg=errorMessage)
370 0 : if (istat .ne. 0) then
371 : print *,"bandred_&
372 : &MATH_DATATYPE&
373 0 : &: error when deallocating vr "//errorMessage
374 0 : stop 1
375 : endif
376 : endif
377 0 : allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
378 0 : if (istat .ne. 0) then
379 : print *,"bandred_&
380 : &MATH_DATATYPE&
381 0 : &: error when allocating vr "//errorMessage
382 0 : stop 1
383 : endif
384 :
385 : endif
386 :
387 0 : if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
388 0 : if (allocated(vmrCUDA)) then
389 0 : deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
390 0 : if (istat .ne. 0) then
391 : print *,"bandred_&
392 : &MATH_DATATYPE&
393 0 : &: error when deallocating vmrCUDA "//errorMessage
394 0 : stop 1
395 : endif
396 :
397 0 : successCUDA = cuda_free(vmr_dev)
398 0 : if (.not.(successCUDA)) then
399 : print *,"bandred_&
400 0 : &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
401 0 : stop 1
402 : endif
403 : endif
404 :
405 0 : allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
406 :
407 0 : if (istat .ne. 0) then
408 : print *,"bandred_&
409 : &MATH_DATATYPE&
410 0 : &: error when allocating vmrCUDA "//errorMessage
411 0 : stop 1
412 : endif
413 0 : successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
414 0 : if (.not.(successCUDA)) then
415 : print *,"bandred_&
416 : &MATH_DATATYPE&
417 0 : &: error in cudaMalloc: vmr_dev2"
418 0 : stop 1
419 : endif
420 :
421 : endif
422 :
423 0 : if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
424 0 : if (allocated(umcCUDA)) then
425 0 : deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
426 0 : if (istat .ne. 0) then
427 : print *,"bandred_&
428 : &MATH_DATATYPE&
429 0 : &: error when deallocating umcCUDA "//errorMessage
430 0 : stop 1
431 : endif
432 :
433 0 : successCUDA = cuda_free(umc_dev)
434 0 : if (.not.(successCUDA)) then
435 : print *,"bandred_&
436 : &MATH_DATATYPE&
437 0 : &: error in cudaFree umc_dev 1"
438 0 : stop 1
439 : endif
440 :
441 : endif
442 :
443 0 : allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
444 :
445 0 : if (istat .ne. 0) then
446 : print *,"bandred_&
447 : &MATH_DATATYPE&
448 0 : &: error when allocating umcCUDA "//errorMessage
449 0 : stop 1
450 : endif
451 :
452 0 : successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
453 0 : if (.not.(successCUDA)) then
454 : print *,"bandred_&
455 : &MATH_DATATYPE&
456 0 : &: error in cudaMalloc umc_dev 2"
457 0 : stop 1
458 : endif
459 :
460 : endif
461 :
462 : else ! GPU not used
463 :
464 : ! unify the names vmr and vmrCPU, as well as vmrGPU
465 : ! the same for umcCPU and umcGPU
466 : ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
467 :
468 176112 : allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
469 176112 : if (istat .ne. 0) then
470 : print *,"bandred_&
471 : &MATH_DATATYPE&
472 0 : &: error when allocating vmrCPU "//errorMessage
473 0 : stop 1
474 : endif
475 :
476 176112 : allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
477 176112 : if (istat .ne. 0) then
478 : print *,"bandred_&
479 : &MATH_DATATYPE&
480 0 : &: error when allocating umcCPU "//errorMessage
481 0 : stop 1
482 : endif
483 :
484 176112 : allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
485 176112 : if (istat .ne. 0) then
486 : print *,"bandred_&
487 : &MATH_DATATYPE&
488 0 : &: error when allocating vr "//errorMessage
489 0 : stop 1
490 : endif
491 :
492 : endif ! use GPU
493 :
494 176112 : if (useGPU) then
495 0 : vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
496 : else
497 176112 : vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
498 : endif ! useGPU
499 :
500 176112 : vr(:) = 0.0_rck
501 176112 : tmat(:,:,istep) = 0.0_rck
502 176112 : if (useGPU) then
503 : #if REALCASE == 1
504 0 : umcCUDA(1 : umc_size) = 0.0_rck
505 : #endif
506 0 : lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
507 0 : lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
508 0 : lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
509 :
510 0 : if (lc_start .le. 0) lc_start = 1
511 :
512 : ! Here we assume that the processor grid and the block grid are aligned
513 0 : cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
514 :
515 0 : if (my_pcol == cur_pcol) then
516 : successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
517 : int((lda*size_of_datatype),kind=c_intptr_t), &
518 : (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
519 : int(lda*size_of_datatype,kind=c_intptr_t), &
520 : int(lr_end*size_of_datatype,kind=c_intptr_t), &
521 0 : int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
522 :
523 :
524 0 : if (.not.(successCUDA)) then
525 : print *,"bandred_&
526 : &MATH_DATATYPE&
527 0 : &: error in cudaMemcpy2d"
528 0 : stop 1
529 : endif
530 : endif
531 : endif ! useGPU
532 :
533 : ! Reduce current block to lower triangular form
534 : #if REALCASE == 1
535 71280 : if (useQR) then
536 0 : if (which_qr_decomposition == 1) then
537 0 : vmrCols = 2*n_cols
538 : #ifdef USE_ASSUMED_SIZE_QR
539 : call qr_pdgeqrf_2dcomm_&
540 : &PRECISION&
541 : &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
542 : na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, &
543 : work_size, na, n_cols, nblk, nblk, &
544 : istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
545 : 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
546 : blockheuristic)
547 :
548 : #else
549 : call qr_pdgeqrf_2dcomm_&
550 : &PRECISION&
551 : &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , &
552 : max(l_rows,1), vmrCols, tauvector(1:na), na, &
553 : tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
554 : work_size, na, n_cols, nblk, nblk, &
555 : istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
556 : 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
557 0 : blockheuristic)
558 : #endif
559 : endif
560 :
561 : else !useQR
562 : #endif /* REALCASE == 1 */
563 6683976 : do lc = n_cols, 1, -1
564 :
565 6554448 : ncol = istep*nbw + lc ! absolute column number of householder Vector
566 6554448 : nrow = ncol - nbw ! Absolute number of pivot row
567 :
568 6554448 : lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
569 6554448 : lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
570 :
571 6554448 : tau = 0
572 :
573 6554448 : if (nrow == 1) exit ! Nothing to do
574 :
575 6507864 : cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
576 :
577 6507864 : if (my_pcol==cur_pcol) then
578 :
579 : ! Get Vector to be transformed; distribute last element and norm of
580 : ! remaining elements to all procs in current column
581 :
582 6507864 : vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
583 :
584 6507864 : if (my_prow==prow(nrow, nblk, np_rows)) then
585 4338576 : aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
586 4338576 : aux1(2) = vr(lr)
587 : else
588 2169288 : aux1(1) = dot_product(vr(1:lr),vr(1:lr))
589 2169288 : aux1(2) = 0.0_rck
590 : endif
591 :
592 : #ifdef WITH_MPI
593 4338576 : if (wantDebug) call obj%timer%start("mpi_communication")
594 : call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
595 4338576 : MPI_SUM, mpi_comm_rows, mpierr)
596 4338576 : if (wantDebug) call obj%timer%stop("mpi_communication")
597 :
598 : #else /* WITH_MPI */
599 2169288 : aux2 = aux1 ! this should be optimized
600 : #endif
601 :
602 : #if REALCASE == 1
603 3372120 : vnorm2 = aux2(1)
604 : #endif
605 : #if COMPLEXCASE == 1
606 3135744 : vnorm2 = real(aux2(1),kind=rk)
607 : #endif
608 6507864 : vrl = aux2(2)
609 :
610 : ! Householder transformation
611 : call hh_transform_&
612 : &MATH_DATATYPE&
613 : &_&
614 : &PRECISION &
615 6507864 : (obj, vrl, vnorm2, xf, tau, wantDebug)
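          ! (Editorial note, assuming the usual LAPACK-style reflector convention:
          !  hh_transform returns tau and the scaling xf such that H = I - tau*vr*vr**H,
          !  with vr(lr) = 1 in the pivot row, maps the column so that only the new band
          !  element vrl remains in row nrow; the annihilated positions a(1:lr-1,lch) are
          !  then reused below to store the Householder vector itself.)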
616 : ! Scale vr and store Householder Vector for back transformation
617 :
618 6507864 : vr(1:lr) = vr(1:lr) * xf
619 6507864 : if (my_prow==prow(nrow, nblk, np_rows)) then
620 4338576 : a(1:lr-1,lch) = vr(1:lr-1)
621 4338576 : a(lr,lch) = vrl
622 4338576 : vr(lr) = 1.0_rck
623 : else
624 2169288 : a(1:lr,lch) = vr(1:lr)
625 : endif
626 :
627 : endif
628 :
629 : ! Broadcast Householder Vector and tau along columns
630 :
631 6507864 : vr(lr+1) = tau
632 : #ifdef WITH_MPI
633 4338576 : if (wantDebug) call obj%timer%start("mpi_communication")
634 : call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
635 4338576 : cur_pcol, mpi_comm_cols, mpierr)
636 4338576 : if (wantDebug) call obj%timer%stop("mpi_communication")
637 :
638 : #endif /* WITH_MPI */
639 :
640 6507864 : if (useGPU) then
641 0 : vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
642 : else
643 6507864 : vmrCPU(1:lr,lc) = vr(1:lr)
644 : endif
645 6507864 : tau = vr(lr+1)
646 :
647 : #if REALCASE == 1
648 3372120 : tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
649 : #endif
650 : #if COMPLEXCASE == 1
651 3135744 : tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
652 : #endif
653 : ! Transform remaining columns in current block with Householder Vector
654 : ! Local dot product
655 :
656 6507864 : aux1 = 0.0_rck
657 :
658 : #ifdef WITH_OPENMP
659 : #if 0
660 : ! original complex implementation without openmp. check performance
661 : nlc = 0 ! number of local columns
662 : do j=1,lc-1
663 : lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
664 : if (lcx>0) then
665 : nlc = nlc+1
666 : aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
667 : endif
668 : enddo
669 :
670 : ! Get global dot products
671 : #ifdef WITH_MPI
672 : if (wantDebug) call obj%timer%start("mpi_communication")
673 : if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
674 :
675 : ! Transform
676 :
677 : nlc = 0
678 : do j=1,lc-1
679 : lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
680 : if (lcx>0) then
681 : nlc = nlc+1
682 : a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
683 :
684 : endif
685 : enddo
686 :
687 :
688 : if (wantDebug) call obj%timer%stop("mpi_communication")
689 :
690 : #else /* WITH_MPI */
691 : ! if (nlc>0) aux2=aux1
692 :
693 : ! Transform
694 :
695 : nlc = 0
696 : do j=1,lc-1
697 : lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
698 : if (lcx>0) then
699 : nlc = nlc+1
700 : a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
701 : endif
702 : enddo
703 :
704 : #endif /* WITH_MPI */
705 : !
706 : ! ! Transform
707 : !
708 : ! nlc = 0
709 : ! do j=1,lc-1
710 : ! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
711 : ! if (lcx>0) then
712 : ! nlc = nlc+1
713 : ! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
714 :
715 : ! endif
716 : ! enddo
717 : #endif /* if 0 */
718 :
719 : !Open up one omp region to avoid paying openmp overhead.
720 : !This does not help performance due to the addition of two openmp barriers around the MPI call,
721 : !but in the future this may be beneficial if these barriers are replaced with a faster implementation.
722 :
723 6507864 : !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
724 3253932 : mynlc = 0 ! number of local columns
725 :
726 : !This loop does not have independent iterations,
727 : !'mynlc' is incremented each iteration, and it is difficult to remove this dependency.
728 : !Thus each thread executes every iteration of the loop, but only does the work if it 'owns' that iteration.
729 : !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
730 : !the iteration number modulo the number of threads
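                   ! (Example, assuming 4 OpenMP threads: thread 2 computes the dot products
                   !  for j = 3, 7, 11, ... because (j-1) mod 4 == 2 for exactly those j.)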
731 73733904 : do j=1,lc-1
732 70479972 : lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
733 70479972 : if (lcx>0 ) then
734 70479972 : mynlc = mynlc+1
735 70479972 : if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
736 70479972 : if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
737 : endif
738 : endif
739 : enddo
740 :
741 : ! Get global dot products
742 :
743 3253932 : !$omp barrier
744 : !$omp single
745 : #ifdef WITH_MPI
746 2169288 : if (wantDebug) call obj%timer%start("mpi_communication")
747 2169288 : if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
748 2126112 : MPI_SUM, mpi_comm_rows, mpierr)
749 2169288 : if (wantDebug) call obj%timer%stop("mpi_communication")
750 : #else /* WITH_MPI */
751 1084644 : if (mynlc>0) aux2 = aux1
752 : #endif /* WITH_MPI */
753 : !$omp end single
754 3253932 : !$omp barrier
755 :
756 : ! Transform
757 3253932 : transformChunkSize=32
758 3253932 : mynlc = 0
759 73733904 : do j=1,lc-1
760 70479972 : lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
761 70479972 : if (lcx>0) then
762 70479972 : mynlc = mynlc+1
763 : !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
764 : !However, for some reason this is slower than doing it manually, so it is parallelized as below.
765 479168940 : do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
766 9942516564 : do pp = 1,transformChunkSize
767 9674787540 : if (pp + ii > lr) exit
768 : #if REALCASE == 1
769 6205992816 : a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
770 : #endif
771 : #if COMPLEXCASE == 1
772 3398314752 : a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
773 : #endif
774 : enddo
775 : enddo
776 : endif
777 : enddo
778 : !$omp end parallel
779 :
780 : #else /* WITH_OPENMP */
781 :
782 3253932 : nlc = 0 ! number of local columns
783 73733904 : do j=1,lc-1
784 70479972 : lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
785 70479972 : if (lcx>0) then
786 70479972 : nlc = nlc+1
787 70479972 : if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
788 : endif
789 : enddo
790 :
791 : ! Get global dot products
792 : #ifdef WITH_MPI
793 2169288 : if (wantDebug) call obj%timer%start("mpi_communication")
794 2169288 : if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
795 2126112 : MPI_SUM, mpi_comm_rows, mpierr)
796 2169288 : if (wantDebug) call obj%timer%stop("mpi_communication")
797 : #else /* WITH_MPI */
798 1084644 : if (nlc>0) aux2=aux1
799 : #endif /* WITH_MPI */
800 : ! Transform
801 :
802 3253932 : nlc = 0
803 73733904 : do j=1,lc-1
804 70479972 : lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
805 70479972 : if (lcx>0) then
806 70479972 : nlc = nlc+1
807 : #if REALCASE == 1
808 47040804 : a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
809 : #endif
810 : #if COMPLEXCASE == 1
811 23439168 : a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
812 : #endif
813 : endif
814 : enddo
815 : #endif /* WITH_OPENMP */
816 : enddo ! lc
817 :
818 176112 : if (useGPU) then
819 : ! store column tiles back to GPU
820 0 : cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
821 0 : if (my_pcol == cur_pcol) then
822 : successCUDA = cuda_memcpy2d((a_dev+ &
823 : int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
824 : int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
825 : int(lda*size_of_datatype,kind=c_intptr_t), &
826 : int(lr_end*size_of_datatype,kind=c_intptr_t), &
827 : int((lc_end - lc_start+1),kind=c_intptr_t), &
828 0 : int(cudaMemcpyHostToDevice,kind=c_int))
829 :
830 0 : if (.not.(successCUDA)) then
831 : print *, "bandred_&
832 : &MATH_DATATYPE&
833 0 : &: cuda memcpy a_dev failed ", istat
834 0 : stop 1
835 : endif
836 : endif
837 : endif
838 :
839 : ! Calculate scalar products of stored Householder vectors.
840 : ! This can be done in different ways; we use syrk (herk in the complex case)
841 :
842 176112 : vav = 0
843 176112 : call obj%timer%start("blas")
844 176112 : if (useGPU) then
845 0 : if (l_rows>0) &
846 : #if REALCASE == 1
847 : call PRECISION_SYRK('U', 'T', &
848 : #endif
849 : #if COMPLEXCASE == 1
850 : call PRECISION_HERK('U', 'C', &
851 : #endif
852 : n_cols, l_rows, ONE, &
853 : vmrCUDA, cur_l_rows, &
854 0 : ZERO, vav, ubound(vav,dim=1))
855 :
856 : else ! useGPU
857 176112 : if (l_rows>0) &
858 : #if REALCASE == 1
859 : call PRECISION_SYRK('U', 'T', &
860 : #endif
861 : #if COMPLEXCASE == 1
862 : call PRECISION_HERK('U', 'C', &
863 : #endif
864 176112 : n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
865 : endif
866 176112 : call obj%timer%stop("blas")
867 : #if REALCASE == 1
868 : call symm_matrix_allreduce_&
869 : #endif
870 : #if COMPLEXCASE == 1
871 : call herm_matrix_allreduce_&
872 : #endif
873 : &PRECISION &
874 176112 : (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
875 : ! Calculate triangular matrix T for block Householder Transformation
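      ! (Editorial sketch: vav now holds the upper triangle of V**H * V, summed over all
      !  process rows. The loop below converts it, row by row from the bottom up, into the
      !  triangular factor T = tmat(:,:,istep) of the compact WY form Q = I - V*T*V**H used
      !  later for the back transformation; the exact transposition/conjugation conventions
      !  are those of the code, not of this rough formula.)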
876 176112 : call obj%timer%start("blas")
877 6730560 : do lc=n_cols,1,-1
878 6554448 : tau = tmat(lc,lc,istep)
879 6554448 : if (lc<n_cols) then
880 : call PRECISION_TRMV('U', BLAS_TRANS_OR_CONJ, 'N',&
881 6378336 : n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
882 :
883 : #if REALCASE == 1
884 3328992 : tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
885 : #endif
886 : #if COMPLEXCASE == 1
887 3049344 : tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
888 : #endif
889 : endif
890 : enddo
891 176112 : call obj%timer%stop("blas")
892 : #if REALCASE == 1
893 : endif !useQR
894 : #endif
895 : ! Transpose vmr -> vmc (stored in umc, second half)
896 176112 : if (useGPU) then
897 : call elpa_transpose_vectors_&
898 : &MATH_DATATYPE&
899 : &_&
900 : &PRECISION &
901 : (obj, vmrCUDA, cur_l_rows, mpi_comm_rows, &
902 : umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, &
903 0 : mpi_comm_cols, 1, istep*nbw, n_cols, nblk)
904 : else ! useGPU
905 : call elpa_transpose_vectors_&
906 : &MATH_DATATYPE&
907 : &_&
908 : &PRECISION &
909 : (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
910 : umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
911 176112 : 1, istep*nbw, n_cols, nblk)
912 : endif
913 :
914 : ! Calculate umc = A**T * vmr
915 : ! Note that the distributed A has to be transposed
916 : ! As opposed to direct tridiagonalization there is no need to use the cache locality
917 : ! of the tiles, so we can use strips of the matrix
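      ! (Editorial sketch of the tile loop further below, after the disabled reference code:
      !  for each tile, roughly,
      !      umc(lcs:lce, 1:n_cols)        += a(1:lre, lcs:lce)**H * vmr(1:lre, 1:n_cols)
      !  and, for tiles strictly above the diagonal,
      !      vmr(1:lre, n_cols+1:2*n_cols) += a(1:lre, lcs:lce) * umc(lcs:lce, n_cols+1:2*n_cols) ,
      !  i.e. the contribution of the tiles below the diagonal, which are never touched, is
      !  obtained from the transposed copy of V kept in the second halves of umc and vmr.)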
918 :
919 :
920 : #if 0
921 : ! original complex implementation; check for performance
922 : umcCPU(1:l_cols,1:n_cols) = 0.0_rck
923 : vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
924 :
925 : if (l_cols>0 .and. l_rows>0) then
926 : do i=0,(istep*nbw-1)/tile_size
927 :
928 : lcs = i*l_cols_tile+1
929 : lce = min(l_cols,(i+1)*l_cols_tile)
930 : if (lce<lcs) cycle
931 :
932 : lre = min(l_rows,(i+1)*l_rows_tile)
933 :
934 : call obj%timer%start("blas")
935 : call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
936 : vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
937 : call obj%timer%stop("blas")
938 :
939 : if (i==0) cycle
940 : lre = min(l_rows,i*l_rows_tile)
941 : call obj%timer%start("blas")
942 : call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
943 : umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
944 : call obj%timer%stop("blas")
945 : enddo
946 :
947 : endif ! (l_cols>0 .and. l_rows>0)
948 : #endif /* if 0 */
949 :
950 : !Code for Algorithm 4
951 :
952 : ! n_way selects the code branch depending on the number of OpenMP threads
953 176112 : n_way = 1
954 : #ifdef WITH_OPENMP
955 :
956 : #if REALCASE == 1
957 35640 : n_way = omp_get_max_threads()
958 :
959 71280 : !$omp parallel private( i,lcs,lce,lrs,lre)
960 : #endif
961 88056 : if (n_way > 1) then
962 : #if REALCASE == 1
963 0 : !$omp do
964 : #endif
965 0 : do i=1,min(l_cols_tile, l_cols)
966 0 : umcCPU(i,1:n_cols) = 0.0_rck
967 : enddo
968 :
969 : #if REALCASE == 1
970 0 : !$omp do
971 : #endif
972 0 : do i=1,l_rows
973 0 : vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck
974 : enddo
975 :
976 0 : if (l_cols>0 .and. l_rows>0) then
977 :
978 : !SYMM variant 4
979 : !Partitioned Matrix Expression:
980 : ! Ct = Atl Bt + Atr Bb
981 : ! Cb = Atr' Bt + Abl Bb
982 : !
983 : !Loop invariant:
984 : ! Ct = Atl Bt + Atr Bb
985 : !
986 : !Update:
987 : ! C1 = A10'B0 + A11B1 + A21 B2
988 : !
989 : !This algorithm was chosen because in this algorithm, the loop around the dgemm calls
990 : !is easily parallelized, and regardless of the choice of algorithm,
991 : !the startup cost for parallelizing the dgemms inside the loop is too great
992 : #if REALCASE == 1
993 0 : !$omp do schedule(static,1)
994 : #endif
995 0 : do i=0,(istep*nbw-1)/tile_size
996 0 : lcs = i*l_cols_tile+1 ! local column start
997 0 : lce = min(l_cols, (i+1)*l_cols_tile) ! local column end
998 :
999 0 : lrs = i*l_rows_tile+1 ! local row start
1000 0 : lre = min(l_rows, (i+1)*l_rows_tile) ! local row end
1001 :
1002 : !C1 += [A11 A12] [B1
1003 : ! B2]
1004 0 : if ( lre > lrs .and. l_cols > lcs ) then
1005 0 : call obj%timer%start("blas")
1006 : call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, &
1007 : ONE, a(lrs,lcs), ubound(a,dim=1), &
1008 : umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
1009 0 : ZERO, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
1010 0 : call obj%timer%stop("blas")
1011 : endif
1012 :
1013 : ! C1 += A10' B0
1014 0 : if ( lce > lcs .and. i > 0 ) then
1015 0 : call obj%timer%start("blas")
1016 : call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
1017 : lce-lcs+1, n_cols, lrs-1, &
1018 : ONE, a(1,lcs), ubound(a,dim=1), &
1019 : vmrCPU(1,1), ubound(vmrCPU,dim=1), &
1020 0 : ZERO, umcCPU(lcs,1), ubound(umcCPU,dim=1))
1021 0 : call obj%timer%stop("blas")
1022 : endif
1023 : enddo
1024 : endif ! l_cols>0 .and. l_rows>0
1025 :
1026 : else ! n_way > 1
1027 : #endif /* WITH_OPENMP */
1028 :
1029 176112 : if (useGPU) then
1030 0 : umcCUDA(1 : l_cols * n_cols) = 0.0_rck
1031 0 : vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = 0.0_rck
1032 : else ! useGPU
1033 1876248 : umcCPU(1:l_cols,1:n_cols) = 0.0_rck
1034 176112 : vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
1035 : endif ! useGPU
1036 :
1037 176112 : if (l_cols>0 .and. l_rows>0) then
1038 :
1039 176112 : if (useGPU) then
1040 : successCUDA = cuda_memcpy(vmr_dev, &
1041 : loc(vmrCUDA(1)),&
1042 0 : vmr_size*size_of_datatype,cudaMemcpyHostToDevice)
1043 0 : if (.not.(successCUDA)) then
1044 : print *,"bandred_&
1045 : &MATH_DATATYPE&
1046 0 : &: error in cudaMemcpy vmr_dev 3"
1047 0 : stop 1
1048 : endif
1049 :
1050 : successCUDA = cuda_memcpy(umc_dev, &
1051 : loc(umcCUDA(1)), &
1052 0 : umc_size*size_of_datatype,cudaMemcpyHostToDevice)
1053 0 : if (.not.(successCUDA)) then
1054 : print *,"bandred_&
1055 : &MATH_DATATYPE&
1056 0 : &: error in cudaMemcpy umc_dev 3"
1057 0 : stop 1
1058 : endif
1059 : endif ! useGPU
1060 :
1061 462048 : do i=0,(istep*nbw-1)/tile_size
1062 :
1063 285936 : lcs = i*l_cols_tile+1
1064 303408 : lce = min(l_cols,(i+1)*l_cols_tile)
1065 285936 : if (lce<lcs) cycle
1066 303408 : lre = min(l_rows,(i+1)*l_rows_tile)
1067 :
1068 285936 : if (useGPU) then
1069 0 : call obj%timer%start("cublas")
1070 : call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
1071 : lce-lcs+1, n_cols, lre, &
1072 : ONE, (a_dev + ((lcs-1)*lda* &
1073 : size_of_datatype)), &
1074 : lda, vmr_dev,cur_l_rows, &
1075 : ONE, (umc_dev+ (lcs-1)* &
1076 : size_of_datatype), &
1077 0 : cur_l_cols)
1078 :
1079 0 : call obj%timer%stop("cublas")
1080 :
1081 0 : if(i==0) cycle
1082 0 : call obj%timer%start("cublas")
1083 :
1084 0 : lre = min(l_rows,i*l_rows_tile)
1085 : call cublas_PRECISION_GEMM('N', 'N', lre,n_cols, lce-lcs+1, ONE, &
1086 : (a_dev+ ((lcs-1)*lda* &
1087 : size_of_datatype)), &
1088 : lda, (umc_dev+(cur_l_cols * n_cols+lcs-1)* &
1089 : size_of_datatype), &
1090 : cur_l_cols, ONE, (vmr_dev+(cur_l_rows * n_cols)* &
1091 : size_of_datatype), &
1092 0 : cur_l_rows)
1093 0 : call obj%timer%stop("cublas")
1094 : else ! useGPU
1095 :
1096 285936 : call obj%timer%start("blas")
1097 : call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
1098 : lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
1099 285936 : vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
1100 285936 : call obj%timer%stop("blas")
1101 285936 : if (i==0) cycle
1102 127296 : lre = min(l_rows,i*l_rows_tile)
1103 109824 : call obj%timer%start("blas")
1104 : call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
1105 : umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, &
1106 109824 : vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
1107 109824 : call obj%timer%stop("blas")
1108 : endif ! useGPU
1109 : enddo ! i=0,(istep*nbw-1)/tile_size
1110 :
1111 176112 : if (useGPU) then
1112 : successCUDA = cuda_memcpy( &
1113 : loc(vmrCUDA(1)), &
1114 0 : vmr_dev,vmr_size*size_of_datatype,cudaMemcpyDeviceToHost)
1115 0 : if (.not.(successCUDA)) then
1116 : print *,"bandred_&
1117 : &MATH_DATATYPE&
1118 0 : &: error in cudaMemcpy vmr_dev 4"
1119 0 : stop 1
1120 : endif
1121 :
1122 : successCUDA = cuda_memcpy( &
1123 : loc(umcCUDA(1)), &
1124 0 : umc_dev, umc_size*size_of_datatype,cudaMemcpyDeviceToHost)
1125 0 : if (.not.(successCUDA)) then
1126 : print *,"bandred_&
1127 : &MATH_DATATYPE&
1128 0 : &: error in cudaMemcpy umc_dev 4"
1129 0 : stop 1
1130 : endif
1131 : endif ! useGPU
1132 : endif ! l_cols>0 .and. l_rows>0
1133 :
1134 : #ifdef WITH_OPENMP
1135 : endif ! n_way > 1
1136 : #if REALCASE == 1
1137 : !$omp end parallel
1138 : #endif
1139 : #endif
1140 : ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
1141 : ! on the processors containing the diagonal
1142 : ! This is only necessary if ur has been calculated, i.e. if the
1143 : ! global tile size is smaller than the global remaining matrix
1144 :
1145 : ! Or if we used Algorithm 4
1146 176112 : if (tile_size < istep*nbw .or. n_way > 1) then
1147 :
1148 41472 : if (useGPU) then
1149 :
1150 : call elpa_reduce_add_vectors_&
1151 : &MATH_DATATYPE&
1152 : &_&
1153 : &PRECISION &
1154 : (obj, vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows, &
1155 : mpi_comm_rows, umcCUDA, &
1156 0 : cur_l_cols, mpi_comm_cols, istep*nbw, n_cols, nblk)
1157 : else ! useGPU
1158 :
1159 : call elpa_reduce_add_vectors_&
1160 : &MATH_DATATYPE&
1161 : &_&
1162 : &PRECISION &
1163 : (obj, vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
1164 : umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
1165 41472 : istep*nbw, n_cols, nblk)
1166 : endif ! useGPU
1167 : endif ! tile_size < istep*nbw .or. n_way > 1
1168 :
1169 176112 : if (l_cols>0) then
1170 :
1171 176112 : if (useGPU) then
1172 : #ifdef WITH_MPI
1173 0 : allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage)
1174 0 : if (istat .ne. 0) then
1175 : print *,"bandred_&
1176 : &MATH_DATATYPE&
1177 0 : &: error when allocating tmpCUDA "//errorMessage
1178 0 : stop 1
1179 : endif
1180 :
1181 0 : if (wantDebug) call obj%timer%start("mpi_communication")
1182 :
1183 : call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_MATH_DATATYPE_PRECISION, &
1184 0 : MPI_SUM, mpi_comm_rows, ierr)
1185 :
1186 0 : umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
1187 0 : if (wantDebug) call obj%timer%stop("mpi_communication")
1188 : #else /* WITH_MPI */
1189 :
1190 : ! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
1191 :
1192 : #endif /* WITH_MPI */
1193 :
1194 0 : if (allocated(tmpCUDA)) then
1195 0 : deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
1196 0 : if (istat .ne. 0) then
1197 : print *,"bandred_&
1198 : &MATH_DATATYPE&
1199 0 : &: error when deallocating tmpCUDA "//errorMessage
1200 0 : stop 1
1201 : endif
1202 : endif
1203 :
1204 : else ! useGPU
1205 :
1206 176112 : allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage)
1207 176112 : if (istat .ne. 0) then
1208 : print *,"bandred_&
1209 : &MATH_DATATYPE&
1210 0 : &: error when allocating tmpCPU "//errorMessage
1211 0 : stop 1
1212 : endif
1213 :
1214 : #ifdef WITH_MPI
1215 117408 : if (wantDebug) call obj%timer%start("mpi_communication")
1216 : call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_MATH_DATATYPE_PRECISION, &
1217 117408 : MPI_SUM, mpi_comm_rows, mpierr)
1218 117408 : umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
1219 117408 : if (wantDebug) call obj%timer%stop("mpi_communication")
1220 : #else /* WITH_MPI */
1221 : ! tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
1222 : #endif /* WITH_MPI */
1223 :
1224 176112 : deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
1225 176112 : if (istat .ne. 0) then
1226 : print *,"bandred_&
1227 : &MATH_DATATYPE&
1228 0 : &: error when deallocating tmpCPU "//errorMessage
1229 0 : stop 1
1230 : endif
1231 : endif ! useGPU
1232 : endif ! l_cols > 0
1233 :
1234 : ! U = U * Tmat**T
1235 :
1236 176112 : if (useGPU) then
1237 : successCUDA = cuda_memcpy(umc_dev, &
1238 : loc(umcCUDA(1)), &
1239 0 : umc_size*size_of_datatype, cudaMemcpyHostToDevice)
1240 0 : if (.not.(successCUDA)) then
1241 : print *,"bandred_&
1242 : &MATH_DATATYPE&
1243 0 : &: error in cudaMemcpy umc_dev 5"
1244 0 : stop 1
1245 : endif
1246 0 : successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
1247 0 : if (.not.(successCUDA)) then
1248 : print *,"bandred_&
1249 : &MATH_DATATYPE&
1250 0 : &: error in cudaMemcpy tmat_dev 2"
1251 0 : stop 1
1252 : endif
1253 :
1254 0 : call obj%timer%start("cublas")
1255 : call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
1256 0 : l_cols, n_cols, ONE, tmat_dev, nbw, umc_dev, cur_l_cols)
1257 0 : call obj%timer%stop("cublas")
1258 :
1259 : ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
1260 0 : successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
1261 0 : if (.not.(successCUDA)) then
1262 : print *,"bandred_&
1263 : &MATH_DATATYPE&
1264 0 : &: error in cudaMemcpy vav_dev 2"
1265 0 : stop 1
1266 : endif
1267 0 : call obj%timer%start("cublas")
1268 :
1269 : call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
1270 : n_cols, n_cols, l_cols, ONE, umc_dev, cur_l_cols, &
1271 : (umc_dev+(cur_l_cols * n_cols )*size_of_datatype),cur_l_cols, &
1272 0 : ZERO, vav_dev, nbw)
1273 :
1274 : call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
1275 0 : n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
1276 0 : call obj%timer%stop("cublas")
1277 :
1278 0 : successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
1279 0 : if (.not.(successCUDA)) then
1280 : print *,"bandred_&
1281 : &MATH_DATATYPE&
1282 0 : &: error in cudaMemcpy vav_dev3"
1283 0 : stop 1
1284 : endif
1285 : else ! useGPU
1286 :
1287 176112 : call obj%timer%start("blas")
1288 :
1289 : call PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
1290 : l_cols,n_cols, ONE, tmat(1,1,istep), ubound(tmat,dim=1), &
1291 176112 : umcCPU, ubound(umcCPU,dim=1))
1292 :
1293 : ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
1294 :
1295 : call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
1296 : n_cols, n_cols, l_cols, ONE, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), &
1297 176112 : ubound(umcCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
1298 :
1299 : call PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
1300 : n_cols, n_cols, ONE, tmat(1,1,istep), &
1301 176112 : ubound(tmat,dim=1), vav, ubound(vav,dim=1))
1302 176112 : call obj%timer%stop("blas")
1303 :
1304 : endif ! useGPU
1305 :
1306 : #if REALCASE == 1
1307 : call symm_matrix_allreduce_&
1308 : #endif
1309 : #if COMPLEXCASE == 1
1310 : call herm_matrix_allreduce_&
1311 : #endif
1312 : &PRECISION &
1313 176112 : (obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
1314 :
1315 176112 : if (useGPU) then
1316 0 : successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
1317 0 : if (.not.(successCUDA)) then
1318 : print *,"bandred_&
1319 : &MATH_DATATYPE&
1320 0 : &: error in cudaMemcpy vav_dev4"
1321 0 : stop 1
1322 : endif
1323 : endif
1324 :
1325 : ! U = U - 0.5 * V * VAV
1326 :
1327 176112 : if (useGPU) then
1328 0 : call obj%timer%start("cublas")
1329 :
1330 : call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
1331 : #if REALCASE == 1
1332 : -0.5_rk, &
1333 : #endif
1334 : #if COMPLEXCASE == 1
1335 : (-0.5_rk, 0.0_rk), &
1336 : #endif
1337 : (umc_dev+(cur_l_cols * n_cols )* &
1338 : size_of_datatype), &
1339 : cur_l_cols, vav_dev,nbw, &
1340 0 : ONE, umc_dev, cur_l_cols)
1341 0 : call obj%timer%stop("cublas")
1342 :
1343 : successCUDA = cuda_memcpy( &
1344 : loc(umcCUDA(1)), &
1345 0 : umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
1346 :
1347 0 : if (.not.(successCUDA)) then
1348 : print *,"bandred_&
1349 : &MATH_DATATYPE&
1350 0 : &: error in cudaMemcpy umc_dev 6"
1351 0 : stop 1
1352 : endif
1353 :
1354 : ! Transpose umc -> umr (stored in vmr, second half)
1355 : call elpa_transpose_vectors_&
1356 : &MATH_DATATYPE&
1357 : &_&
1358 : &PRECISION &
1359 : (obj, umcCUDA, cur_l_cols, mpi_comm_cols, &
1360 : vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1361 0 : 1, istep*nbw, n_cols, nblk)
1362 :
1363 : successCUDA = cuda_memcpy(vmr_dev, &
1364 : loc(vmrCUDA(1)), &
1365 0 : vmr_size*size_of_datatype, cudaMemcpyHostToDevice)
1366 0 : if (.not.(successCUDA)) then
1367 : print *,"bandred_&
1368 : &MATH_DATATYPE&
1369 0 : &: error in cudaMemcpy vmr_dev 5 "
1370 0 : stop 1
1371 : endif
1372 :
1373 : successCUDA = cuda_memcpy(umc_dev, &
1374 : loc(umcCUDA(1)), &
1375 0 : umc_size*size_of_datatype, cudaMemcpyHostToDevice)
1376 0 : if (.not.(successCUDA)) then
1377 : print *,"bandred_&
1378 : &MATH_DATATYPE&
1379 0 : &: error in cudaMemcpy umc_dev 7"
1380 0 : stop 1
1381 : endif
1382 : else ! useGPU
1383 176112 : call obj%timer%start("blas")
1384 : call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, &
1385 : #if REALCASE == 1
1386 : -0.5_rk, &
1387 : #endif
1388 : #if COMPLEXCASE == 1
1389 : (-0.5_rk, 0.0_rk), &
1390 : #endif
1391 : umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, &
1392 176112 : ubound(vav,dim=1), ONE, umcCPU, ubound(umcCPU,dim=1))
1393 :
1394 176112 : call obj%timer%stop("blas")
1395 : ! Transpose umc -> umr (stored in vmr, second half)
1396 : call elpa_transpose_vectors_&
1397 : &MATH_DATATYPE&
1398 : &_&
1399 : &PRECISION &
1400 : (obj, umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
1401 : vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1402 176112 : 1, istep*nbw, n_cols, nblk)
1403 :
1404 : endif ! useGPU
1405 :
1406 :
1407 : ! A = A - V*U**T - U*V**T
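   ! (Editorial note: together with the correction U = U - 0.5*V*VAV applied above, the tile
   !  loop below performs the rank-2k update  A := A - V*U**H - U*V**H  on the tiles on or
   !  above the diagonal, which is, roughly, the two-sided transformation Q**H * A * Q of the
   !  remaining submatrix carried out in a single pass over A.)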
1408 :
1409 : #ifdef WITH_OPENMP
1410 176112 : !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
1411 88056 : n_threads = omp_get_num_threads()
1412 88056 : if (mod(n_threads, 2) == 0) then
1413 0 : n_way = 2
1414 : else
1415 88056 : n_way = 1
1416 : endif
1417 :
1418 88056 : m_way = n_threads / n_way
1419 :
1420 88056 : m_id = mod(omp_get_thread_num(), m_way)
1421 88056 : n_id = omp_get_thread_num() / m_way
1422 :
1423 319080 : do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
1424 142968 : i = ii / tile_size
1425 142968 : lcs = i*l_cols_tile+1
1426 197880 : lce = min(l_cols,(i+1)*l_cols_tile)
1427 197880 : lre = min(l_rows,(i+1)*l_rows_tile)
1428 285936 : if (lce<lcs .or. lre<1) cycle
1429 :
1430 : !Figure out this thread's range
1431 142968 : work_per_thread = lre / m_way
1432 142968 : if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
1433 142968 : mystart = m_id * work_per_thread + 1
1434 142968 : myend = mystart + work_per_thread - 1
1435 142968 : if ( myend > lre ) myend = lre
1436 142968 : if ( myend-mystart+1 < 1) cycle
1437 142968 : call obj%timer%start("blas")
1438 : call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, lce-lcs+1, 2*n_cols, -ONE, &
1439 : vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), &
1440 142968 : ONE, a(mystart,lcs), ubound(a,1))
1441 142968 : call obj%timer%stop("blas")
1442 : enddo
1443 : !$omp end parallel
1444 : !#if COMPLEXCASE == 1
1445 : ! do i=0,(istep*nbw-1)/tile_size
1446 : ! lcs = i*l_cols_tile+1
1447 : ! lce = min(l_cols,(i+1)*l_cols_tile)
1448 : ! lre = min(l_rows,(i+1)*l_rows_tile)
1449 : ! if (lce<lcs .or. lre<1) cycle
1450 : ! call obj%timer%start("blas")
1451 : ! call PRECISION_GEMM('N', 'C', lre,lce-lcs+1, 2*n_cols, -ONE, &
1452 : ! vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
1453 : ! ONE, a(1,lcs), lda)
1454 : ! call obj%timer%stop("blas")
1455 : ! enddo
1456 : !#endif
1457 :
1458 : #else /* WITH_OPENMP */
1459 :
1460 231024 : do i=0,(istep*nbw-1)/tile_size
1461 142968 : lcs = i*l_cols_tile+1
1462 142968 : lce = min(l_cols,(i+1)*l_cols_tile)
1463 142968 : lre = min(l_rows,(i+1)*l_rows_tile)
1464 142968 : if (lce<lcs .or. lre<1) cycle
1465 :
1466 142968 : if (useGPU) then
1467 0 : call obj%timer%start("cublas")
1468 :
1469 : call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
1470 : lre, lce-lcs+1, 2*n_cols, -ONE, &
1471 : vmr_dev, cur_l_rows, (umc_dev +(lcs-1)* &
1472 : size_of_datatype), &
1473 : cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
1474 0 : size_of_datatype), lda)
1475 0 : call obj%timer%stop("cublas")
1476 :
1477 : else ! useGPU
1478 :
1479 142968 : call obj%timer%start("blas")
1480 : call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre,lce-lcs+1, 2*n_cols, -ONE, &
1481 : vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
1482 142968 : ONE, a(1,lcs), lda)
1483 142968 : call obj%timer%stop("blas")
1484 : endif ! useGPU
1485 : enddo ! i=0,(istep*nbw-1)/tile_size
1486 : #endif /* WITH_OPENMP */
1487 :
1488 176112 : if (.not.(useGPU)) then
1489 176112 : if (allocated(vr)) then
1490 176112 : deallocate(vr, stat=istat, errmsg=errorMessage)
1491 176112 : if (istat .ne. 0) then
1492 : print *,"bandred_&
1493 : &MATH_DATATYPE&
1494 0 : &: error when deallocating vr "//errorMessage
1495 0 : stop 1
1496 : endif
1497 : endif
1498 :
1499 176112 : if (allocated(umcCPU)) then
1500 176112 : deallocate(umcCPU, stat=istat, errmsg=errorMessage)
1501 176112 : if (istat .ne. 0) then
1502 : print *,"bandred_&
1503 : &MATH_DATATYPE&
1504 0 : &: error when deallocating umcCPU "//errorMessage
1505 0 : stop 1
1506 : endif
1507 : endif
1508 :
1509 176112 : if (allocated(vmrCPU)) then
1510 176112 : deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
1511 176112 : if (istat .ne. 0) then
1512 : print *,"bandred_&
1513 : &MATH_DATATYPE&
1514 0 : &: error when deallocating vmrCPU "//errorMessage
1515 0 : stop 1
1516 : endif
1517 : endif
1518 : endif !useGPU
1519 :
1520 : enddo ! istep
1521 :
1522 46584 : if (useGPU) then
1523 0 : successCUDA = cuda_free(vav_dev)
1524 0 : if (.not.(successCUDA)) then
1525 : print *,"bandred_&
1526 : &MATH_DATATYPE&
1527 0 : &: error in cudaFree vav_dev 4"
1528 0 : stop 1
1529 : endif
1530 : endif ! useGPU
1531 :
1532 46584 : if (allocated(vr)) then
1533 0 : deallocate(vr, stat=istat, errmsg=errorMessage)
1534 0 : if (istat .ne. 0) then
1535 : print *,"bandred_&
1536 : &MATH_DATATYPE&
1537 0 : &: error when deallocating vr "//errorMessage
1538 0 : stop 1
1539 : endif
1540 : endif
1541 :
1542 46584 : if (allocated(umcCPU)) then
1543 0 : deallocate(umcCPU, stat=istat, errmsg=errorMessage)
1544 0 : if (istat .ne. 0) then
1545 : print *,"bandred_&
1546 : &MATH_DATATYPE&
1547 0 : &: error when deallocating umcCPU "//errorMessage
1548 0 : stop 1
1549 : endif
1550 : endif
1551 :
1552 46584 : if (allocated(vmrCPU)) then
1553 0 : deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
1554 0 : if (istat .ne. 0) then
1555 : print *,"bandred_&
1556 : &MATH_DATATYPE&
1557 0 : &: error when deallocating vmrCPU "//errorMessage
1558 0 : stop 1
1559 : endif
1560 : endif
1561 :
1562 : !#if COMPLEXCASE == 1
1563 : ! ! check this
1564 : ! if (useGPU) then
1565 : ! successCUDA = cuda_free(umc_dev)
1566 : ! if (.not.(successCUDA)) then
1567 : ! print *,"bandred_complex: error in cudaFree umc_dev 7a"
1568 : ! stop
1569 : ! endif
1570 : ! endif
1571 : !#endif
1572 :
1573 46584 : if (useGPU) then
1574 0 : successCUDA = cuda_free(vmr_dev)
1575 0 : if (.not.(successCUDA)) then
1576 : print *,"bandred_&
1577 : &MATH_DATATYPE&
1578 0 : &: error in cudaFree vmr_dev 6"
1579 0 : stop 1
1580 : endif
1581 :
1582 0 : successCUDA = cuda_free(umc_dev)
1583 0 : if (.not.(successCUDA)) then
1584 : print *,"bandred_&
1585 : &MATH_DATATYPE&
1586 0 : &: error in cudaFree umc_dev 8"
1587 0 : stop
1588 : endif
1589 :
1590 0 : if (allocated(umcCUDA)) then
1591 0 : deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
1592 0 : if (istat .ne. 0) then
1593 : print *,"bandred_&
1594 : &MATH_DATATYPE&
1595 0 : &: error when deallocating umcCUDA "//errorMessage
1596 0 : stop 1
1597 : endif
1598 : endif
1599 0 : if (allocated(vmrCUDA)) then
1600 0 : deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
1601 0 : if (istat .ne. 0) then
1602 : print *,"bandred_&
1603 : &MATH_DATATYPE&
1604 0 : &: error when deallocating vmrCUDA "//errorMessage
1605 0 : stop 1
1606 : endif
1607 : endif
1608 : endif ! useGPU
1609 :
1610 : #if REALCASE == 1
1611 28152 : if (useQR) then
1612 0 : if (which_qr_decomposition == 1) then
1613 0 : deallocate(work_blocked, stat=istat, errmsg=errorMessage)
1614 0 : if (istat .ne. 0) then
1615 0 : print *,"bandred_real: error when deallocating work_blocked "//errorMessage
1616 0 : stop 1
1617 : endif
1618 :
1619 0 : deallocate(tauvector, stat=istat, errmsg=errorMessage)
1620 0 : if (istat .ne. 0) then
1621 0 : print *,"bandred_real: error when deallocating tauvector "//errorMessage
1622 0 : stop 1
1623 : endif
1624 : endif
1625 : endif
1626 : #endif
1627 :
1628 : call obj%timer%stop("bandred_&
1629 : &MATH_DATATYPE&
1630 : &" // &
1631 : &PRECISION_SUFFIX &
1632 46584 : )
1633 :
1634 : end subroutine bandred_&
1635 : &MATH_DATATYPE&
1636 : &_&
1637 139752 : &PRECISION
1638 : #if REALCASE == 1
1639 : ! slower for gpu on 10000 10000 ???
1640 : #endif
1641 :
|