Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen ,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : ! This particular source code file contains additions, changes and
20 : ! enhancements authored by Intel Corporation which is not part of
21 : ! the ELPA consortium.
22 : !
23 : ! More information can be found here:
24 : ! http://elpa.mpcdf.mpg.de/
25 : !
26 : ! ELPA is free software: you can redistribute it and/or modify
27 : ! it under the terms of the version 3 of the license of the
28 : ! GNU Lesser General Public License as published by the Free
29 : ! Software Foundation.
30 : !
31 : ! ELPA is distributed in the hope that it will be useful,
32 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
33 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 : ! GNU Lesser General Public License for more details.
35 : !
36 : ! You should have received a copy of the GNU Lesser General Public License
37 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38 : !
39 : ! ELPA reflects a substantial effort on the part of the original
40 : ! ELPA consortium, and we ask you to respect the spirit of the
41 : ! license that we chose: i.e., please contribute any changes you
42 : ! may have back to the original ELPA library distribution, and keep
43 : ! any derivatives of ELPA under the same license that we chose for
44 : ! the original distribution, the GNU Lesser General Public License.
45 : !
46 : !
47 : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48 : !
49 : ! Copyright of the original code rests with the authors inside the ELPA
50 : ! consortium. The copyright of any additional modifications shall rest
51 : ! with their original authors, but shall adhere to the licensing terms
52 : ! distributed along with the original code in the file "COPYING".
53 : #endif
54 :
55 : #include "../general/sanity.F90"
56 :
57 : !> \brief Transforms the eigenvectors of a tridiagonal matrix back
58 : !> to the eigenvectors of the original matrix
59 : !> (like Scalapack Routine PDORMTR)
60 : !>
61 : ! Parameters
62 : !
63 : !> \param na Order of matrix a_mat, number of rows of matrix q_mat
64 : !>
65 : !> \param nqc Number of columns of matrix q_mat
66 : !>
67 : !> \param a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
68 : !> Distribution is like in Scalapack.
69 : !>
70 : !> \param lda Leading dimension of a_mat
71 : !>
72 : !> \param tau(na) Factors of the Householder vectors
73 : !>
74 : !> \param q_mat On input: Eigenvectors of tridiagonal matrix
75 : !> On output: Transformed eigenvectors
76 : !> Distribution is like in Scalapack.
77 : !>
78 : !> \param ldq Leading dimension of q_mat
79 : !>
80 : !> \param nblk blocksize of cyclic distribution, must be the same in both directions!
81 : !>
82 : !> \param matrixCols local columns of matrix a_mat and q_mat
83 : !>
84 : !> \param mpi_comm_rows MPI-Communicator for rows
85 : !>
86 : !> \param mpi_comm_cols MPI-Communicator for columns
87 : !>
88 : !> \param useGPU If true, GPU version of the subroutine will be used
89 : !>
90 :
91 : subroutine trans_ev_&
92 : &MATH_DATATYPE&
93 : &_&
94 8064 : &PRECISION &
95 8064 : (obj, na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
96 : use cuda_functions
97 : use iso_c_binding
98 : use precision
99 : use elpa_abstract_impl
100 : implicit none
101 : class(elpa_abstract_impl_t), intent(inout) :: obj
102 : integer(kind=ik), intent(in) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
103 : #if REALCASE == 1
104 : real(kind=REAL_DATATYPE), intent(in) :: tau(na)
105 : #endif
106 : #if COMPLEXCASE == 1
107 : complex(kind=COMPLEX_DATATYPE), intent(in) :: tau(na)
108 : #endif
109 :
110 : #if REALCASE == 1
111 : #ifdef USE_ASSUMED_SIZE
112 : real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
113 : #else
114 : real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
115 : #endif
116 : #endif
117 : #if COMPLEXCASE == 1
118 : #ifdef USE_ASSUMED_SIZE
119 : complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
120 : #else
121 : complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
122 : #endif
123 : #endif
124 : logical, intent(in) :: useGPU
125 :
126 : integer(kind=ik) :: max_stored_rows
127 :
128 : #if REALCASE == 1
129 : #ifdef DOUBLE_PRECISION_REAL
130 : real(kind=rk8), parameter :: ZERO = 0.0_rk8, ONE = 1.0_rk8
131 : #else
132 : real(kind=rk4), parameter :: ZERO = 0.0_rk4, ONE = 1.0_rk4
133 : #endif
134 : #endif
135 : #if COMPLEXCASE == 1
136 : #ifdef DOUBLE_PRECISION_COMPLEX
137 : complex(kind=ck8), parameter :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
138 : #else
139 : complex(kind=ck4), parameter :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
140 : #endif
141 : #endif
142 : integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
143 : integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
144 : integer(kind=ik) :: l_cols, l_rows, l_colh, nstor
145 : integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol
146 : integer(kind=ik) :: hvn_ubnd, hvm_ubnd
147 :
148 : #if REALCASE == 1
149 8064 : real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
150 8064 : real(kind=REAL_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
151 : #endif
152 : #if COMPLEXCASE == 1
153 8064 : complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
154 8064 : complex(kind=COMPLEX_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
155 : #endif
156 : integer(kind=ik) :: istat
157 : character(200) :: errorMessage
158 :
159 : integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
160 : logical :: successCUDA
161 : integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
162 : &PRECISION&
163 : &_&
164 : &MATH_DATATYPE
165 :
166 : call obj%timer%start("trans_ev_&
167 : &MATH_DATATYPE&
168 : &" // &
169 : &PRECISION_SUFFIX &
170 8064 : )
171 :
172 8064 : call obj%timer%start("mpi_communication")
173 8064 : call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
174 8064 : call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
175 8064 : call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
176 8064 : call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
177 8064 : call obj%timer%stop("mpi_communication")
178 :
179 8064 : totalblocks = (na-1)/nblk + 1
180 8064 : max_blocks_row = (totalblocks-1)/np_rows + 1
181 8064 : max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat!
182 :
183 8064 : max_local_rows = max_blocks_row*nblk
184 8064 : max_local_cols = max_blocks_col*nblk
185 :
186 8064 : max_stored_rows = (63/nblk+1)*nblk
187 :
188 8064 : allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
189 : call check_alloc("trans_ev_&
190 : &MATH_DATATYPE&
191 8064 : &", "tmat", istat, errorMessage)
192 :
193 8064 : allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
194 : call check_alloc("trans_ev_&
195 : &MATH_DATATYPE&
196 8064 : &", "h1", istat, errorMessage)
197 :
198 8064 : allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
199 : call check_alloc("trans_ev_&
200 : &MATH_DATATYPE&
201 8064 : &", "h2", istat, errorMessage)
202 :
203 8064 : allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
204 : call check_alloc("trans_ev_&
205 : &MATH_DATATYPE&
206 8064 : &", "tmp1", istat, errorMessage)
207 :
208 8064 : allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
209 : call check_alloc("trans_ev_&
210 : &MATH_DATATYPE&
211 8064 : &", "tmp2", istat, errorMessage)
212 :
213 8064 : allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
214 : call check_alloc("trans_ev_&
215 : &MATH_DATATYPE&
216 8064 : &", "hvn", istat, errorMessage)
217 :
218 8064 : allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
219 : call check_alloc("trans_ev_&
220 : &MATH_DATATYPE&
221 8064 : &", "hvm", istat, errorMessage)
222 :
223 8064 : hvm = 0 ! Must be set to 0 !!!
224 8064 : hvb = 0 ! Safety only
225 :
226 8064 : l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat
227 :
228 8064 : nstor = 0
229 8064 : if (useGPU) then
230 0 : hvn_ubnd = 0
231 : endif
232 :
233 : #if COMPLEXCASE == 1
234 : ! In the complex case tau(2) /= 0
235 4032 : if (my_prow == prow(1, nblk, np_rows)) then
236 2688 : q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2))
237 : endif
238 : #endif
239 :
240 8064 : if (useGPU) then
241 : ! todo: this is used only for copying hmv to device.. it should be possible to go without it
242 0 : allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
243 : call check_alloc("trans_ev_&
244 : &MATH_DATATYPE&
245 0 : &", "hvm1", istat, errorMessage)
246 :
247 0 : successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
248 0 : check_alloc_cuda("trans_ev", successCUDA)
249 :
250 0 : successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype)
251 0 : check_alloc_cuda("trans_ev", successCUDA)
252 :
253 0 : successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype)
254 0 : check_alloc_cuda("trans_ev", successCUDA)
255 :
256 0 : successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_datatype)
257 0 : check_alloc_cuda("trans_ev", successCUDA)
258 :
259 : ! q_dev = q_mat
260 0 : successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
261 0 : check_memcpy_cuda("trans_ev", successCUDA)
262 : endif ! useGPU
263 :
264 165024 : do istep = 1, na, nblk
265 156960 : ics = MAX(istep,3)
266 156960 : ice = MIN(istep+nblk-1,na)
267 156960 : if (ice<ics) cycle
268 :
269 156960 : cur_pcol = pcol(istep, nblk, np_cols)
270 :
271 156960 : nb = 0
272 2574432 : do ic = ics, ice
273 :
274 2417472 : l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder Vector
275 2417472 : l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector
276 :
277 :
278 2417472 : if (my_pcol == cur_pcol) then
279 2417472 : hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
280 2417472 : if (my_prow == prow(ic-1, nblk, np_rows)) then
281 1611648 : hvb(nb+l_rows) = 1.
282 : endif
283 : endif
284 :
285 2417472 : nb = nb+l_rows
286 : enddo
287 :
288 : #ifdef WITH_MPI
289 104640 : call obj%timer%start("mpi_communication")
290 104640 : if (nb>0) &
291 : call MPI_Bcast(hvb, nb, &
292 : #if REALCASE == 1
293 : &MPI_REAL_PRECISION&
294 : #endif
295 :
296 : #if COMPLEXCASE == 1
297 : &MPI_COMPLEX_PRECISION&
298 : #endif
299 101952 : , cur_pcol, mpi_comm_cols, mpierr)
300 104640 : call obj%timer%stop("mpi_communication")
301 : #endif /* WITH_MPI */
302 :
303 156960 : nb = 0
304 2574432 : do ic = ics, ice
305 2417472 : l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector
306 2417472 : hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
307 2417472 : if (useGPU) then
308 0 : hvm_ubnd = l_rows
309 : endif
310 2417472 : nstor = nstor+1
311 2417472 : nb = nb+l_rows
312 : enddo
313 :
314 : ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough!
315 156960 : if (nstor+nblk > max_stored_rows .or. istep+nblk > na .or. (na/np_rows <= 256 .and. nstor >= 32)) then
316 :
317 : ! Calculate scalar products of stored vectors.
318 : ! This can be done in different ways, we use dsyrk or zherk
319 :
320 56160 : tmat = 0
321 56160 : call obj%timer%start("blas")
322 56160 : if (l_rows>0) &
323 : #if REALCASE == 1
324 : call PRECISION_SYRK('U', 'T', &
325 : #endif
326 : #if COMPLEXCASE == 1
327 : call PRECISION_HERK('U', 'C', &
328 : #endif
329 56160 : nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows)
330 56160 : call obj%timer%stop("blas")
331 56160 : nc = 0
332 2417472 : do n = 1, nstor-1
333 2361312 : h1(nc+1:nc+n) = tmat(1:n,n+1)
334 2361312 : nc = nc+n
335 : enddo
336 : #ifdef WITH_MPI
337 37440 : call obj%timer%start("mpi_communication")
338 37440 : if (nc>0) call mpi_allreduce( h1, h2, nc, &
339 : #if REALCASE == 1
340 : &MPI_REAL_PRECISION&
341 : #endif
342 : #if COMPLEXCASE == 1
343 : &MPI_COMPLEX_PRECISION&
344 : #endif
345 37440 : &, MPI_SUM, mpi_comm_rows, mpierr)
346 37440 : call obj%timer%stop("mpi_communication")
347 : #else /* WITH_MPI */
348 :
349 18720 : if (nc > 0) h2 = h1
350 :
351 : #endif /* WITH_MPI */
352 : ! Calculate triangular matrix T
353 :
354 56160 : nc = 0
355 56160 : tmat(1,1) = tau(ice-nstor+1)
356 2417472 : do n = 1, nstor-1
357 2361312 : call obj%timer%start("blas")
358 : #if REALCASE == 1
359 : call PRECISION_TRMV('L', 'T', 'N', &
360 : #endif
361 : #if COMPLEXCASE == 1
362 : call PRECISION_TRMV('L', 'C', 'N', &
363 : #endif
364 2361312 : n, tmat, max_stored_rows, h2(nc+1), 1)
365 2361312 : call obj%timer%stop("blas")
366 :
367 : tmat(n+1,1:n) = &
368 : #if REALCASE == 1
369 : -h2(nc+1:nc+n) &
370 : #endif
371 : #if COMPLEXCASE == 1
372 : -conjg(h2(nc+1:nc+n)) &
373 : #endif
374 2361312 : *tau(ice-nstor+n+1)
375 :
376 2361312 : tmat(n+1,n+1) = tau(ice-nstor+n+1)
377 2361312 : nc = nc+n
378 : enddo
379 :
380 56160 : if (useGPU) then
381 : ! todo: is this reshape really neccessary?
382 0 : hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
383 :
384 : !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
385 : successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
386 0 : hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
387 :
388 0 : check_memcpy_cuda("trans_ev", successCUDA)
389 :
390 : !tmat_dev = tmat
391 : successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
392 0 : max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
393 0 : check_memcpy_cuda("trans_ev", successCUDA)
394 : endif
395 :
396 : ! Q = Q - V * T * V**T * Q
397 :
398 56160 : if (l_rows>0) then
399 56160 : if (useGPU) then
400 0 : call obj%timer%start("cublas")
401 : #if REALCASE == 1
402 : call cublas_PRECISION_GEMM('T', 'N', &
403 : #endif
404 : #if COMPLEXCASE == 1
405 : call cublas_PRECISION_GEMM('C', 'N', &
406 : #endif
407 : nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, &
408 0 : q_dev, ldq, ZERO, tmp_dev, nstor)
409 0 : call obj%timer%stop("cublas")
410 :
411 : else ! useGPU
412 :
413 56160 : call obj%timer%start("blas")
414 : #if REALCASE == 1
415 : call PRECISION_GEMM('T', 'N', &
416 : #endif
417 : #if COMPLEXCASE == 1
418 : call PRECISION_GEMM('C', 'N', &
419 : #endif
420 : nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
421 56160 : q_mat, ldq, ZERO, tmp1, nstor)
422 56160 : call obj%timer%stop("blas")
423 : endif ! useGPU
424 :
425 : else !l_rows>0
426 :
427 0 : if (useGPU) then
428 0 : successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype)
429 0 : check_memcpy_cuda("trans_ev", successCUDA)
430 : else
431 0 : tmp1(1:l_cols*nstor) = 0
432 : endif
433 : endif !l_rows>0
434 :
435 : #ifdef WITH_MPI
436 : ! In the legacy GPU version, this allreduce was ommited. But probably it has to be done for GPU + MPI
437 : ! todo: does it need to be copied whole? Wouldn't be a part sufficient?
438 37440 : if (useGPU) then
439 : successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
440 0 : max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
441 0 : check_memcpy_cuda("trans_ev", successCUDA)
442 : endif
443 37440 : call obj%timer%start("mpi_communication")
444 : call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
445 : #if REALCASE == 1
446 : &MPI_REAL_PRECISION&
447 : #endif
448 : #if COMPLEXCASE == 1
449 : &MPI_COMPLEX_PRECISION&
450 : #endif
451 37440 : &, MPI_SUM, mpi_comm_rows, mpierr)
452 37440 : call obj%timer%stop("mpi_communication")
453 : ! copy back tmp2 - after reduction...
454 37440 : if (useGPU) then
455 : successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
456 0 : max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
457 0 : check_memcpy_cuda("trans_ev", successCUDA)
458 : endif ! useGPU
459 :
460 :
461 : #else /* WITH_MPI */
462 : ! tmp2 = tmp1
463 : #endif /* WITH_MPI */
464 :
465 56160 : if (l_rows>0) then
466 56160 : if (useGPU) then
467 0 : call obj%timer%start("cublas")
468 : call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', &
469 : nstor, l_cols, ONE, tmat_dev, max_stored_rows, &
470 0 : tmp_dev, nstor)
471 :
472 : call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, &
473 : -ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor, &
474 0 : ONE, q_dev, ldq)
475 0 : call obj%timer%stop("cublas")
476 : else !useGPU
477 : #ifdef WITH_MPI
478 : ! tmp2 = tmat * tmp2
479 37440 : call obj%timer%start("blas")
480 : call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
481 37440 : ONE, tmat, max_stored_rows, tmp2, nstor)
482 : !q_mat = q_mat - hvm*tmp2
483 : call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
484 37440 : -ONE, hvm, ubound(hvm,dim=1), tmp2, nstor, ONE, q_mat, ldq)
485 37440 : call obj%timer%stop("blas")
486 : #else /* WITH_MPI */
487 18720 : call obj%timer%start("blas")
488 :
489 : call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
490 18720 : ONE, tmat, max_stored_rows, tmp1, nstor)
491 : call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
492 18720 : -ONE, hvm, ubound(hvm,dim=1), tmp1, nstor, ONE, q_mat, ldq)
493 18720 : call obj%timer%stop("blas")
494 : #endif /* WITH_MPI */
495 : endif ! useGPU
496 : endif ! l_rows>0
497 56160 : nstor = 0
498 : endif ! (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32))
499 :
500 : enddo ! istep=1,na,nblk
501 :
502 8064 : deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
503 8064 : if (istat .ne. 0) then
504 : print *,"trans_ev_&
505 : &MATH_DATATYPE&
506 0 : &: error when deallocating hvm "//errorMessage
507 0 : stop 1
508 : endif
509 :
510 8064 : if (useGPU) then
511 : !q_mat = q_dev
512 0 : successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
513 0 : check_memcpy_cuda("trans_ev", successCUDA)
514 :
515 0 : deallocate(hvm1, stat=istat, errmsg=errorMessage)
516 0 : if (istat .ne. 0) then
517 : print *,"trans_ev_&
518 : &MATH_DATATYPE&
519 0 : &: error when deallocating hvm1 "//errorMessage
520 0 : stop 1
521 : endif
522 :
523 : !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
524 0 : successCUDA = cuda_free(q_dev)
525 0 : check_dealloc_cuda("trans_ev", successCUDA)
526 :
527 0 : successCUDA = cuda_free(tmp_dev)
528 0 : check_dealloc_cuda("trans_ev", successCUDA)
529 :
530 0 : successCUDA = cuda_free(hvm_dev)
531 0 : check_dealloc_cuda("trans_ev", successCUDA)
532 :
533 0 : successCUDA = cuda_free(tmat_dev)
534 0 : check_dealloc_cuda("trans_ev", successCUDA)
535 :
536 : endif
537 :
538 : call obj%timer%stop("trans_ev_&
539 : &MATH_DATATYPE&
540 : &" // &
541 : &PRECISION_SUFFIX&
542 8064 : )
543 :
544 : end subroutine trans_ev_&
545 : &MATH_DATATYPE&
546 : &_&
547 8064 : &PRECISION
|