Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen ,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : ! This particular source code file contains additions, changes and
20 : ! enhancements authored by Intel Corporation which is not part of
21 : ! the ELPA consortium.
22 : !
23 : ! More information can be found here:
24 : ! http://elpa.mpcdf.mpg.de/
25 : !
26 : ! ELPA is free software: you can redistribute it and/or modify
27 : ! it under the terms of the version 3 of the license of the
28 : ! GNU Lesser General Public License as published by the Free
29 : ! Software Foundation.
30 : !
31 : ! ELPA is distributed in the hope that it will be useful,
32 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
33 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 : ! GNU Lesser General Public License for more details.
35 : !
36 : ! You should have received a copy of the GNU Lesser General Public License
37 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38 : !
39 : ! ELPA reflects a substantial effort on the part of the original
40 : ! ELPA consortium, and we ask you to respect the spirit of the
41 : ! license that we chose: i.e., please contribute any changes you
42 : ! may have back to the original ELPA library distribution, and keep
43 : ! any derivatives of ELPA under the same license that we chose for
44 : ! the original distribution, the GNU Lesser General Public License.
45 : !
46 : !
47 : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48 : !
49 : ! Copyright of the original code rests with the authors inside the ELPA
50 : ! consortium. The copyright of any additional modifications shall rest
51 : ! with their original authors, but shall adhere to the licensing terms
52 : ! distributed along with the original code in the file "COPYING".
53 : #endif
54 :
55 : #include "../general/sanity.F90"
56 :
57 : #undef SAVE_MATR
58 : #ifdef DOUBLE_PRECISION_REAL
59 : #define SAVE_MATR(name, iteration) \
60 : call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration)
61 : #else
62 : #define SAVE_MATR(name, iteration)
63 : #endif
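     : ! SAVE_MATR dumps the distributed matrix via prmat for debugging; it expands to an
     : ! actual call only in DOUBLE_PRECISION_REAL builds and to nothing otherwise.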
64 :
65 : !> \brief Reduces a distributed symmetric matrix to tridiagonal form (like the Scalapack routine PDSYTRD)
66 : !>
67 : ! Parameters
68 : !
69 : !> \param obj object of elpa_type
70 : !> \param na Order of matrix
71 : !>
72 : !> \param a_mat(lda,matrixCols) Distributed matrix which should be reduced.
73 : !> Distribution is like in Scalapack.
74 : !> As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
75 : !> a(:,:) is overwritten on exit with the Householder vectors
76 : !>
77 : !> \param lda Leading dimension of a
78 : !>
79 : !> \param nblk blocksize of cyclic distribution, must be the same in both directions!
80 : !>
81 : !> \param matrixCols local columns of matrix
82 : !>
83 : !> \param mpi_comm_rows MPI-Communicator for rows
84 : !> \param mpi_comm_cols MPI-Communicator for columns
85 : !>
86 : !> \param d_vec(na) Diagonal elements (returned), identical on all processors
87 : !>
88 : !> \param e_vec(na) Off-Diagonal elements (returned), identical on all processors
89 : !>
90 : !> \param tau(na) Factors for the Householder vectors (returned), needed for back transformation
91 : !>
92 : !> \param useGPU If true, GPU version of the subroutine will be used
93 : !> \param wantDebug if true, more debug information is produced
94 : !>
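     : ! A sketch of the generated interface (illustrative; names depend on the preprocessing):
     : ! with MATH_DATATYPE=real and PRECISION=double this template expands to
     : ! tridiag_real_double, which the ELPA1 driver calls roughly as
     : !   call tridiag_real_double(obj, na, a, lda, nblk, matrixCols, &
     : !                            mpi_comm_rows, mpi_comm_cols, d, e, tau, &
     : !                            useGPU, wantDebug)
     : ! where a is the lda x matrixCols local part of the distributed matrix and
     : ! d, e, tau are real arrays of length na.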
95 : subroutine tridiag_&
96 : &MATH_DATATYPE&
97 : &_&
98 9216 : &PRECISION &
99 9216 : (obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug)
100 : use cuda_functions
101 : use iso_c_binding
102 : use precision
103 : use elpa_abstract_impl
104 : use matrix_plot
105 : implicit none
106 : #include "../general/precision_kinds.F90"
107 : class(elpa_abstract_impl_t), intent(inout) :: obj
108 : integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
109 : logical, intent(in) :: useGPU, wantDebug
110 :
111 : #if REALCASE == 1
112 : real(kind=REAL_DATATYPE), intent(out) :: tau(na)
113 : #endif
114 : #if COMPLEXCASE == 1
115 : complex(kind=COMPLEX_DATATYPE), intent(out) :: tau(na)
116 : #endif
117 : #if REALCASE == 1
118 : #ifdef USE_ASSUMED_SIZE
119 : real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,*)
120 : #else
121 : real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
122 : #endif
123 : #endif
124 : #if COMPLEXCASE == 1
125 : #ifdef USE_ASSUMED_SIZE
126 : complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
127 : #else
128 : complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
129 : #endif
130 : #endif
131 : real(kind=REAL_DATATYPE), intent(out) :: d_vec(na), e_vec(na)
132 :
133 : integer(kind=ik), parameter :: max_stored_uv = 32
134 : logical, parameter :: mat_vec_as_one_block = .true.
135 :
136 : ! id in processor row and column and total numbers of processor rows and columns
137 : integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
138 : integer(kind=ik) :: mpierr
139 : integer(kind=ik) :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
140 : max_local_cols
141 : ! updated after each istep (in the main cycle) to contain number of
142 : ! local columns and rows of the remaining part of the matrix
143 : !integer(kind=ik) :: l_cols, l_rows
144 : integer(kind=ik) :: l_cols, l_rows
145 : integer(kind=ik) :: n_stored_vecs
146 :
147 : integer(kind=C_intptr_T) :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
148 : uv_stored_cols_dev
149 : logical :: successCUDA
150 :
151 : integer(kind=ik) :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
152 : integer(kind=ik) :: tile_size, l_rows_per_tile, l_cols_per_tile
153 : integer(kind=c_intptr_t) :: a_offset
154 :
155 : #ifdef WITH_OPENMP
156 : integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter
157 : integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
158 : #endif
159 :
160 : real(kind=REAL_DATATYPE) :: vnorm2
161 : #if REALCASE == 1
162 : real(kind=REAL_DATATYPE) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
163 : #endif
164 : #if COMPLEXCASE == 1
165 : complex(kind=COMPLEX_DATATYPE) :: vav, xc, aux(2*max_stored_uv), aux1(2), aux2(2), aux3(1), vrl, xf
166 : #endif
167 :
168 : #if REALCASE == 1
169 4752 : real(kind=REAL_DATATYPE), allocatable :: tmp(:), &
170 4752 : v_row(:), & ! used to store calculated Householder Vector
171 4752 : v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
172 4752 : u_row(:), &
173 4752 : u_col(:)
174 : #endif
175 : #if COMPLEXCASE == 1
176 13392 : complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
177 : #endif
178 : ! the following two matrices store pairs of vectors v and u calculated in each step
179 : ! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
180 : ! u and v are stored both in row form and in column form
181 : ! pattern: v1,u1,v2,u2,v3,u3,....
182 : ! todo: It is a little bit confusing, I think, that the _row variables actually store columns and vice versa
183 : #if REALCASE == 1
184 4752 : real(kind=REAL_DATATYPE), allocatable :: vu_stored_rows(:,:)
185 : ! pattern: u1,v1,u2,v2,u3,v3,....
186 4752 : real(kind=REAL_DATATYPE), allocatable :: uv_stored_cols(:,:)
187 : #endif
188 : #if COMPLEXCASE == 1
189 8928 : complex(kind=COMPLEX_DATATYPE), allocatable :: vu_stored_rows(:,:), uv_stored_cols(:,:)
190 : #endif
191 :
192 : #ifdef WITH_OPENMP
193 : #if REALCASE == 1
194 2376 : real(kind=REAL_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:)
195 : #endif
196 : #if COMPLEXCASE == 1
197 2232 : complex(kind=COMPLEX_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:)
198 : #endif
199 : #endif
200 :
201 : #if COMPLEXCASE == 1
202 4464 : real(kind=REAL_DATATYPE), allocatable :: tmp_real(:)
203 : #endif
204 : integer(kind=ik) :: istat
205 : character(200) :: errorMessage
206 : integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
207 : &PRECISION&
208 : &_&
209 : &MATH_DATATYPE
210 : call obj%timer%start("tridiag_&
211 : &MATH_DATATYPE&
212 : &" // &
213 : PRECISION_SUFFIX &
214 9216 : )
215 :
216 :
217 9216 : if (wantDebug) call obj%timer%start("mpi_communication")
218 9216 : call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
219 9216 : call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
220 9216 : call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
221 9216 : call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
222 9216 : if (wantDebug) call obj%timer%stop("mpi_communication")
223 :
224 : ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
225 : ! a tile is a square submatrix consisting of several blocks;
226 : ! it is the smallest possible square submatrix in which the blocks distributed among the
227 : ! processors are "aligned" in both rows and columns
228 : ! -----------------
229 : ! | 1 4 | 1 4 | 1 4 | ...
230 : ! | 2 5 | 2 5 | 2 5 | ...
231 : ! | 3 6 | 3 6 | 3 6 | ...
232 : ! ----------------- ...
233 : ! | 1 4 | 1 4 | 1 4 | ...
234 : ! | 2 5 | 2 5 | 2 5 | ...
235 : ! | 3 6 | 3 6 | 3 6 | ...
236 : ! ----------------- .
237 : ! : : : : : : .
238 : ! : : : : : : .
239 : !
240 : ! this is a tile, where each number represents a block assigned to the processor with the shown number
241 : ! size of this small block is nblk
242 : ! The picture shows the situation with 6 processors: 3 process rows and 2 process columns
243 : ! tile_size is thus nblk * 6
244 : !
245 9216 : tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
246 9216 : tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
247 :
248 9216 : l_rows_per_tile = tile_size/np_rows ! local rows of a tile
249 9216 : l_cols_per_tile = tile_size/np_cols ! local cols of a tile
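     : ! Worked example (illustrative values): nblk = 64 on a 3 x 2 process grid
     : ! (np_rows = 3, np_cols = 2) gives
     : !   tile_size       = 64 * lcm(3,2)                 = 384
     : !   tile_size       = ((128*3 - 1)/384 + 1) * 384   = 384   (local tiles already >= 128 wide)
     : !   l_rows_per_tile = 384/3 = 128 ,  l_cols_per_tile = 384/2 = 192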
250 :
251 9216 : totalblocks = (na-1)/nblk + 1
252 9216 : max_loc_block_rows = (totalblocks-1)/np_rows + 1
253 9216 : max_loc_block_cols = (totalblocks-1)/np_cols + 1
254 :
255 : ! the locally owned submatrix has size at most max_local_rows x max_local_cols on each processor
256 9216 : max_local_rows = max_loc_block_rows*nblk
257 9216 : max_local_cols = max_loc_block_cols*nblk
258 :
259 : ! allocate memory for vectors
260 : ! todo: It is a little bit confusing, I think, that the _row variables actually store columns and vice versa
261 : ! todo: if something has length max_local_rows, it is actually a column, no?
262 : ! todo: probably one should read it as v_row = Vector v distributed among rows
263 : !
264 9216 : allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
265 : call check_alloc("tridiag_&
266 9216 : &MATH_DATATYPE ", "tmp", istat, errorMessage)
267 :
268 : ! allocate v_row one element longer, to allow storing and broadcasting tau together with it
269 9216 : allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
270 : call check_alloc("tridiag_&
271 9216 : &MATH_DATATYPE ", "v_row", istat, errorMessage)
272 :
273 9216 : allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
274 : call check_alloc("tridiag_&
275 9216 : &MATH_DATATYPE ", "u_row", istat, errorMessage)
276 :
277 9216 : allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
278 : call check_alloc("tridiag_&
279 9216 : &MATH_DATATYPE ", "v_col", istat, errorMessage)
280 :
281 9216 : allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
282 : call check_alloc("tridiag_&
283 9216 : &MATH_DATATYPE ", "u_col", istat, errorMessage)
284 :
285 : #ifdef WITH_OPENMP
286 4608 : max_threads = omp_get_max_threads()
287 :
288 4608 : allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
289 : call check_alloc("tridiag_&
290 4608 : &MATH_DATATYPE ", "ur_p", istat, errorMessage)
291 :
292 4608 : allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
293 : call check_alloc("tridiag_&
294 4608 : &MATH_DATATYPE ", "uc_p", istat, errorMessage)
295 : #endif /* WITH_OPENMP */
296 :
297 9216 : tmp = 0
298 9216 : v_row = 0
299 9216 : u_row = 0
300 9216 : v_col = 0
301 9216 : u_col = 0
302 :
303 9216 : allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
304 : call check_alloc("tridiag_&
305 9216 : &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
306 :
307 9216 : allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
308 : call check_alloc("tridiag_&
309 9216 : &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
310 :
311 9216 : if (useGPU) then
312 0 : successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
313 0 : check_alloc_cuda("tridiag: v_row_dev", successCUDA)
314 :
315 0 : successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
316 :
317 0 : check_alloc_cuda("tridiag: u_row_dev", successCUDA)
318 :
319 0 : successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
320 0 : check_alloc_cuda("tridiag: v_col_dev", successCUDA)
321 :
322 0 : successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
323 0 : check_alloc_cuda("tridiag: u_col_dev", successCUDA)
324 :
325 0 : successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
326 0 : check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
327 :
328 0 : successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
329 0 : check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
330 : endif !useGPU
331 :
332 :
333 9216 : d_vec(:) = 0
334 9216 : e_vec(:) = 0
335 9216 : tau(:) = 0
336 :
337 9216 : n_stored_vecs = 0
338 :
339 9216 : l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
340 9216 : l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
341 :
342 9216 : if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
343 : #if COMPLEXCASE == 1
344 2976 : d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
345 : #endif
346 : #if REALCASE == 1
347 3168 : d_vec(na) = a_mat(l_rows,l_cols)
348 : #endif
349 :
350 9216 : if (useGPU) then
351 : ! allocate memory for matrix A on the device and then copy the matrix
352 :
353 0 : successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
354 0 : check_alloc_cuda("tridiag: a_dev", successCUDA)
355 :
356 0 : successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
357 0 : check_memcpy_cuda("tridiag: a_dev", successCUDA)
358 : endif
359 :
360 : ! main cycle of tridiagonalization
361 : ! in each step, 1 Householder Vector is calculated
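     : ! The loop runs from istep = na down to 3; the remaining 2 x 2 block (d_vec(1), e_vec(1)
     : ! and, in the complex case, tau(2)) is handled separately after the loop.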
362 2597184 : do istep = na, 3 ,-1
363 :
364 : ! Calculate number of local rows and columns of the still remaining matrix
365 : ! on the local processor
366 2587968 : l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
367 2587968 : l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
368 :
369 : ! Calculate Vector for Householder transformation on all procs
370 : ! owning column istep
371 :
372 2587968 : if (my_pcol == pcol(istep, nblk, np_cols)) then
373 :
374 : ! Get Vector to be transformed; distribute last element and norm of
375 : ! remaining elements to all procs in current column
376 :
377 : ! copy column l_cols + 1 of A to v_row
378 2587968 : if (useGPU) then
379 0 : a_offset = l_cols * lda * size_of_datatype
380 : ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
381 :
382 0 : successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
383 0 : check_memcpy_cuda("tridiag a_dev 1", successCUDA)
384 : else
385 2587968 : v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
386 : endif
387 :
388 2587968 : if (n_stored_vecs > 0 .and. l_rows > 0) then
389 2457408 : if (wantDebug) call obj%timer%start("blas")
390 : #if COMPLEXCASE == 1
391 1208832 : aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
392 : #endif
393 : call PRECISION_GEMV('N', &
394 : l_rows, 2*n_stored_vecs, &
395 : ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
396 : #if REALCASE == 1
397 : uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
398 : #endif
399 : #if COMPLEXCASE == 1
400 : aux, 1, &
401 :
402 : #endif
403 2457408 : ONE, v_row, 1)
404 2457408 : if (wantDebug) call obj%timer%stop("blas")
405 :
406 : endif
407 :
408 2587968 : if(my_prow == prow(istep-1, nblk, np_rows)) then
409 1725312 : aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
410 1725312 : aux1(2) = v_row(l_rows)
411 : else
412 862656 : aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
413 862656 : aux1(2) = 0.
414 : endif
415 :
416 : #ifdef WITH_MPI
417 1725312 : if (wantDebug) call obj%timer%start("mpi_communication")
418 : call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
419 1725312 : MPI_SUM, mpi_comm_rows, mpierr)
420 1725312 : if (wantDebug) call obj%timer%stop("mpi_communication")
421 : #else /* WITH_MPI */
422 862656 : aux2 = aux1
423 : #endif /* WITH_MPI */
424 :
425 : #if REALCASE == 1
426 1315296 : vnorm2 = aux2(1)
427 : #endif
428 : #if COMPLEXCASE == 1
429 1272672 : vnorm2 = real(aux2(1),kind=rk)
430 : #endif
431 2587968 : vrl = aux2(2)
432 :
433 : ! Householder transformation
434 : #if REALCASE == 1
435 : call hh_transform_real_&
436 : #endif
437 : #if COMPLEXCASE == 1
438 : call hh_transform_complex_&
439 : #endif
440 : &PRECISION &
441 2587968 : (obj, vrl, vnorm2, xf, tau(istep), wantDebug)
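     : ! Sketch of the convention (as used below): hh_transform_* determines the scalars of the
     : ! elementary reflector H = I - tau * v * v**H that maps the current column segment onto a
     : ! multiple of a unit vector; on exit vrl holds the resulting sub-diagonal element (stored
     : ! into e_vec below) and xf is the factor that rescales v_row so that its pivot entry is 1.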
442 : ! Scale v_row and store Householder Vector for back transformation
443 :
444 2587968 : v_row(1:l_rows) = v_row(1:l_rows) * xf
445 2587968 : if (my_prow == prow(istep-1, nblk, np_rows)) then
446 1725312 : v_row(l_rows) = 1.
447 :
448 : ! vrl is newly computed off-diagonal element of the final tridiagonal matrix
449 : #if REALCASE == 1
450 876864 : e_vec(istep-1) = vrl
451 : #endif
452 : #if COMPLEXCASE == 1
453 848448 : e_vec(istep-1) = real(vrl,kind=rk)
454 : #endif
455 : endif
456 :
457 : ! store Householder Vector for back transformation
458 2587968 : a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
459 :
460 : ! add tau after the end of the actual v_row, to be broadcast together with it
461 2587968 : v_row(l_rows+1) = tau(istep)
462 : endif !(my_pcol == pcol(istep, nblk, np_cols))
463 :
464 : ! SAVE_MATR("HH vec stored", na - istep + 1)
465 :
466 : #ifdef WITH_MPI
467 1725312 : if (wantDebug) call obj%timer%start("mpi_communication")
468 : ! Broadcast the Householder Vector (and tau) along columns
469 : call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION, &
470 1725312 : pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
471 1725312 : if (wantDebug) call obj%timer%stop("mpi_communication")
472 : #endif /* WITH_MPI */
473 :
474 : ! recover tau, which has been broadcast together with v_row
475 2587968 : tau(istep) = v_row(l_rows+1)
476 :
477 : ! Transpose Householder Vector v_row -> v_col
478 : call elpa_transpose_vectors_&
479 : &MATH_DATATYPE&
480 : &_&
481 : &PRECISION &
482 : (obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
483 2587968 : 1, istep-1, 1, nblk)
484 :
485 : ! Calculate u = (A + VU**T + UV**T)*v
486 :
487 : ! For cache efficiency, we use only the upper half of the matrix tiles for this,
488 : ! thus the result is partly in u_col(:) and partly in u_row(:)
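     : ! Sketch of the bookkeeping in the loops below: only tiles on or above the diagonal are read.
     : ! The (conjugate-)transposed GEMV accumulates A(tile)**T * v_row into u_col, which by the
     : ! symmetry of A covers the mirrored tile below the diagonal; for strictly upper tiles the
     : ! plain GEMV additionally accumulates A(tile) * v_col into u_row.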
489 :
490 2587968 : u_col(1:l_cols) = 0
491 2587968 : u_row(1:l_rows) = 0
492 2587968 : if (l_rows > 0 .and. l_cols> 0 ) then
493 2541888 : if(useGPU) then
494 0 : successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
495 0 : check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
496 :
497 0 : successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
498 0 : check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
499 :
500 0 : successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
501 :
502 0 : check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
503 :
504 0 : successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
505 0 : check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
506 : endif ! useGPU
507 :
508 : #ifdef WITH_OPENMP
509 1270944 : call obj%timer%start("OpenMP parallel")
510 2541888 : !$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
511 :
512 1270944 : my_thread = omp_get_thread_num()
513 1270944 : n_threads = omp_get_num_threads()
514 :
515 1270944 : n_iter = 0
516 :
517 : ! first, calculate the A*v part of (A + VU**T + UV**T)*v
518 404147376 : uc_p(1:l_cols,my_thread) = 0.
519 269993472 : ur_p(1:l_rows,my_thread) = 0.
520 : #endif /* WITH_OPENMP */
521 8177088 : do i= 0, (istep-2)/tile_size
522 5635200 : l_col_beg = i*l_cols_per_tile+1
523 7181856 : l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
524 5635200 : if (l_col_end < l_col_beg) cycle
525 18463392 : do j = 0, i
526 12828192 : l_row_beg = j*l_rows_per_tile+1
527 17948304 : l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
528 12828192 : if (l_row_end < l_row_beg) cycle
529 : #ifdef WITH_OPENMP
530 6402576 : if (mod(n_iter,n_threads) == my_thread) then
531 6402576 : if (wantDebug) call obj%timer%start("blas")
532 : call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
533 : l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
534 : ONE, a_mat(l_row_beg,l_col_beg), lda, &
535 6402576 : v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)
536 :
537 6402576 : if (i/=j) then
538 : call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
539 : ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1, &
540 :
541 3596496 : ONE, ur_p(l_row_beg,my_thread), 1)
542 : endif
543 6402576 : if (wantDebug) call obj%timer%stop("blas")
544 : endif
545 6402576 : n_iter = n_iter+1
546 : #else /* WITH_OPENMP */
547 :
548 : ! multiplication by blocks is efficient only for CPU
549 : ! for the GPU we introduced 2 other ways, either by stripes (more similar to the original
550 : ! CPU implementation) or by one large matrix-vector multiply
551 6402576 : if (.not. useGPU) then
552 6402576 : if (wantDebug) call obj%timer%start("blas")
553 : call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
554 : l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
555 : ONE, a_mat(l_row_beg, l_col_beg), lda, &
556 : v_row(l_row_beg), 1, &
557 6402576 : ONE, u_col(l_col_beg), 1)
558 :
559 6402576 : if (i/=j) then
560 :
561 : call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
562 : ONE, a_mat(l_row_beg,l_col_beg), lda, &
563 : v_col(l_col_beg), 1, &
564 3596496 : ONE, u_row(l_row_beg), 1)
565 : endif
566 6402576 : if (wantDebug) call obj%timer%stop("blas")
567 : endif ! not useGPU
568 :
569 : #endif /* WITH_OPENMP */
570 : enddo ! j=0,i
571 : enddo ! i=0,(istep-2)/tile_size
572 :
573 2541888 : if (useGPU) then
574 : if(mat_vec_as_one_block) then
575 : ! Unlike the CPU case, we do (per MPI task) just one large matrix-vector multiplication;
576 : ! this requires altering the algorithm when later explicitly updating the matrix
577 : ! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
578 0 : if (wantDebug) call obj%timer%start("cublas")
579 : call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols, &
580 : ONE, a_dev, lda, &
581 : v_row_dev , 1, &
582 0 : ONE, u_col_dev, 1)
583 :
584 : ! todo: try with non transposed!!!
585 : ! if(i/=j) then
586 : ! call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
587 : ! ONE, a_dev + a_offset, lda, &
588 : ! v_col_dev + (l_col_beg - 1) * &
589 : ! size_of_datatype, 1, &
590 : ! ONE, u_row_dev + (l_row_beg - 1) * &
591 : ! size_of_datatype, 1)
592 : ! endif
593 0 : if (wantDebug) call obj%timer%stop("cublas")
594 :
595 : else
596 : ! perform multiplication by stripes - it is faster than by blocks, since we call cublas with
597 : ! larger matrices. In general, however, this algorithm is very similar to the CPU one
598 : do i=0,(istep-2)/tile_size
599 : l_col_beg = i*l_cols_per_tile+1
600 : l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
601 : if(l_col_end<l_col_beg) cycle
602 :
603 : l_row_beg = 1
604 : l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
605 :
606 : a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
607 : size_of_datatype
608 :
609 : call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
610 : l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
611 : ONE, a_dev + a_offset, lda, &
612 : v_row_dev + (l_row_beg - 1) * size_of_datatype, 1, &
613 : ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
614 : enddo
615 :
616 : do i=0,(istep-2)/tile_size
617 : l_col_beg = i*l_cols_per_tile+1
618 : l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
619 : if(l_col_end<l_col_beg) cycle
620 :
621 : l_row_beg = 1
622 : l_row_end = min(l_rows,i*l_rows_per_tile)
623 :
624 : a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
625 : size_of_datatype
626 :
627 : call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
628 : ONE, a_dev + a_offset, lda, &
629 : v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
630 : ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
631 : enddo
632 : end if !multiplication as one block / per stripes
633 :
634 0 : successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
635 0 : check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
636 :
637 0 : successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
638 0 : check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
639 : endif
640 :
641 : ! call PRECISION_SYMV('U', l_cols, &
642 : ! 1.d0, a_mat, ubound(a_mat,1), &
643 : ! v_row, 1, &
644 : ! 0.d0, u_col, 1)
645 :
646 : ! endif ! useGPU
647 :
648 : #ifdef WITH_OPENMP
649 : !$OMP END PARALLEL
650 1270944 : call obj%timer%stop("OpenMP parallel")
651 :
652 2541888 : do i=0,max_threads-1
653 1270944 : u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
654 1270944 : u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
655 : enddo
656 : #endif /* WITH_OPENMP */
657 :
658 : ! second, calculate the (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
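     : ! Sketch: the two GEMVs below apply the delayed rank-2 updates implicitly:
     : !   aux   = vu_stored_rows**T * v_row        (**C in the complex case)
     : !   u_col = u_col + uv_stored_cols * aux
     : ! which adds the (VU**T + UV**T)*v contribution without forming the updated matrix.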
659 2541888 : if (n_stored_vecs > 0) then
660 2457408 : if (wantDebug) call obj%timer%start("blas")
661 : #if REALCASE == 1
662 : call PRECISION_GEMV('T', &
663 : #endif
664 : #if COMPLEXCASE == 1
665 : call PRECISION_GEMV('C', &
666 : #endif
667 : l_rows, 2*n_stored_vecs, &
668 : ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
669 2457408 : v_row, 1, ZERO, aux, 1)
670 :
671 : call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs, &
672 : ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1), &
673 2457408 : aux, 1, ONE, u_col, 1)
674 2457408 : if (wantDebug) call obj%timer%stop("blas")
675 : endif
676 :
677 : endif ! (l_rows>0 .and. l_cols>0)
678 :
679 : ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
680 : ! on the processors containing the diagonal
681 : ! This is only necessary if u_row has been calculated, i.e. if the
682 : ! global tile size is smaller than the global remaining matrix
683 :
684 2587968 : if (tile_size < istep-1) then
685 :
686 : call elpa_reduce_add_vectors_&
687 : &MATH_DATATYPE&
688 : &_&
689 : &PRECISION &
690 : (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
691 1185792 : mpi_comm_cols, istep-1, 1, nblk)
692 :
693 : endif
694 :
695 : ! Sum up all the u_col(:) parts, transpose u_col -> u_row
696 :
697 2587968 : if (l_cols>0) then
698 2587968 : tmp(1:l_cols) = u_col(1:l_cols)
699 : #ifdef WITH_MPI
700 1725312 : if (wantDebug) call obj%timer%start("mpi_communication")
701 : call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION, &
702 1725312 : MPI_SUM, mpi_comm_rows, mpierr)
703 1725312 : if (wantDebug) call obj%timer%stop("mpi_communication")
704 : #else /* WITH_MPI */
705 862656 : u_col = tmp
706 : #endif /* WITH_MPI */
707 : endif
708 :
709 : call elpa_transpose_vectors_&
710 : &MATH_DATATYPE&
711 : &_&
712 : &PRECISION &
713 : (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
714 2587968 : mpi_comm_rows, 1, istep-1, 1, nblk)
715 :
716 : ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
717 : #if REALCASE == 1
718 1315296 : x = 0
719 1315296 : if (l_cols>0) &
720 1315296 : x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
721 : #endif
722 : #if COMPLEXCASE == 1
723 1272672 : xc = 0
724 1272672 : if (l_cols>0) &
725 1272672 : xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
726 : #endif
727 :
728 : #ifdef WITH_MPI
729 1725312 : if (wantDebug) call obj%timer%start("mpi_communication")
730 : #if REALCASE == 1
731 876864 : call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
732 : #endif
733 : #if COMPLEXCASE == 1
734 848448 : call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
735 : #endif
736 1725312 : if (wantDebug) call obj%timer%stop("mpi_communication")
737 : #else /* WITH_MPI */
738 :
739 : #if REALCASE == 1
740 438432 : vav = x
741 : #endif
742 : #if COMPLEXCASE == 1
743 424224 : vav = xc
744 : #endif
745 :
746 : #endif /* WITH_MPI */
747 :
748 : ! store u and v in the matrices U and V
749 : ! these matrices are stored combined in one here
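     : ! Sketch of the update encoded by these two columns (real case shown; the complex case
     : ! uses conjg(tau) and **H): with vav = v**T * u the classical rank-2 update of Householder
     : ! tridiagonalization is
     : !   A := A - v*w**T - w*v**T ,   w = tau*u - 0.5*tau*vav*(tau*v) ,
     : ! so the later GEMM  A := A + vu_stored_rows * uv_stored_cols**T  applies all
     : ! accumulated updates of this form at once.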
750 :
751 540033024 : do j=1,l_rows
752 : #if REALCASE == 1
753 269795232 : vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
754 269795232 : vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
755 : #endif
756 : #if COMPLEXCASE == 1
757 267649824 : vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
758 267649824 : vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
759 : #endif
760 : enddo
761 808755552 : do j=1,l_cols
762 : #if REALCASE == 1
763 404692848 : uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
764 404692848 : uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
765 : #endif
766 : #if COMPLEXCASE == 1
767 401474736 : uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
768 401474736 : uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
769 : #endif
770 : enddo
771 :
772 : ! We have calculated another Householder vector; the number of implicitly stored vector pairs has increased
773 2587968 : n_stored_vecs = n_stored_vecs+1
774 :
775 : ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
776 2587968 : if (n_stored_vecs == max_stored_uv .or. istep == 3) then
777 :
778 84960 : if (useGPU) then
779 : successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
780 : max_local_rows * 2 * max_stored_uv * &
781 0 : size_of_datatype, cudaMemcpyHostToDevice)
782 0 : check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
783 :
784 : successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
785 : max_local_cols * 2 * max_stored_uv * &
786 0 : size_of_datatype, cudaMemcpyHostToDevice)
787 0 : check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
788 : endif
789 :
790 263520 : do i = 0, (istep-2)/tile_size
791 : ! go over tiles above (or on) the diagonal
792 178560 : l_col_beg = i*l_cols_per_tile+1
793 178560 : l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
794 178560 : l_row_beg = 1
795 178560 : l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
796 178560 : if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
797 : cycle
798 :
799 :
800 175008 : if (useGPU) then
801 : if(.not. mat_vec_as_one_block) then
802 : ! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
803 : ! we then use the same calls as in the CPU version
804 : if (wantDebug) call obj%timer%start("cublas")
805 : call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
806 : l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
807 : ONE, vu_stored_rows_dev + (l_row_beg - 1) * &
808 : size_of_datatype, &
809 : max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) * &
810 : size_of_datatype, &
811 : max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * &
812 : size_of_datatype , lda)
813 : if (wantDebug) call obj%timer%stop("cublas")
814 : endif
815 : else !useGPU
816 175008 : if (wantDebug) call obj%timer%start("blas")
817 : call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
818 : l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
819 : ONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), &
820 : uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1), &
821 175008 : ONE, a_mat(l_row_beg,l_col_beg), lda)
822 175008 : if (wantDebug) call obj%timer%stop("blas")
823 : endif !useGPU
824 : enddo
825 :
826 84960 : if (useGPU) then
827 : if(mat_vec_as_one_block) then
828 : !update whole (remaining) part of matrix, including tiles below diagonal
829 : !we can do that in one large cublas call
830 0 : if (wantDebug) call obj%timer%start("cublas")
831 : call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs, &
832 : ONE, vu_stored_rows_dev, max_local_rows, &
833 : uv_stored_cols_dev, max_local_cols, &
834 0 : ONE, a_dev, lda)
835 0 : if (wantDebug) call obj%timer%stop("cublas")
836 : endif
837 : endif
838 :
839 84960 : n_stored_vecs = 0
840 : endif
841 :
842 2587968 : if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
843 1725312 : if (useGPU) then
844 : !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
845 0 : a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
846 :
847 : successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
848 0 : 1 * size_of_datatype, cudaMemcpyDeviceToHost)
849 0 : check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
850 :
851 : endif
852 1725312 : if (n_stored_vecs > 0) then
853 : a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
854 1668672 : + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
855 : end if
856 : #if REALCASE == 1
857 876864 : d_vec(istep-1) = a_mat(l_rows,l_cols)
858 : #endif
859 : #if COMPLEXCASE == 1
860 848448 : d_vec(istep-1) = real(a_mat(l_rows,l_cols),kind=rk)
861 : #endif
862 :
863 1725312 : if (useGPU) then
864 : !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
865 : !successCUDA = cuda_threadsynchronize()
866 : !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)
867 :
868 : successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
869 0 : int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice)
870 0 : check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
871 : endif
872 : endif
873 :
874 : enddo ! main cycle over istep=na,3,-1
875 :
876 : #if COMPLEXCASE == 1
877 : ! Store e_vec(1) and d_vec(1)
878 :
879 4464 : if (my_pcol==pcol(2, nblk, np_cols)) then
880 4464 : if (my_prow==prow(1, nblk, np_rows)) then
881 : ! We use last l_cols value of loop above
882 2976 : if(useGPU) then
883 : successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
884 0 : 1 * size_of_datatype, cudaMemcpyDeviceToHost)
885 0 : check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
886 0 : vrl = aux3(1)
887 : else !useGPU
888 2976 : vrl = a_mat(1,l_cols)
889 : endif !useGPU
890 : call hh_transform_complex_&
891 : &PRECISION &
892 2976 : (obj, vrl, 0.0_rk, xf, tau(2), wantDebug)
893 : #if REALCASE == 1
894 : e_vec(1) = vrl
895 : #endif
896 : #if COMPLEXCASE == 1
897 2976 : e_vec(1) = real(vrl,kind=rk)
898 : #endif
899 :
900 :
901 2976 : a_mat(1,l_cols) = 1. ! for consistency only
902 : endif
903 : #ifdef WITH_MPI
904 2976 : if (wantDebug) call obj%timer%start("mpi_communication")
905 2976 : call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
906 2976 : if (wantDebug) call obj%timer%stop("mpi_communication")
907 :
908 : #endif /* WITH_MPI */
909 : endif
910 :
911 : #ifdef WITH_MPI
912 2976 : if (wantDebug) call obj%timer%start("mpi_communication")
913 2976 : call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
914 2976 : if (wantDebug) call obj%timer%stop("mpi_communication")
915 :
916 : #endif /* WITH_MPI */
917 4464 : if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
918 2976 : if(useGPU) then
919 : successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
920 0 : 1 * size_of_datatype, cudaMemcpyDeviceToHost)
921 0 : check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
922 0 : d_vec(1) = PRECISION_REAL(aux3(1))
923 : else !useGPU
924 2976 : d_vec(1) = PRECISION_REAL(a_mat(1,1))
925 : endif !useGPU
926 : endif
927 :
928 : #endif /* COMPLEXCASE == 1 */
929 :
930 : #if REALCASE == 1
931 : ! Store e_vec(1)
932 :
933 4752 : if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
934 3168 : if(useGPU) then
935 : successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
936 0 : 1 * size_of_datatype, cudaMemcpyDeviceToHost)
937 0 : check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
938 : else !useGPU
939 3168 : e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
940 : endif !useGPU
941 : endif
942 :
943 : ! Store d_vec(1)
944 4752 : if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
945 3168 : if(useGPU) then
946 0 : successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
947 0 : check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
948 : else !useGPU
949 3168 : d_vec(1) = a_mat(1,1)
950 : endif !useGPU
951 : endif
952 : #endif
953 :
954 9216 : deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
955 9216 : if (istat .ne. 0) then
956 : #if REALCASE == 1
957 0 : print *,"tridiag_real: error when deallocating uv_stored_cols "//errorMessage
958 : #endif
959 : #if COMPLEXCASE == 1
960 0 : print *,"tridiag_complex: error when deallocating tmp "//errorMessage
961 : #endif
962 0 : stop 1
963 : endif
964 :
965 9216 : if (useGPU) then
966 : ! todo: should we leave a_mat on the device for further use?
967 0 : successCUDA = cuda_free(a_dev)
968 0 : check_dealloc_cuda("tridiag: a_dev 9", successCUDA)
969 :
970 0 : successCUDA = cuda_free(v_row_dev)
971 0 : check_dealloc_cuda("tridiag: v_row_dev", successCUDA)
972 :
973 0 : successCUDA = cuda_free(u_row_dev)
974 0 : check_dealloc_cuda("tridiag: (u_row_dev", successCUDA)
975 :
976 0 : successCUDA = cuda_free(v_col_dev)
977 0 : check_dealloc_cuda("tridiag: v_col_dev", successCUDA)
978 :
979 0 : successCUDA = cuda_free(u_col_dev)
980 0 : check_dealloc_cuda("tridiag: u_col_dev ", successCUDA)
981 :
982 0 : successCUDA = cuda_free(vu_stored_rows_dev)
983 0 : check_dealloc_cuda("tridiag: vu_stored_rows_dev ", successCUDA)
984 :
985 0 : successCUDA = cuda_free(uv_stored_cols_dev)
986 0 : check_dealloc_cuda("tridiag:uv_stored_cols_dev ", successCUDA)
987 : endif
988 :
989 : ! distribute the arrays d_vec and e_vec to all processors
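     : ! Every entry of d_vec and e_vec has been set by exactly one MPI task (the owner of the
     : ! corresponding matrix element) and is still zero elsewhere, so an MPI_SUM reduction over
     : ! rows and then over columns replicates the complete vectors on all tasks.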
990 :
991 : #if REALCASE == 1
992 4752 : allocate(tmp(na), stat=istat, errmsg=errorMessage)
993 :
994 4752 : if (istat .ne. 0) then
995 0 : print *,"tridiag_real: error when allocating tmp "//errorMessage
996 0 : stop 1
997 : endif
998 : #endif
999 : #if COMPLEXCASE == 1
1000 4464 : allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
1001 4464 : if (istat .ne. 0) then
1002 0 : print *,"tridiag_complex: error when allocating tmp_real "//errorMessage
1003 0 : stop 1
1004 : endif
1005 : #endif
1006 :
1007 :
1008 : #ifdef WITH_MPI
1009 6144 : if (wantDebug) call obj%timer%start("mpi_communication")
1010 : #if REALCASE == 1
1011 3168 : tmp = d_vec
1012 3168 : call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
1013 3168 : tmp = d_vec
1014 3168 : call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
1015 3168 : tmp = e_vec
1016 3168 : call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
1017 3168 : tmp = e_vec
1018 3168 : call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
1019 : #endif
1020 : #if COMPLEXCASE == 1
1021 2976 : tmp_real = d_vec
1022 2976 : call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
1023 2976 : tmp_real = d_vec
1024 2976 : call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION ,MPI_SUM, mpi_comm_cols, mpierr)
1025 2976 : tmp_real = e_vec
1026 2976 : call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
1027 2976 : tmp_real = e_vec
1028 2976 : call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
1029 : #endif
1030 6144 : if (wantDebug) call obj%timer%stop("mpi_communication")
1031 : #endif /* WITH_MPI */
1032 :
1033 : #if REALCASE == 1
1034 4752 : deallocate(tmp, stat=istat, errmsg=errorMessage)
1035 4752 : if (istat .ne. 0) then
1036 0 : print *,"tridiag_real: error when deallocating tmp "//errorMessage
1037 0 : stop 1
1038 : endif
1039 : #endif
1040 : #if COMPLEXCASE == 1
1041 4464 : deallocate(tmp_real, stat=istat, errmsg=errorMessage)
1042 4464 : if (istat .ne. 0) then
1043 0 : print *,"tridiag_complex: error when deallocating tmp_real "//errorMessage
1044 0 : stop 1
1045 : endif
1046 : #endif
1047 :
1048 : call obj%timer%stop("tridiag_&
1049 : &MATH_DATATYPE&
1050 : &" // &
1051 : &PRECISION_SUFFIX &
1052 9216 : )
1053 : end subroutine tridiag_&
1054 : &MATH_DATATYPE&
1055 : &_&
1056 9216 : &PRECISION