Line data Source code
1 : ! This file is part of ELPA.
2 : !
3 : ! The ELPA library was originally created by the ELPA consortium,
4 : ! consisting of the following organizations:
5 : !
6 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : ! Informatik,
10 : ! - Technische Universität München, Lehrstuhl für Informatik mit
11 : ! Schwerpunkt Wissenschaftliches Rechnen ,
12 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14 : ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
15 : ! and
16 : ! - IBM Deutschland GmbH
17 : !
18 : ! This particular source code file contains additions, changes and
19 : ! enhancements authored by Intel Corporation which is not part of
20 : ! the ELPA consortium.
21 : !
22 : ! More information can be found here:
23 : ! http://elpa.mpcdf.mpg.de/
24 : !
25 : ! ELPA is free software: you can redistribute it and/or modify
26 : ! it under the terms of the version 3 of the license of the
27 : ! GNU Lesser General Public License as published by the Free
28 : ! Software Foundation.
29 : !
30 : ! ELPA is distributed in the hope that it will be useful,
31 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
32 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 : ! GNU Lesser General Public License for more details.
34 : !
35 : ! You should have received a copy of the GNU Lesser General Public License
36 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 : !
38 : ! ELPA reflects a substantial effort on the part of the original
39 : ! ELPA consortium, and we ask you to respect the spirit of the
40 : ! license that we chose: i.e., please contribute any changes you
41 : ! may have back to the original ELPA library distribution, and keep
42 : ! any derivatives of ELPA under the same license that we chose for
43 : ! the original distribution, the GNU Lesser General Public License.
44 :
45 : #include "../general/sanity.F90"
46 : use elpa1_compute
47 : use elpa_utilities
48 : use elpa_mpi
49 : use precision
50 : use elpa_abstract_impl
51 : implicit none
52 : #include "../general/precision_kinds.F90"
53 : class(elpa_abstract_impl_t), intent(inout) :: obj
54 : integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
55 : #ifdef USE_ASSUMED_SIZE
56 : MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,*)
57 : #else
58 : MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,obj%local_ncols)
59 : #endif
60 : integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
61 : integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
62 : integer(kind=ik) :: n, nc, i, info
63 : integer(kind=ik) :: lcs, lce, lrs, lre
64 : integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
65 :
66 2592 : MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
67 : logical :: wantDebug
68 : logical :: success
69 : integer(kind=ik) :: istat, debug, error
70 : character(200) :: errorMessage
71 :
72 : call obj%timer%start("elpa_cholesky_&
73 : &MATH_DATATYPE&
74 : &_&
75 : &PRECISION&
76 2592 : &")
77 :
78 2592 : na = obj%na
79 2592 : lda = obj%local_nrows
80 2592 : nblk = obj%nblk
81 2592 : matrixCols = obj%local_ncols
82 :
83 2592 : call obj%get("mpi_comm_rows",mpi_comm_rows,error )
84 2592 : if (error .ne. ELPA_OK) then
85 0 : print *,"Problem getting option. Aborting..."
86 0 : stop
87 : endif
88 2592 : call obj%get("mpi_comm_cols",mpi_comm_cols,error)
89 2592 : if (error .ne. ELPA_OK) then
90 0 : print *,"Problem getting option. Aborting..."
91 0 : stop
92 : endif
93 :
94 2592 : call obj%get("debug",debug,error)
95 2592 : if (error .ne. ELPA_OK) then
96 0 : print *,"Problem getting option. Aborting..."
97 0 : stop
98 : endif
99 2592 : if (debug == 1) then
100 1728 : wantDebug = .true.
101 : else
102 864 : wantDebug = .false.
103 : endif
104 :
105 2592 : call obj%timer%start("mpi_communication")
106 2592 : call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
107 2592 : call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
108 2592 : call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
109 2592 : call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
110 2592 : call obj%timer%stop("mpi_communication")
111 2592 : success = .true.
112 :
113 : ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
114 :
115 2592 : tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
116 2592 : tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
117 :
118 2592 : l_rows_tile = tile_size/np_rows ! local rows of a tile
119 2592 : l_cols_tile = tile_size/np_cols ! local cols of a tile
120 :
121 2592 : l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
122 2592 : l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
123 :
124 2592 : allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
125 2592 : if (istat .ne. 0) then
126 : print *,"elpa_cholesky_&
127 0 : &MATH_DATATYPE&: error when allocating tmp1 "//errorMessage
128 0 : stop 1
129 : endif
130 :
131 2592 : allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
132 2592 : if (istat .ne. 0) then
133 : print *,"elpa_cholesky_&
134 : &MATH_DATATYPE&
135 0 : &: error when allocating tmp2 "//errorMessage
136 0 : stop 1
137 : endif
138 :
139 2592 : tmp1 = 0
140 2592 : tmp2 = 0
141 :
142 2592 : allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
143 2592 : if (istat .ne. 0) then
144 : print *,"elpa_cholesky_&
145 : &MATH_DATATYPE&
146 0 : &: error when allocating tmatr "//errorMessage
147 0 : stop 1
148 : endif
149 :
150 2592 : allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
151 2592 : if (istat .ne. 0) then
152 : print *,"elpa_cholesky_&
153 : &MATH_DATATYPE&
154 0 : &: error when allocating tmatc "//errorMessage
155 0 : stop 1
156 : endif
157 :
158 2592 : tmatr = 0
159 2592 : tmatc = 0
160 :
161 25920 : do n = 1, na, nblk
162 : ! Calculate first local row and column of the still remaining matrix
163 : ! on the local processor
164 :
165 25920 : l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
166 25920 : l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)
167 :
168 25920 : l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1)
169 25920 : l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1)
170 :
171 25920 : if (n+nblk > na) then
172 :
173 : ! This is the last step, just do a Cholesky-Factorization
174 : ! of the remaining block
175 :
176 2592 : if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
177 1728 : call obj%timer%start("blas")
178 :
179 1728 : call PRECISION_POTRF('U', na-n+1, a(l_row1,l_col1), lda, info)
180 1728 : call obj%timer%stop("blas")
181 :
182 1728 : if (info/=0) then
183 0 : if (wantDebug) write(error_unit,*) "elpa_cholesky_&
184 : &MATH_DATATYPE&
185 :
186 : #if REALCASE == 1
187 0 : &: Error in dpotrf: ",info
188 : #endif
189 : #if COMPLEXCASE == 1
190 0 : &: Error in zpotrf: ",info
191 : #endif
192 0 : success = .false.
193 0 : return
194 : endif
195 :
196 : endif
197 :
198 2592 : exit ! Loop
199 :
200 : endif
201 :
202 23328 : if (my_prow==prow(n, nblk, np_rows)) then
203 :
204 15552 : if (my_pcol==pcol(n, nblk, np_cols)) then
205 :
206 : ! The process owning the upper left remaining block does the
207 : ! Cholesky-Factorization of this block
208 15552 : call obj%timer%start("blas")
209 :
210 15552 : call PRECISION_POTRF('U', nblk, a(l_row1,l_col1), lda, info)
211 15552 : call obj%timer%stop("blas")
212 :
213 15552 : if (info/=0) then
214 0 : if (wantDebug) write(error_unit,*) "elpa_cholesky_&
215 : &MATH_DATATYPE&
216 :
217 : #if REALCASE == 1
218 0 : &: Error in dpotrf 2: ",info
219 : #endif
220 : #if COMPLEXCASE == 1
221 0 : &: Error in zpotrf 2: ",info
222 :
223 : #endif
224 0 : success = .false.
225 0 : return
226 : endif
227 :
228 15552 : nc = 0
229 264384 : do i=1,nblk
230 248832 : tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
231 248832 : nc = nc+i
232 : enddo
233 : endif
234 : #ifdef WITH_MPI
235 7776 : call obj%timer%start("mpi_communication")
236 :
237 : call MPI_Bcast(tmp1, nblk*(nblk+1)/2, &
238 : #if REALCASE == 1
239 : MPI_REAL_PRECISION, &
240 : #endif
241 : #if COMPLEXCASE == 1
242 : MPI_COMPLEX_PRECISION, &
243 : #endif
244 7776 : pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
245 :
246 7776 : call obj%timer%stop("mpi_communication")
247 :
248 : #endif /* WITH_MPI */
249 15552 : nc = 0
250 264384 : do i=1,nblk
251 248832 : tmp2(1:i,i) = tmp1(nc+1:nc+i)
252 248832 : nc = nc+i
253 : enddo
254 :
255 15552 : call obj%timer%start("blas")
256 15552 : if (l_cols-l_colx+1>0) &
257 : call PRECISION_TRSM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', nblk, l_cols-l_colx+1, ONE, tmp2, &
258 15552 : ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
259 15552 : call obj%timer%stop("blas")
260 : endif
261 :
262 396576 : do i=1,nblk
263 :
264 : #if REALCASE == 1
265 186624 : if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols)
266 : #endif
267 : #if COMPLEXCASE == 1
268 186624 : if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols))
269 : #endif
270 :
271 : #ifdef WITH_MPI
272 :
273 248832 : call obj%timer%start("mpi_communication")
274 248832 : if (l_cols-l_colx+1>0) &
275 : call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_MATH_DATATYPE_PRECISION, &
276 248832 : prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
277 :
278 248832 : call obj%timer%stop("mpi_communication")
279 : #endif /* WITH_MPI */
280 : enddo
281 : ! this has to be checked since it was changed substantially when doing type safe
282 : call elpa_transpose_vectors_&
283 : &MATH_DATATYPE&
284 : &_&
285 : &PRECISION &
286 : (obj, tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
287 : tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
288 23328 : n, na, nblk, nblk)
289 :
290 54432 : do i=0,(na-1)/tile_size
291 31104 : lcs = max(l_colx,i*l_cols_tile+1)
292 31104 : lce = min(l_cols,(i+1)*l_cols_tile)
293 31104 : lrs = l_rowx
294 31104 : lre = min(l_rows,(i+1)*l_rows_tile)
295 31104 : if (lce<lcs .or. lre<lrs) cycle
296 28512 : call obj%timer%start("blas")
297 : call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre-lrs+1, lce-lcs+1, nblk, -ONE, &
298 : tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
299 28512 : ONE, a(lrs,lcs), lda)
300 28512 : call obj%timer%stop("blas")
301 :
302 : enddo
303 :
304 : enddo
305 :
306 2592 : deallocate(tmp1, tmp2, tmatr, tmatc, stat=istat, errmsg=errorMessage)
307 2592 : if (istat .ne. 0) then
308 : print *,"elpa_cholesky_&
309 : &MATH_DATATYPE&
310 0 : &: error when deallocating tmp1 "//errorMessage
311 0 : stop 1
312 : endif
313 :
314 : ! Set the lower triangle to 0, it contains garbage (form the above matrix multiplications)
315 :
316 391392 : do i=1,na
317 388800 : if (my_pcol==pcol(i, nblk, np_cols)) then
318 : ! column i is on local processor
319 388800 : l_col1 = local_index(i , my_pcol, np_cols, nblk, +1) ! local column number
320 388800 : l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal
321 388800 : a(l_row1:l_rows,l_col1) = 0
322 : endif
323 : enddo
324 : call obj%timer%stop("elpa_cholesky_&
325 : &MATH_DATATYPE&
326 : &_&
327 : &PRECISION&
328 2592 : &")
|