Line data Source code
1 : ! This file is part of ELPA.
2 : !
3 : ! The ELPA library was originally created by the ELPA consortium,
4 : ! consisting of the following organizations:
5 : !
6 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : ! Informatik,
10 : ! - Technische Universität München, Lehrstuhl für Informatik mit
11 : ! Schwerpunkt Wissenschaftliches Rechnen ,
12 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 : ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 : ! and
16 : ! - IBM Deutschland GmbH
17 : !
18 : !
19 : ! More information can be found here:
20 : ! http://elpa.mpcdf.mpg.de/
21 : !
22 : ! ELPA is free software: you can redistribute it and/or modify
23 : ! it under the terms of the version 3 of the license of the
24 : ! GNU Lesser General Public License as published by the Free
25 : ! Software Foundation.
26 : !
27 : ! ELPA is distributed in the hope that it will be useful,
28 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
29 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 : ! GNU Lesser General Public License for more details.
31 : !
32 : ! You should have received a copy of the GNU Lesser General Public License
33 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
34 : !
35 : ! ELPA reflects a substantial effort on the part of the original
36 : ! ELPA consortium, and we ask you to respect the spirit of the
37 : ! license that we chose: i.e., please contribute any changes you
38 : ! may have back to the original ELPA library distribution, and keep
39 : ! any derivatives of ELPA under the same license that we chose for
40 : ! the original distribution, the GNU Lesser General Public License.
41 : !
42 : !
43 : #include "config-f90.h"
44 : #ifndef ALREADY_DEFINED
subroutine local_size_offset_1d(n,nb,baseidx,idx,rev,rank,nprocs, &
                                lsize,baseoffset,offset)
  ! Compute the local (per-process) size and offsets of a 1D block-cyclic
  ! distributed index range for a given process.
  !
  ! n          : global size of the distributed dimension
  ! nb         : block size of the block-cyclic distribution
  ! baseidx    : global index that the returned baseoffset is relative to
  ! idx        : global start index of the range of interest
  ! rev        : 1 = "reverse" mode (local size of everything up to idx),
  !              else forward mode (offsets/size of the range [idx, n])
  ! rank       : rank of the process for which sizes/offsets are computed
  ! nprocs     : number of processes the dimension is distributed over
  ! lsize      : number of locally stored elements of the range
  ! baseoffset : local offset of idx relative to baseidx
  ! offset     : local offset of idx
  !
  ! NOTE(review): the exact mapping semantics come from local_index in
  ! ELPA1_compute (last argument -1 rounds down, +1 rounds up) - confirm
  ! against that module.

  use precision
  use ELPA1_compute

  implicit none

  ! input
  integer(kind=ik), intent(in)  :: n, nb, baseidx, idx, rev, rank, nprocs

  ! output
  integer(kind=ik), intent(out) :: lsize, baseoffset, offset

  ! calculate local size and offsets
  if (rev .eq. 1) then
    ! reverse mode: local number of elements in [1, idx]; offsets are
    ! trivially 1
    if (idx > 0) then
      lsize = local_index(idx,rank,nprocs,nb,-1)
    else
      lsize = 0
    end if

    baseoffset = 1
    offset = 1
  else
    ! forward mode: local offsets of idx and baseidx, and the local size
    ! of the remaining range [idx, n]
    offset = local_index(idx,rank,nprocs,nb,1)
    baseoffset = local_index(baseidx,rank,nprocs,nb,1)

    lsize = local_index(n,rank,nprocs,nb,-1)

    lsize = lsize - offset + 1

    baseoffset = offset - baseoffset + 1
  end if

end subroutine local_size_offset_1d
87 : #endif
88 :
subroutine reverse_vector_local_&
&PRECISION &
(n,x,incx,work,lwork)
  ! Reverse the order of the n entries of the strided vector x, in place.
  !
  ! n     : number of vector entries
  ! x     : vector to reverse; entry i is stored at x(1+(i-1)*incx)
  ! incx  : stride between consecutive vector entries
  ! work  : workspace; only work(1) is written, and only on a query
  ! lwork : workspace size; lwork == -1 requests a workspace query only
  !         (LAPACK-style convention), in which case x is not touched
  use precision
  implicit none
#include "../../general/precision_kinds.F90"

  ! input
  integer(kind=ik), intent(in)              :: incx, n, lwork
  real(kind=C_DATATYPE_KIND), intent(inout) :: x(*), work(*)

  ! local scalars
  real(kind=C_DATATYPE_KIND) :: temp
  integer(kind=ik) :: srcoffset, destoffset, ientry

  ! workspace query: no workspace is actually needed
  if (lwork .eq. -1) then
    work(1) = 0.0_rk
    return
  end if

  ! swap entry i with entry n-i+1; for odd n the middle entry stays put
  do ientry=1,n/2
    srcoffset=1+(ientry-1)*incx
    destoffset=1+(n-ientry)*incx

    temp = x(srcoffset)
    x(srcoffset) = x(destoffset)
    x(destoffset) = temp
  end do

end subroutine
119 :
subroutine reverse_matrix_local_&
&PRECISION &
(trans, m, n, a, lda, work, lwork)
  ! Reverse a local m-by-n matrix along one dimension, in place.
  !
  ! trans : 1 = reverse the order of the columns (each row is reversed
  !         with stride lda), otherwise reverse the order of the rows
  !         (each column is reversed with stride 1)
  ! m, n  : matrix dimensions
  ! a     : matrix to reverse, leading dimension lda
  ! lda   : leading dimension of a
  ! work  : workspace; only work(1) is written, and only on a query
  ! lwork : workspace size; lwork == -1 requests a workspace query only,
  !         forwarded to the vector kernel
  use precision
  implicit none

  ! input
  integer(kind=ik), intent(in)              :: lda, m, n, lwork, trans
  real(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,*), work(*)

  ! local scalars
  real(kind=C_DATATYPE_KIND) :: dworksize(1)
  integer(kind=ik) :: incx
  integer(kind=ik) :: dimsize
  integer(kind=ik) :: i

  if (trans .eq. 1) then
    incx = lda      ! walk along a row
    dimsize = n
  else
    incx = 1        ! walk along a column
    dimsize = m
  end if

  ! workspace query: report what the vector kernel needs
  if (lwork .eq. -1) then
    call reverse_vector_local_&
    &PRECISION &
    (dimsize, a, incx, dworksize, -1)
    work(1) = dworksize(1)
    return
  end if

  if (trans .eq. 1) then
    ! reverse each of the m rows
    do i=1,m
      call reverse_vector_local_&
      &PRECISION &
      (dimsize, a(i,1), incx, work, lwork)
    end do
  else
    ! reverse each of the n columns
    do i=1,n
      call reverse_vector_local_&
      &PRECISION &
      (dimsize, a(1,i), incx, work, lwork)
    end do
  end if

end subroutine
167 :
subroutine reverse_matrix_2dcomm_ref_&
&PRECISION &
(m, n, mb, nb, a, lda, work, lwork, mpicomm_cols, mpicomm_rows)
  ! Reverse a 2D block-cyclic distributed m-by-n matrix along both
  ! dimensions (reference implementation). The reversal is done as two
  ! 1D passes of reverse_matrix_1dcomm: first the row dimension
  ! (trans=0, over mpicomm_cols), then the column dimension (trans=1,
  ! over mpicomm_rows).
  !
  ! m, n         : global matrix dimensions
  ! mb, nb       : block sizes of the row/column distribution
  ! a            : local part of the distributed matrix
  ! lda          : leading dimension of a
  ! work         : workspace; work(1) receives the required size on a query
  ! lwork        : workspace size; lwork == -1 requests a workspace query
  ! mpicomm_cols : communicator over which the row count m is distributed
  ! mpicomm_rows : communicator over which the column count n is distributed
  !                (NOTE(review): the cols/rows naming follows the ELPA
  !                convention visible in the calls below - confirm against
  !                the callers)
  use precision
  use elpa_mpi
  implicit none

  ! input
  integer(kind=ik), intent(in)              :: m, n, lda, lwork, mpicomm_cols, mpicomm_rows, mb, nb
  real(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,*), work(*)

  ! local scalars
  real(kind=C_DATATYPE_KIND) :: reverse_column_size(1)
  real(kind=C_DATATYPE_KIND) :: reverse_row_size(1)

  integer(kind=ik) :: mpirank_cols, mpirank_rows
  integer(kind=ik) :: mpiprocs_cols, mpiprocs_rows
  integer(kind=ik) :: mpierr
  integer(kind=ik) :: lrows, lcols, offset, baseoffset

  call MPI_Comm_rank(mpicomm_cols,mpirank_cols,mpierr)
  call MPI_Comm_rank(mpicomm_rows,mpirank_rows,mpierr)
  call MPI_Comm_size(mpicomm_cols,mpiprocs_cols,mpierr)
  call MPI_Comm_size(mpicomm_rows,mpiprocs_rows,mpierr)

  ! local number of rows/columns held by this process
  call local_size_offset_1d(m,mb,1,1,0,mpirank_cols,mpiprocs_cols, &
                            lrows,baseoffset,offset)

  call local_size_offset_1d(n,nb,1,1,0,mpirank_rows,mpiprocs_rows, &
                            lcols,baseoffset,offset)

  ! workspace query: required size is the maximum over both 1D passes
  if (lwork .eq. -1) then
    call reverse_matrix_1dcomm_&
    &PRECISION &
    (0,m,lcols,mb,a,lda,reverse_column_size,-1,mpicomm_cols)
    call reverse_matrix_1dcomm_&
    &PRECISION &
    (1,lrows,n,nb,a,lda,reverse_row_size,-1,mpicomm_rows)
    work(1) = max(reverse_column_size(1),reverse_row_size(1))
    return
  end if

  ! pass 1: reverse the distributed row dimension
  call reverse_matrix_1dcomm_&
  &PRECISION &
  (0,m,lcols,mb,a,lda,work,lwork,mpicomm_cols)
  ! pass 2: reverse the distributed column dimension
  call reverse_matrix_1dcomm_&
  &PRECISION &
  (1,lrows,n,nb,a,lda,work,lwork,mpicomm_rows)
end subroutine
215 :
216 : ! b: if trans = 'N': b is size of block distribution between rows
217 : ! b: if trans = 'T': b is size of block distribution between columns
subroutine reverse_matrix_1dcomm_&
&PRECISION &
(trans, m, n, b, a, lda, work, lwork, mpicomm)
  ! Reverse a matrix along its 1D block-cyclic distributed dimension,
  ! exchanging blocks between processes over mpicomm.
  !
  ! trans   : 1 = the column dimension (n) is distributed and reversed,
  !           otherwise the row dimension (m) is distributed and reversed
  ! m, n    : global matrix dimensions
  ! b       : block size of the distributed dimension (see comment above
  !           this routine)
  ! a       : local part of the matrix, leading dimension lda
  ! work    : workspace; the query (lwork == -1) reports the required size
  !           as 3*lrows*lcols plus the local-reversal scratch
  ! lwork   : workspace size; lwork == -1 requests a workspace query only
  ! mpicomm : communicator over which the dimension is distributed
  use precision
  use elpa_mpi

  implicit none

  ! input
  integer(kind=ik) :: trans
  integer(kind=ik) :: m, n, b, lda, lwork, mpicomm
  real(kind=C_DATATYPE_KIND) :: a(lda,*), work(*)

  ! local scalars
  integer(kind=ik) :: mpirank,mpiprocs,mpierr
#ifdef WITH_MPI
  integer(kind=ik) :: my_mpistatus(MPI_STATUS_SIZE)
#endif
  integer(kind=ik) :: nr_blocks,dest_process,src_process,step
  integer(kind=ik) :: lsize,baseoffset,offset
  integer(kind=ik) :: current_index,destblk,srcblk,icol,next_index
  integer(kind=ik) :: sendcount,recvcount
  integer(kind=ik) :: sendoffset,recvoffset
  integer(kind=ik) :: newmatrix_offset,work_offset
  integer(kind=ik) :: lcols,lrows,lroffset,lcoffset,dimsize,fixedsize
  real(kind=C_DATATYPE_KIND) :: dworksize(1)

  call MPI_Comm_rank(mpicomm, mpirank, mpierr)
  call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

  ! determine the local extent of the distributed dimension; the other
  ! (fixed) dimension is fully local
  if (trans .eq. 1) then
    call local_size_offset_1d(n,b,1,1,0,mpirank,mpiprocs, &
                              lcols,baseoffset,lcoffset)
    lrows = m
  else
    call local_size_offset_1d(m,b,1,1,0,mpirank,mpiprocs, &
                              lrows,baseoffset,lroffset)
    lcols = n
  end if

  ! workspace query: three lrows*lcols buffers (send, receive, reassembly)
  ! plus whatever the local reversal kernel reports
  if (lwork .eq. -1) then
    call reverse_matrix_local_&
    &PRECISION &
    (trans,lrows,lcols,a,max(lrows,lcols),dworksize,-1)
    work(1) = real(3*lrows*lcols,kind=REAL_DATATYPE) + dworksize(1)
    return
  end if

  ! partition work(:) into: send buffer, receive buffer, reassembled
  ! "new matrix" buffer, and scratch for the local reversal kernel
  sendoffset = 1
  recvoffset = sendoffset + lrows*lcols
  newmatrix_offset = recvoffset + lrows*lcols
  work_offset = newmatrix_offset + lrows*lcols

  ! dimsize: global extent of the distributed (reversed) dimension
  ! fixedsize: extent of the non-distributed dimension
  if (trans .eq. 1) then
    dimsize = n
    fixedsize = m
  else
    dimsize = m
    fixedsize = n
  end if

  if (dimsize .le. 1) then
    return ! nothing to do
  end if

  ! 1. adjust step size to remainder size
  ! the first iteration handles the (possibly partial) remainder block,
  ! all following iterations advance by full blocks of size b
  nr_blocks = dimsize / b
  nr_blocks = nr_blocks * b
  step = dimsize - nr_blocks
  if (step .eq. 0) step = b

  ! 2. iterate over destination blocks starting with process 0
  ! for each destination position, the source is the mirrored block at
  ! the opposite end of the distributed dimension
  current_index = 1
  do while (current_index .le. dimsize)
    destblk = (current_index-1) / b
    dest_process = mod(destblk,mpiprocs)
    srcblk = (dimsize-current_index) / b
    src_process = mod(srcblk,mpiprocs)

    next_index = current_index+step

    ! block for dest_process is located on mpirank if lsize > 0
    call local_size_offset_1d(dimsize-current_index+1,b,dimsize-next_index+2,dimsize-next_index+2,0, &
                              src_process,mpiprocs,lsize,baseoffset,offset)

    sendcount = lsize*fixedsize
    recvcount = sendcount

    ! TODO: this send/recv stuff seems to blow up on BlueGene/P
    ! TODO: is there actually room for the requested matrix part? the target
    ! process might not have any parts at all (thus no room)
    if ((src_process .eq. mpirank) .and. (dest_process .eq. src_process)) then
      ! source and destination are both this rank: no communication,
      ! pack / reverse / store locally
      ! 5. pack data
      if (trans .eq. 1) then
        ! copy whole columns [offset, offset+lsize) into the send buffer
        do icol=offset,offset+lsize-1
          work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = &
            a(1:lrows,icol)
        end do
      else
        ! copy the row strip [offset, offset+lsize) of every column
        do icol=1,lcols
          work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = &
            a(offset:offset+lsize-1,icol)
        end do
      end if

      ! 7. reverse data
      if (trans .eq. 1) then
        call reverse_matrix_local_&
        &PRECISION &
        (1,lrows,lsize,work(sendoffset),lrows,work(work_offset),lwork)
      else
        call reverse_matrix_local_&
        &PRECISION &
        (0,lsize,lcols,work(sendoffset),lsize,work(work_offset),lwork)
      end if

      ! 8. store in temp matrix
      ! newmatrix_offset advances so pieces are appended in destination order
      if (trans .eq. 1) then
        do icol=1,lsize
          work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = &
            work(sendoffset+(icol-1)*lrows:sendoffset+icol*lrows-1)
        end do

        newmatrix_offset = newmatrix_offset + lsize*lrows
      else
        do icol=1,lcols
          work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = &
            work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1)
        end do

        newmatrix_offset = newmatrix_offset + lsize
      end if
    else

      if (dest_process .eq. mpirank) then
        ! 6b. call MPI_Recv
        ! blocking receive, tagged with the destination's current_index

#ifdef WITH_MPI


        call MPI_Recv(work(recvoffset), recvcount, MPI_REAL_PRECISION, &
                      src_process, current_index, mpicomm, my_mpistatus, mpierr)

#else /* WITH_MPI */
        ! serial build: the "remote" data is our own send buffer
        work(recvoffset:recvoffset+recvcount-1) = work(sendoffset:sendoffset+sendcount-1)
#endif /* WITH_MPI */

        ! 7. reverse data
        if (trans .eq. 1) then
          call reverse_matrix_local_&
          &PRECISION &
          (1,lrows,lsize,work(recvoffset),lrows,work(work_offset),lwork)
        else
          call reverse_matrix_local_&
          &PRECISION &
          (0,lsize,lcols,work(recvoffset),lsize,work(work_offset),lwork)
        end if

        ! 8. store in temp matrix
        if (trans .eq. 1) then
          do icol=1,lsize
            work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = &
              work(recvoffset+(icol-1)*lrows:recvoffset+icol*lrows-1)
          end do

          newmatrix_offset = newmatrix_offset + lsize*lrows
        else
          do icol=1,lcols
            work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = &
              work(recvoffset+(icol-1)*lsize:recvoffset+icol*lsize-1)
          end do

          newmatrix_offset = newmatrix_offset + lsize
        end if
      end if

      if (src_process .eq. mpirank) then
        ! 5. pack data
        if (trans .eq. 1) then
          do icol=offset,offset+lsize-1
            work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = &
              a(1:lrows,icol)
          end do
        else
          do icol=1,lcols
            work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = &
              a(offset:offset+lsize-1,icol)
          end do
        end if

        ! 6a. call MPI_Send
        ! NOTE(review): recv is posted before send on each rank; this
        ! ordering is what the blocking exchange relies on - confirm
#ifdef WITH_MPI

        call MPI_Send(work(sendoffset), sendcount, MPI_REAL_PRECISION, &
                      dest_process, current_index, mpicomm, mpierr)
#endif /* WITH_MPI */
      end if
    end if

    current_index = next_index
  end do

  ! 9. copy temp matrix to real matrix
  ! the reassembled (reversed) matrix starts right after the receive buffer
  newmatrix_offset = recvoffset + lrows*lcols
  do icol=1,lcols
    a(1:lrows,icol) = &
      work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1)
  end do
end subroutine
|