Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen ,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : !
20 : ! More information can be found here:
21 : ! http://elpa.mpcdf.mpg.de/
22 : !
23 : ! ELPA is free software: you can redistribute it and/or modify
24 : ! it under the terms of the version 3 of the license of the
25 : ! GNU Lesser General Public License as published by the Free
26 : ! Software Foundation.
27 : !
28 : ! ELPA is distributed in the hope that it will be useful,
29 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
30 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 : ! GNU Lesser General Public License for more details.
32 : !
33 : ! You should have received a copy of the GNU Lesser General Public License
34 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
35 : !
36 : ! ELPA reflects a substantial effort on the part of the original
37 : ! ELPA consortium, and we ask you to respect the spirit of the
38 : ! license that we chose: i.e., please contribute any changes you
39 : ! may have back to the original ELPA library distribution, and keep
40 : ! any derivatives of ELPA under the same license that we chose for
41 : ! the original distribution, the GNU Lesser General Public License.
42 : !
43 : ! Author: Andreas Marek, MPCDF
44 : #endif
45 :
46 : #include "config-f90.h"
47 : #include "../general/sanity.F90"
48 :
49 : subroutine elpa_reduce_add_vectors_&
50 : &MATH_DATATYPE&
51 : &_&
52 1227264 : &PRECISION &
53 1227264 : (obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvr, nvc, nblk)
54 :
55 : !-------------------------------------------------------------------------------
56 : ! This routine does a reduce of all vectors in vmat_s over the communicator comm_t.
57 : ! The result of the reduce is gathered on the processors owning the diagonal
58 : ! and added to the array of vectors vmat_t (which is distributed over comm_t).
59 : !
60 : ! Opposed to elpa_transpose_vectors, there is NO identical copy of vmat_s
61 : ! in the different members within vmat_t (else a reduce wouldn't be necessary).
62 : ! After this routine, an allreduce of vmat_t has to be done.
63 : !
64 : ! vmat_s array of vectors to be reduced and added
65 : ! ld_s leading dimension of vmat_s
66 : ! comm_s communicator over which vmat_s is distributed
67 : ! vmat_t array of vectors to which vmat_s is added
68 : ! ld_t leading dimension of vmat_t
69 : ! comm_t communicator over which vmat_t is distributed
70 : ! nvr global length of vmat_s/vmat_t
71 : ! nvc number of columns in vmat_s/vmat_t
72 : ! nblk block size of block cyclic distribution
73 : !
74 : !-------------------------------------------------------------------------------
75 :
76 : use precision
77 : #ifdef WITH_OPENMP
78 : use omp_lib
79 : #endif
80 : use elpa_mpi
81 : use elpa_abstract_impl
82 : implicit none
83 :
84 : class(elpa_abstract_impl_t), intent(inout) :: obj
85 : integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvr, nvc, nblk
86 : MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in) :: vmat_s(ld_s,nvc)
87 : MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: vmat_t(ld_t,nvc)
88 :
89 1227264 : MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: aux1(:), aux2(:)
90 : integer(kind=ik) :: myps, mypt, nps, npt
91 : integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr
92 : integer(kind=ik) :: lcm_s_t, nblks_tot
93 : integer(kind=ik) :: auxstride
94 :
95 : call obj%timer%start("elpa_reduce_add_vectors_&
96 : &MATH_DATATYPE&
97 : &" // &
98 : &PRECISION_SUFFIX &
99 1227264 : )
100 :
101 1227264 : call obj%timer%start("mpi_communication")
102 1227264 : call mpi_comm_rank(comm_s,myps,mpierr)
103 1227264 : call mpi_comm_size(comm_s,nps ,mpierr)
104 1227264 : call mpi_comm_rank(comm_t,mypt,mpierr)
105 1227264 : call mpi_comm_size(comm_t,npt ,mpierr)
106 1227264 : call obj%timer%stop("mpi_communication")
107 :
108 : ! Look to elpa_transpose_vectors for the basic idea!
109 :
110 : ! The communictation pattern repeats in the global matrix after
111 : ! the least common multiple of (nps,npt) blocks
112 :
113 1227264 : lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt
114 :
115 1227264 : nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr
116 :
117 1227264 : allocate(aux1( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
118 1227264 : allocate(aux2( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
119 1227264 : aux1(:) = 0
120 1227264 : aux2(:) = 0
121 : #ifdef WITH_OPENMP
122 1227264 : !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl)
123 : #endif
124 3193920 : do n = 0, lcm_s_t-1
125 :
126 1966656 : ips = mod(n,nps)
127 1966656 : ipt = mod(n,npt)
128 :
129 1966656 : auxstride = nblk * ((nblks_tot - n + lcm_s_t - 1)/lcm_s_t)
130 :
131 1966656 : if(myps == ips) then
132 :
133 : ! k = 0
134 : #ifdef WITH_OPENMP
135 613632 : !$omp do
136 : #endif
137 2057472 : do lc=1,nvc
138 77250048 : do i = n, nblks_tot-1, lcm_s_t
139 72918528 : k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
140 72918528 : ns = (i/nps)*nblk ! local start of block i
141 108379536 : nl = min(nvr-i*nblk,nblk) ! length
142 653106672 : aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
143 : ! k = k+nblk
144 : enddo
145 : enddo
146 :
147 1227264 : k = nvc * auxstride
148 : #ifdef WITH_OPENMP
149 613632 : !$omp barrier
150 613632 : !$omp master
151 : #endif
152 :
153 : #ifdef WITH_MPI
154 739392 : call obj%timer%start("mpi_communication")
155 :
156 739392 : if (k>0) call mpi_reduce(aux1, aux2, k, &
157 : #if REALCASE == 1
158 : MPI_REAL_PRECISION, &
159 : #endif
160 : #if COMPLEXCASE == 1
161 : MPI_COMPLEX_PRECISION, &
162 : #endif
163 739392 : MPI_SUM, ipt, comm_t, mpierr)
164 :
165 739392 : call obj%timer%stop("mpi_communication")
166 :
167 : #else /* WITH_MPI */
168 487872 : if(k>0) aux2 = aux1
169 : #endif /* WITH_MPI */
170 :
171 : #ifdef WITH_OPENMP
172 : !$omp end master
173 613632 : !$omp barrier
174 : #endif
175 1227264 : if (mypt == ipt) then
176 : ! k = 0
177 : #ifdef WITH_OPENMP
178 613632 : !$omp do
179 : #endif
180 2057472 : do lc=1,nvc
181 77250048 : do i = n, nblks_tot-1, lcm_s_t
182 72918528 : k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
183 72918528 : ns = (i/npt)*nblk ! local start of block i
184 108379536 : nl = min(nvr-i*nblk,nblk) ! length
185 653106672 : vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl)
186 : ! k = k+nblk
187 : enddo
188 : enddo
189 : endif
190 :
191 : endif
192 :
193 : enddo
194 : #ifdef WITH_OPENMP
195 : !$omp end parallel
196 : #endif
197 :
198 1227264 : deallocate(aux1)
199 1227264 : deallocate(aux2)
200 :
201 : call obj%timer%stop("elpa_reduce_add_vectors_&
202 : &MATH_DATATYPE&
203 : &" // &
204 : &PRECISION_SUFFIX &
205 1227264 : )
206 1227264 : end subroutine
207 :
208 :
|