Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), fomerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen ,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : ! This particular source code file contains additions, changes and
20 : ! enhancements authored by Intel Corporation which is not part of
21 : ! the ELPA consortium.
22 : !
23 : ! More information can be found here:
24 : ! http://elpa.mpcdf.mpg.de/
25 : !
26 : ! ELPA is free software: you can redistribute it and/or modify
27 : ! it under the terms of the version 3 of the license of the
28 : ! GNU Lesser General Public License as published by the Free
29 : ! Software Foundation.
30 : !
31 : ! ELPA is distributed in the hope that it will be useful,
32 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
33 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 : ! GNU Lesser General Public License for more details.
35 : !
36 : ! You should have received a copy of the GNU Lesser General Public License
37 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38 : !
39 : ! ELPA reflects a substantial effort on the part of the original
40 : ! ELPA consortium, and we ask you to respect the spirit of the
41 : ! license that we chose: i.e., please contribute any changes you
42 : ! may have back to the original ELPA library distribution, and keep
43 : ! any derivatives of ELPA under the same license that we chose for
44 : ! the original distribution, the GNU Lesser General Public License.
45 : !
46 : ! ELPA2 -- 2-stage solver for ELPA
47 : !
48 : ! Copyright of the original code rests with the authors inside the ELPA
49 : ! consortium. The copyright of any additional modifications shall rest
50 : ! with their original authors, but shall adhere to the licensing terms
51 : ! distributed along with the original code in the file "COPYING".
52 : !
53 : ! Author: Andreas Marek, MPCDF
54 : #endif
55 :
56 : #include "../general/sanity.F90"
57 :
58 : #define REALCASE 1
59 : #undef COMPLEXCASE
60 : #include "elpa2_bandred_template.F90"
61 : #define REALCASE 1
62 : #include "elpa2_symm_matrix_allreduce_real_template.F90"
63 : #undef REALCASE
64 : #define REALCASE 1
65 : #include "elpa2_trans_ev_band_to_full_template.F90"
66 : #include "elpa2_tridiag_band_template.F90"
67 : #include "elpa2_trans_ev_tridi_to_band_template.F90"
68 :
69 :
70 :
71 : subroutine band_band_real_&
72 0 : &PRECISION &
73 0 : (obj, na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, communicator)
74 : !-------------------------------------------------------------------------------
75 : ! band_band_real:
76 : ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored.
77 : ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with
78 : ! zero entries, if this
79 : ! requirement doesn't hold)
80 : !
81 : ! na Order of matrix
82 : !
83 : ! nb Semi bandwidth of original matrix
84 : !
85 : ! nb2 Semi bandwidth of target matrix
86 : !
87 : ! ab Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout
88 : ! has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb)
89 : ! are located on rank n.
90 : !
91 : ! ab2 Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is
92 : ! accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located
93 : ! on rank n.
94 : !
95 : ! d(na) Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
96 : !
97 : ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
98 : !
99 : ! communicator
100 : ! MPI-Communicator for the total processor set
101 : !-------------------------------------------------------------------------------
102 : use elpa_abstract_impl
103 : use elpa2_workload
104 : use precision
105 : implicit none
106 : #include "../general/precision_kinds.F90"
107 : class(elpa_abstract_impl_t), intent(inout) :: obj
108 : integer(kind=ik), intent(in) :: na, nb, nbCol, nb2, nb2Col, communicator
109 : real(kind=rk), intent(inout) :: ab(2*nb,nbCol) ! removed assumed size
110 : real(kind=rk), intent(inout) :: ab2(2*nb2,nb2Col) ! removed assumed size
111 : real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0
112 :
113 0 : real(kind=rk) :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), &
114 0 : tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2)
115 :
116 0 : real(kind=rk) :: work(nb*nb2), work2(nb2*nb2)
117 : integer(kind=ik) :: lwork, info
118 :
119 : integer(kind=ik) :: istep, i, n, dest
120 : integer(kind=ik) :: n_off, na_s
121 : integer(kind=ik) :: my_pe, n_pes, mpierr
122 : integer(kind=ik) :: nblocks_total, nblocks
123 : integer(kind=ik) :: nblocks_total2, nblocks2
124 : integer(kind=ik) :: ireq_ab, ireq_hv
125 : #ifdef WITH_MPI
126 : ! integer(kind=ik) :: MPI_STATUS_IGNORE(MPI_STATUS_SIZE)
127 : #endif
128 : ! integer(kind=ik), allocatable :: mpi_statuses(:,:)
129 0 : integer(kind=ik), allocatable :: block_limits(:), block_limits2(:), ireq_ab2(:)
130 :
131 : integer(kind=ik) :: j, nc, nr, ns, ne, iblk
132 : integer(kind=ik) :: istat
133 : character(200) :: errorMessage
134 :
135 0 : call obj%timer%start("band_band_real" // PRECISION_SUFFIX)
136 :
137 0 : call obj%timer%start("mpi_communication")
138 0 : call mpi_comm_rank(communicator,my_pe,mpierr)
139 0 : call mpi_comm_size(communicator,n_pes,mpierr)
140 0 : call obj%timer%stop("mpi_communication")
141 :
142 : ! Total number of blocks in the band:
143 0 : nblocks_total = (na-1)/nb + 1
144 0 : nblocks_total2 = (na-1)/nb2 + 1
145 :
146 : ! Set work distribution
147 0 : allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
148 0 : if (istat .ne. 0) then
149 0 : print *,"error allocating block_limits "//errorMessage
150 0 : stop 1
151 : endif
152 0 : call divide_band(obj, nblocks_total, n_pes, block_limits)
153 :
154 0 : allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage)
155 0 : if (istat .ne. 0) then
156 0 : print *,"error allocating block_limits2 "//errorMessage
157 0 : stop 1
158 : endif
159 :
160 0 : call divide_band(obj, nblocks_total2, n_pes, block_limits2)
161 :
162 : ! nblocks: the number of blocks for my task
163 0 : nblocks = block_limits(my_pe+1) - block_limits(my_pe)
164 0 : nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe)
165 :
166 0 : allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage)
167 0 : if (istat .ne. 0) then
168 0 : print *,"error allocating ireq_ab2 "//errorMessage
169 0 : stop 1
170 : endif
171 :
172 : #ifdef WITH_MPI
173 0 : call obj%timer%start("mpi_communication")
174 :
175 0 : ireq_ab2 = MPI_REQUEST_NULL
176 :
177 0 : if (nb2>1) then
178 0 : do i=0,nblocks2-1
179 :
180 0 : call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, MPI_REAL_PRECISION, 0, 3, communicator, ireq_ab2(i+1), mpierr)
181 : enddo
182 : endif
183 0 : call obj%timer%stop("mpi_communication")
184 :
185 : #else /* WITH_MPI */
186 : ! carefull the "recieve" has to be done at the corresponding send or wait
187 : ! if (nb2>1) then
188 : ! do i=0,nblocks2-1
189 : ! ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2)
190 : ! enddo
191 : ! endif
192 :
193 : #endif /* WITH_MPI */
194 : ! n_off: Offset of ab within band
195 0 : n_off = block_limits(my_pe)*nb
196 0 : lwork = nb*nb2
197 0 : dest = 0
198 : #ifdef WITH_MPI
199 0 : ireq_ab = MPI_REQUEST_NULL
200 0 : ireq_hv = MPI_REQUEST_NULL
201 : #endif
202 : ! ---------------------------------------------------------------------------
203 : ! Start of calculations
204 :
205 0 : na_s = block_limits(my_pe)*nb + 1
206 :
207 0 : if (my_pe>0 .and. na_s<=na) then
208 : ! send first nb2 columns to previous PE
209 : ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
210 0 : do i=1,nb2
211 0 : ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
212 : enddo
213 : #ifdef WITH_MPI
214 0 : call obj%timer%start("mpi_communication")
215 :
216 0 : call mpi_isend(ab_s, (nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, communicator, ireq_ab, mpierr)
217 0 : call obj%timer%stop("mpi_communication")
218 : #endif /* WITH_MPI */
219 : endif
220 :
221 0 : do istep=1,na/nb2
222 :
223 0 : if (my_pe==0) then
224 :
225 0 : n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced
226 0 : hv(:,:) = 0.0_rk
227 0 : tau(:) = 0.0_rk
228 :
229 : ! The last step (istep=na-1) is only needed for sending the last HH vectors.
230 : ! We don't want the sign of the last element flipped (analogous to the other sweeps)
231 0 : if (istep < na/nb2) then
232 :
233 : ! Transform first block column of remaining matrix
234 0 : call obj%timer%start("blas")
235 0 : call PRECISION_GEQRF(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
236 0 : call obj%timer%stop("blas")
237 :
238 0 : do i=1,nb2
239 0 : hv(i,i) = 1.0_rk
240 0 : hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
241 0 : ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0.0_rk
242 : enddo
243 :
244 : endif
245 :
246 0 : if (nb2==1) then
247 0 : d(istep) = ab(1,na_s-n_off)
248 0 : e(istep) = ab(2,na_s-n_off)
249 0 : if (istep == na) then
250 0 : e(na) = 0.0_rk
251 : endif
252 : else
253 0 : ab_s2 = 0.0_rk
254 0 : ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
255 0 : if (block_limits2(dest+1)<istep) then
256 0 : dest = dest+1
257 : endif
258 : #ifdef WITH_MPI
259 0 : call obj%timer%start("mpi_communication")
260 0 : call mpi_send(ab_s2, 2*nb2*nb2, MPI_REAL_PRECISION, dest, 3, communicator, mpierr)
261 0 : call obj%timer%stop("mpi_communication")
262 :
263 : #else /* WITH_MPI */
264 : ! do irecv here
265 0 : if (nb2>1) then
266 0 : do i= 0,nblocks2-1
267 0 : ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2)
268 : enddo
269 : endif
270 : #endif /* WITH_MPI */
271 :
272 : endif
273 :
274 : else
275 0 : if (na>na_s+nb2-1) then
276 : ! Receive Householder vectors from previous task, from PE owning subdiagonal
277 : #ifdef WITH_MPI
278 0 : call obj%timer%start("mpi_communication")
279 0 : call mpi_recv(hv, nb*nb2, MPI_REAL_PRECISION, my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
280 0 : call obj%timer%stop("mpi_communication")
281 :
282 : #else /* WITH_MPI */
283 0 : hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2)
284 : #endif /* WITH_MPI */
285 :
286 0 : do i=1,nb2
287 0 : tau(i) = hv(i,i)
288 0 : hv(i,i) = 1.0_rk
289 : enddo
290 : endif
291 : endif
292 :
293 0 : na_s = na_s+nb2
294 0 : if (na_s-n_off > nb) then
295 0 : ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
296 0 : ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rk
297 0 : n_off = n_off + nb
298 : endif
299 :
300 0 : do iblk=1,nblocks
301 0 : ns = na_s + (iblk-1)*nb - n_off ! first column in block
302 0 : ne = ns+nb-nb2 ! last column in block
303 :
304 0 : if (ns+n_off>na) exit
305 :
306 0 : nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
307 0 : nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
308 : ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
309 : call wy_gen_&
310 : &PRECISION&
311 0 : &(obj,nc,nb2,w,hv,tau,work,nb)
312 :
313 0 : if (iblk==nblocks .and. nc==nb) then
314 : !request last nb2 columns
315 : #ifdef WITH_MPI
316 0 : call obj%timer%start("mpi_communication")
317 0 : call mpi_recv(ab_r,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
318 0 : call obj%timer%stop("mpi_communication")
319 :
320 : #else /* WITH_MPI */
321 0 : ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2)
322 : #endif /* WITH_MPI */
323 0 : do i=1,nb2
324 0 : ab(1:nb+1,ne+i-1) = ab_r(:,i)
325 : enddo
326 : endif
327 0 : hv_new(:,:) = 0.0_rk ! Needed, last rows must be 0 for nr < nb
328 0 : tau_new(:) = 0.0_rk
329 :
330 0 : if (nr>0) then
331 : call wy_right_&
332 : &PRECISION&
333 0 : &(obj,nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
334 0 : call obj%timer%start("blas")
335 0 : call PRECISION_GEQRF(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
336 0 : call obj%timer%stop("blas")
337 0 : do i=1,nb2
338 0 : hv_new(i,i) = 1.0_rk
339 0 : hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
340 0 : ab(nb+2:,ns+i-1) = 0.0_rk
341 : enddo
342 :
343 : !send hh-Vector
344 0 : if (iblk==nblocks) then
345 : #ifdef WITH_MPI
346 0 : call obj%timer%start("mpi_communication")
347 :
348 0 : call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
349 0 : call obj%timer%stop("mpi_communication")
350 :
351 : #endif
352 0 : hv_s = hv_new
353 0 : do i=1,nb2
354 0 : hv_s(i,i) = tau_new(i)
355 : enddo
356 : #ifdef WITH_MPI
357 0 : call obj%timer%start("mpi_communication")
358 0 : call mpi_isend(hv_s,nb*nb2, MPI_REAL_PRECISION, my_pe+1, 2, communicator, ireq_hv, mpierr)
359 0 : call obj%timer%stop("mpi_communication")
360 :
361 : #else /* WITH_MPI */
362 :
363 : #endif /* WITH_MPI */
364 : endif
365 : endif
366 :
367 : call wy_symm_&
368 : &PRECISION&
369 0 : &(obj,nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
370 :
371 0 : if (my_pe>0 .and. iblk==1) then
372 : !send first nb2 columns to previous PE
373 : #ifdef WITH_MPI
374 0 : call obj%timer%start("mpi_communication")
375 :
376 0 : call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
377 0 : call obj%timer%stop("mpi_communication")
378 :
379 : #endif
380 0 : do i=1,nb2
381 0 : ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
382 : enddo
383 : #ifdef WITH_MPI
384 0 : call obj%timer%start("mpi_communication")
385 0 : call mpi_isend(ab_s,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, communicator, ireq_ab, mpierr)
386 0 : call obj%timer%stop("mpi_communication")
387 :
388 : #else /* WITH_MPI */
389 :
390 : #endif /* WITH_MPI */
391 : endif
392 :
393 0 : if (nr>0) then
394 : call wy_gen_&
395 : &PRECISION&
396 0 : &(obj,nr,nb2,w_new,hv_new,tau_new,work,nb)
397 : call wy_left_&
398 : &PRECISION&
399 0 : &(obj,nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
400 : endif
401 :
402 : ! Use new HH Vector for the next block
403 0 : hv(:,:) = hv_new(:,:)
404 0 : tau = tau_new
405 : enddo
406 : enddo
407 :
408 : ! Finish the last outstanding requests
409 : #ifdef WITH_MPI
410 0 : call obj%timer%start("mpi_communication")
411 :
412 0 : call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
413 0 : call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
414 : ! allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage)
415 : ! if (istat .ne. 0) then
416 : ! print *,"error allocating mpi_statuses "//errorMessage
417 : ! stop 1
418 : ! endif
419 :
420 0 : call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr)
421 : ! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
422 : ! if (istat .ne. 0) then
423 : ! print *,"error deallocating mpi_statuses "//errorMessage
424 : ! stop 1
425 : ! endif
426 :
427 0 : call mpi_barrier(communicator,mpierr)
428 0 : call obj%timer%stop("mpi_communication")
429 :
430 : #endif /* WITH_MPI */
431 :
432 0 : deallocate(block_limits, stat=istat, errmsg=errorMessage)
433 0 : if (istat .ne. 0) then
434 0 : print *,"error deallocating block_limits "//errorMessage
435 0 : stop 1
436 : endif
437 :
438 0 : deallocate(block_limits2, stat=istat, errmsg=errorMessage)
439 0 : if (istat .ne. 0) then
440 0 : print *,"error deallocating block_limits2 "//errorMessage
441 0 : stop 1
442 : endif
443 :
444 0 : deallocate(ireq_ab2, stat=istat, errmsg=errorMessage)
445 0 : if (istat .ne. 0) then
446 0 : print *,"error deallocating ireq_ab2 "//errorMessage
447 0 : stop 1
448 : endif
449 :
450 0 : call obj%timer%stop("band_band_real" // PRECISION_SUFFIX)
451 :
452 0 : end subroutine
453 :
454 : subroutine wy_gen_&
455 0 : &PRECISION&
456 0 : &(obj, n, nb, W, Y, tau, mem, lda)
457 :
458 : use elpa_abstract_impl
459 : use precision
460 : implicit none
461 : #include "../general/precision_kinds.F90"
462 : class(elpa_abstract_impl_t), intent(inout) :: obj
463 : integer(kind=ik), intent(in) :: n !length of householder-vectors
464 : integer(kind=ik), intent(in) :: nb !number of householder-vectors
465 : integer(kind=ik), intent(in) :: lda !leading dimension of Y and W
466 : real(kind=rk), intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b
467 : real(kind=rk), intent(in) :: tau(nb) !tau values
468 : real(kind=rk), intent(out) :: W(lda,nb) !output matrix W
469 : real(kind=rk), intent(in) :: mem(nb) !memory for a temporary matrix of size nb
470 :
471 : integer(kind=ik) :: i
472 :
473 0 : call obj%timer%start("wy_gen" // PRECISION_SUFFIX)
474 :
475 0 : W(1:n,1) = tau(1)*Y(1:n,1)
476 0 : do i=2,nb
477 0 : W(1:n,i) = tau(i)*Y(1:n,i)
478 0 : call obj%timer%start("blas")
479 0 : call PRECISION_GEMV('T', n, i-1, 1.0_rk, Y, lda, W(1,i), 1, 0.0_rk, mem,1)
480 0 : call PRECISION_GEMV('N', n, i-1, -1.0_rk, W, lda, mem, 1, 1.0_rk, W(1,i),1)
481 0 : call obj%timer%stop("blas")
482 : enddo
483 0 : call obj%timer%stop("wy_gen" // PRECISION_SUFFIX)
484 0 : end subroutine
485 :
486 : subroutine wy_left_&
487 0 : &PRECISION&
488 0 : &(obj, n, m, nb, A, lda, W, Y, mem, lda2)
489 :
490 : use precision
491 : use elpa_abstract_impl
492 : implicit none
493 : #include "../general/precision_kinds.F90"
494 : class(elpa_abstract_impl_t), intent(inout) :: obj
495 : integer(kind=ik), intent(in) :: n !width of the matrix A
496 : integer(kind=ik), intent(in) :: m !length of matrix W and Y
497 : integer(kind=ik), intent(in) :: nb !width of matrix W and Y
498 : integer(kind=ik), intent(in) :: lda !leading dimension of A
499 : integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
500 : real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
501 : real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W
502 : real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y
503 : real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
504 :
505 0 : call obj%timer%start("wy_left" // PRECISION_SUFFIX)
506 0 : call obj%timer%start("blas")
507 0 : call PRECISION_GEMM('T', 'N', nb, n, m, 1.0_rk, W, lda2, A, lda, 0.0_rk, mem, nb)
508 0 : call PRECISION_GEMM('N', 'N', m, n, nb, -1.0_rk, Y, lda2, mem, nb, 1.0_rk, A, lda)
509 0 : call obj%timer%stop("blas")
510 0 : call obj%timer%stop("wy_left" // PRECISION_SUFFIX)
511 0 : end subroutine
512 :
513 : subroutine wy_right_&
514 0 : &PRECISION&
515 0 : &(obj, n, m, nb, A, lda, W, Y, mem, lda2)
516 :
517 : use precision
518 : use elpa_abstract_impl
519 : implicit none
520 : #include "../general/precision_kinds.F90"
521 : class(elpa_abstract_impl_t), intent(inout) :: obj
522 : integer(kind=ik), intent(in) :: n !height of the matrix A
523 : integer(kind=ik), intent(in) :: m !length of matrix W and Y
524 : integer(kind=ik), intent(in) :: nb !width of matrix W and Y
525 : integer(kind=ik), intent(in) :: lda !leading dimension of A
526 : integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
527 : real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
528 : real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W
529 : real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y
530 : real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
531 :
532 :
533 0 : call obj%timer%start("wy_right" // PRECISION_SUFFIX)
534 0 : call obj%timer%start("blas")
535 0 : call PRECISION_GEMM('N', 'N', n, nb, m, 1.0_rk, A, lda, W, lda2, 0.0_rk, mem, n)
536 0 : call PRECISION_GEMM('N', 'T', n, m, nb, -1.0_rk, mem, n, Y, lda2, 1.0_rk, A, lda)
537 0 : call obj%timer%stop("blas")
538 0 : call obj%timer%stop("wy_right" // PRECISION_SUFFIX)
539 :
540 0 : end subroutine
541 :
542 : subroutine wy_symm_&
543 0 : &PRECISION&
544 0 : &(obj, n, nb, A, lda, W, Y, mem, mem2, lda2)
545 :
546 : use elpa_abstract_impl
547 : use precision
548 : implicit none
549 : #include "../general/precision_kinds.F90"
550 : class(elpa_abstract_impl_t), intent(inout) :: obj
551 : integer(kind=ik), intent(in) :: n !width/heigth of the matrix A; length of matrix W and Y
552 : integer(kind=ik), intent(in) :: nb !width of matrix W and Y
553 : integer(kind=ik), intent(in) :: lda !leading dimension of A
554 : integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
555 : real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
556 : real(kind=rk), intent(in) :: W(n,nb) !blocked transformation matrix W
557 : real(kind=rk), intent(in) :: Y(n,nb) !blocked transformation matrix Y
558 : real(kind=rk) :: mem(n,nb) !memory for a temporary matrix of size n x nb
559 : real(kind=rk) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb
560 :
561 0 : call obj%timer%start("wy_symm" // PRECISION_SUFFIX)
562 0 : call obj%timer%start("blas")
563 0 : call PRECISION_SYMM('L', 'L', n, nb, 1.0_rk, A, lda, W, lda2, 0.0_rk, mem, n)
564 0 : call PRECISION_GEMM('T', 'N', nb, nb, n, 1.0_rk, mem, n, W, lda2, 0.0_rk, mem2, nb)
565 0 : call PRECISION_GEMM('N', 'N', n, nb, nb, -0.5_rk, Y, lda2, mem2, nb, 1.0_rk, mem, n)
566 0 : call PRECISION_SYR2K('L', 'N', n, nb, -1.0_rk, Y, lda2, mem, n, 1.0_rk, A, lda)
567 0 : call obj%timer%stop("blas")
568 0 : call obj%timer%stop("wy_symm" // PRECISION_SUFFIX)
569 :
570 0 : end subroutine
571 :
|