Line data Source code
1 : ! This file is part of ELPA.
2 : !
3 : ! The ELPA library was originally created by the ELPA consortium,
4 : ! consisting of the following organizations:
5 : !
6 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 : ! Informatik,
10 : ! - Technische Universität München, Lehrstuhl für Informatik mit
11 : ! Schwerpunkt Wissenschaftliches Rechnen ,
12 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 : ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14 : ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
15 : ! and
16 : ! - IBM Deutschland GmbH
17 : !
18 : !
19 : ! More information can be found here:
20 : ! http://elpa.mpcdf.mpg.de/
21 : !
22 : ! ELPA is free software: you can redistribute it and/or modify
23 : ! it under the terms of the version 3 of the license of the
24 : ! GNU Lesser General Public License as published by the Free
25 : ! Software Foundation.
26 : !
27 : ! ELPA is distributed in the hope that it will be useful,
28 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
29 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 : ! GNU Lesser General Public License for more details.
31 : !
32 : ! You should have received a copy of the GNU Lesser General Public License
33 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
34 : !
35 : ! ELPA reflects a substantial effort on the part of the original
36 : ! ELPA consortium, and we ask you to respect the spirit of the
37 : ! license that we chose: i.e., please contribute any changes you
38 : ! may have back to the original ELPA library distribution, and keep
39 : ! any derivatives of ELPA under the same license that we chose for
40 : ! the original distribution, the GNU Lesser General Public License.
41 : !
42 : subroutine qr_pdgeqrf_2dcomm_&
43 0 : &PRECISION &
44 0 : (obj, a, lda, matrixCols, v, ldv, vmrCols, tau, lengthTau, t, ldt, colsT, &
45 0 : work, workLength, lwork, m, n, mb, nb, rowidx, colidx, &
46 : rev, trans, PQRPARAM, mpicomm_rows, mpicomm_cols, blockheuristic)
47 : use precision
48 : use elpa1_impl
49 : use qr_utils_mod
50 : use elpa_abstract_impl
51 : implicit none
52 :
53 : class(elpa_abstract_impl_t), intent(inout) :: obj
54 : ! parameter setup
55 : INTEGER(kind=ik), parameter :: gmode_ = 1, rank_ = 2, eps_ = 3
56 :
57 : ! input variables (local)
58 : integer(kind=ik), intent(in) :: lda, lwork, ldv, ldt, matrixCols, m, vmrCols, lengthTau, &
59 : colsT, workLength
60 :
61 : ! input variables (global)
62 : integer(kind=ik) :: n, mb, nb, rowidx, colidx, rev, trans, mpicomm_cols, mpicomm_rows
63 : #ifdef USE_ASSUMED_SIZE_QR
64 : integer(kind=ik) :: PQRPARAM(*)
65 : real(kind=C_DATATYPE_KIND) :: a(lda,*), v(ldv,*), tau(*), t(ldt,*), work(*)
66 : #else
67 : integer(kind=ik) :: PQRPARAM(1:11)
68 : real(kind=C_DATATYPE_KIND) :: a(1:lda,1:matrixCols), v(1:ldv,1:vmrCols), tau(1:lengthTau), &
69 : t(1:ldt,1:colsT), work(1:workLength)
70 : #endif
71 : ! output variables (global)
72 : real(kind=C_DATATYPE_KIND) :: blockheuristic(*)
73 :
74 : ! input variables derived from PQRPARAM
75 : integer(kind=ik) :: updatemode,tmerge,size2d
76 :
77 : ! local scalars
78 : integer(kind=ik) :: mpierr,mpirank_cols,broadcast_size,mpirank_rows
79 : integer(kind=ik) :: mpirank_cols_qr,mpiprocs_cols
80 : integer(kind=ik) :: lcols_temp,lcols,icol,lastcol
81 : integer(kind=ik) :: baseoffset,offset,idx,voffset
82 : integer(kind=ik) :: update_voffset,update_tauoffset
83 : integer(kind=ik) :: update_lcols
84 : integer(kind=ik) :: work_offset
85 :
86 : real(kind=C_DATATYPE_KIND) :: dbroadcast_size(1),dtmat_bcast_size(1)
87 : real(kind=C_DATATYPE_KIND) :: pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1)
88 : integer(kind=ik) :: temptau_offset,temptau_size,broadcast_offset,tmat_bcast_size
89 : integer(kind=ik) :: remaining_cols
90 : integer(kind=ik) :: total_cols
91 : integer(kind=ik) :: incremental_update_size ! needed for incremental update mode
92 :
93 : call obj%timer%start("qr_pdgeqrf_2dcomm_&
94 : &PRECISION&
95 0 : &")
96 0 : size2d = PQRPARAM(1)
97 0 : updatemode = PQRPARAM(2)
98 0 : tmerge = PQRPARAM(3)
99 :
100 : ! copy value before we are going to filter it
101 0 : total_cols = n
102 0 : call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr)
103 0 : call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr)
104 0 : call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr)
105 :
106 : #ifdef USE_ASSUMED_SIZE_QR
107 : call qr_pdgeqrf_1dcomm_&
108 : &PRECISION &
109 : (obj,a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, &
110 : PQRPARAM(4),mpicomm_rows,blockheuristic)
111 : #else
112 : call qr_pdgeqrf_1dcomm_&
113 : &PRECISION &
114 : (obj,a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, &
115 0 : PQRPARAM(4:11),mpicomm_rows,blockheuristic)
116 : #endif
117 : call qr_pdgeqrf_pack_unpack_&
118 : &PRECISION &
119 0 : (obj,v,ldv,dbroadcast_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,0,mpicomm_rows)
120 : call qr_pdgeqrf_pack_unpack_tmatrix_&
121 : &PRECISION &
122 0 : (obj,tau,t,ldt,dtmat_bcast_size(1),-1,total_cols,0)
123 :
124 : #ifdef DOUBLE_PRECISION_REAL
125 0 : pdlarft_size(1) = 0.0_rk8
126 : #else
127 0 : pdlarft_size(1) = 0.0_rk4
128 : #endif
129 :
130 : call qr_pdlarfb_1dcomm_&
131 : &PRECISION &
132 : (m,mb,total_cols,total_cols,a,lda,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows, &
133 0 : pdlarfb_size(1),-1)
134 : call qr_tmerge_pdlarfb_1dcomm_&
135 : &PRECISION &
136 : (m,mb,total_cols,total_cols,total_cols,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode, &
137 0 : mpicomm_rows,tmerge_pdlarfb_size(1),-1)
138 :
139 :
140 0 : temptau_offset = 1
141 0 : temptau_size = total_cols
142 0 : broadcast_offset = temptau_offset + temptau_size
143 0 : broadcast_size = int(dbroadcast_size(1) + dtmat_bcast_size(1))
144 0 : work_offset = broadcast_offset + broadcast_size
145 :
146 0 : if (lwork .eq. -1) then
147 : #ifdef DOUBLE_PRECISION_REAL
148 : work(1) = (real(temptau_size,kind=C_DATATYPE_KIND) + real(broadcast_size,kind=C_DATATYPE_KIND) + max(pdgeqrf_size(1), &
149 : pdlarft_size(1),pdlarfb_size(1), &
150 0 : tmerge_pdlarfb_size(1)))
151 : #else
152 : work(1) = (real(temptau_size,kind=rk4) + real(broadcast_size,kind=rk4) + max(pdgeqrf_size(1), &
153 : pdlarft_size(1),pdlarfb_size(1), &
154 0 : tmerge_pdlarfb_size(1)))
155 : #endif
156 : call obj%timer%stop("qr_pdgeqrf_2dcomm_&
157 : &PRECISION&
158 0 : &")
159 0 : return
160 : end if
161 :
162 0 : lastcol = colidx-total_cols+1
163 0 : voffset = total_cols
164 :
165 0 : incremental_update_size = 0
166 :
167 : ! clear v buffer: just ensure that there is no junk in the upper triangle
168 : ! part, otherwise pdlarfb gets some problems
169 : ! pdlarfl(2) do not have these problems as they are working more on a Vector
170 : ! basis
171 : #ifdef DOUBLE_PRECISION_REAL
172 0 : v(1:ldv,1:total_cols) = 0.0_rk8
173 : #else
174 0 : v(1:ldv,1:total_cols) = 0.0_rk4
175 : #endif
176 0 : icol = colidx
177 :
178 0 : remaining_cols = total_cols
179 :
180 : !print *,'start decomposition',m,rowidx,colidx
181 :
182 0 : do while (remaining_cols .gt. 0)
183 :
184 : ! determine rank of process column with next qr block
185 0 : mpirank_cols_qr = MOD((icol-1)/nb,mpiprocs_cols)
186 :
187 : ! lcols can't be larger than than nb
188 : ! exception: there is only one process column
189 :
190 : ! however, we might not start at the first local column.
191 : ! therefore assume a matrix of size (1xlcols) starting at (1,icol)
192 : ! determine the real amount of local columns
193 0 : lcols_temp = min(nb,(icol-lastcol+1))
194 :
195 : ! blocking parameter
196 0 : lcols_temp = max(min(lcols_temp,size2d),1)
197 :
198 : ! determine size from last decomposition column
199 : ! to first decomposition column
200 : call local_size_offset_1d(icol,nb,icol-lcols_temp+1,icol-lcols_temp+1,0, &
201 : mpirank_cols_qr,mpiprocs_cols, &
202 0 : lcols,baseoffset,offset)
203 :
204 0 : voffset = remaining_cols - lcols + 1
205 :
206 0 : idx = rowidx - colidx + icol
207 :
208 0 : if (mpirank_cols .eq. mpirank_cols_qr) then
209 : ! qr decomposition part
210 : #ifdef DOUBLE_PRECISION_REAL
211 0 : tau(offset:offset+lcols-1) = 0.0_rk8
212 : #else
213 0 : tau(offset:offset+lcols-1) = 0.0_rk4
214 : #endif
215 :
216 : #ifdef USE_ASSUMED_SIZE_QR
217 : call qr_pdgeqrf_1dcomm_&
218 : &PRECISION &
219 : (obj,a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, &
220 : work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4), &
221 : mpicomm_rows,blockheuristic)
222 :
223 : #else
224 : call qr_pdgeqrf_1dcomm_&
225 : &PRECISION &
226 : (obj,a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, &
227 : work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4:11), &
228 0 : mpicomm_rows,blockheuristic)
229 : #endif
230 :
231 : ! pack broadcast buffer (v + tau)
232 : call qr_pdgeqrf_pack_unpack_&
233 : &PRECISION &
234 : (obj,v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,&
235 0 : idx,rev,0,mpicomm_rows)
236 :
237 : ! determine broadcast size
238 : call qr_pdgeqrf_pack_unpack_&
239 : &PRECISION &
240 : (obj,v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,&
241 0 : 0,mpicomm_rows)
242 0 : broadcast_size = int(dbroadcast_size(1))
243 :
244 : !if (mpirank_rows .eq. 0) then
245 : ! pack tmatrix into broadcast buffer and calculate new size
246 : call qr_pdgeqrf_pack_unpack_tmatrix_&
247 : &PRECISION &
248 : (obj,tau(offset),t(voffset,voffset),ldt, &
249 0 : work(broadcast_offset+broadcast_size),lwork,lcols,0)
250 : call qr_pdgeqrf_pack_unpack_tmatrix_&
251 : &PRECISION &
252 0 : (obj,tau(offset),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0)
253 0 : broadcast_size = broadcast_size + int(dtmat_bcast_size(1))
254 : !end if
255 :
256 : ! initiate broadcast (send part)
257 : #ifdef WITH_MPI
258 :
259 : #ifdef DOUBLE_PRECISION_REAL
260 : call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
261 0 : mpirank_cols_qr,mpicomm_cols,mpierr)
262 : #else
263 : call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real4, &
264 0 : mpirank_cols_qr,mpicomm_cols,mpierr)
265 : #endif
266 :
267 : #endif
268 : ! copy tau parts into temporary tau buffer
269 0 : work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1)
270 :
271 : !print *,'generated tau:', tau(offset)
272 : else
273 : ! Vector exchange part
274 :
275 : ! determine broadcast size
276 : call qr_pdgeqrf_pack_unpack_&
277 : &PRECISION &
278 0 : (obj,v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows)
279 0 : broadcast_size = int(dbroadcast_size(1))
280 :
281 : call qr_pdgeqrf_pack_unpack_tmatrix_&
282 : &PRECISION &
283 : (obj,work(temptau_offset+voffset-1),t(voffset,voffset),ldt, &
284 0 : dtmat_bcast_size(1),-1,lcols,0)
285 0 : tmat_bcast_size = dtmat_bcast_size(1)
286 :
287 : !print *,'broadcast_size (nonqr)',broadcast_size
288 0 : broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)
289 :
290 : ! initiate broadcast (recv part)
291 : #ifdef WITH_MPI
292 :
293 : #ifdef DOUBLE_PRECISION_REAL
294 : call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
295 0 : mpirank_cols_qr,mpicomm_cols,mpierr)
296 : #else
297 : call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real4, &
298 0 : mpirank_cols_qr,mpicomm_cols,mpierr)
299 : #endif
300 :
301 : #endif
302 : ! last n*n elements in buffer are (still empty) T matrix elements
303 : ! fetch from first process in each column
304 :
305 : ! unpack broadcast buffer (v + tau)
306 : call qr_pdgeqrf_pack_unpack_&
307 : &PRECISION &
308 : (obj,v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols, &
309 0 : mb,rowidx,idx,rev,1,mpicomm_rows)
310 :
311 : ! now send t matrix to other processes in our process column
312 0 : broadcast_size = int(dbroadcast_size(1))
313 0 : tmat_bcast_size = int(dtmat_bcast_size(1))
314 :
315 : ! t matrix should now be available on all processes => unpack
316 : call qr_pdgeqrf_pack_unpack_tmatrix_&
317 : &PRECISION &
318 : (obj,work(temptau_offset+voffset-1),t(voffset,voffset),ldt, &
319 0 : work(broadcast_offset+broadcast_size),lwork,lcols,1)
320 : end if
321 :
322 0 : remaining_cols = remaining_cols - lcols
323 :
324 : ! apply householder vectors to whole trailing matrix parts (if any)
325 :
326 0 : update_voffset = voffset
327 0 : update_tauoffset = icol
328 0 : update_lcols = lcols
329 0 : incremental_update_size = incremental_update_size + lcols
330 :
331 0 : icol = icol - lcols
332 : ! count colums from first column of global block to current index
333 : call local_size_offset_1d(icol,nb,colidx-n+1,colidx-n+1,0, &
334 : mpirank_cols,mpiprocs_cols, &
335 0 : lcols,baseoffset,offset)
336 :
337 0 : if (lcols .gt. 0) then
338 :
339 : !print *,'updating trailing matrix'
340 :
341 0 : if (updatemode .eq. ichar('I')) then
342 0 : print *,'pdgeqrf_2dcomm: incremental update not yet implemented! rev=1'
343 0 : else if (updatemode .eq. ichar('F')) then
344 : ! full update no merging
345 : call qr_pdlarfb_1dcomm_&
346 : &PRECISION &
347 : (m,mb,lcols,update_lcols,a(1,offset),lda,v(1,update_voffset),ldv, &
348 : work(temptau_offset+update_voffset-1), &
349 : t(update_voffset,update_voffset),ldt, &
350 0 : rowidx,idx,1,mpicomm_rows,work(work_offset),lwork)
351 : else
352 : ! full update + merging default
353 : call qr_tmerge_pdlarfb_1dcomm_&
354 : &PRECISION &
355 : (m,mb,lcols,n-(update_voffset+update_lcols-1),update_lcols, &
356 : v(1,update_voffset),ldv, &
357 : t(update_voffset,update_voffset),ldt, &
358 : a(1,offset),lda,rowidx,1,updatemode,mpicomm_rows, &
359 0 : work(work_offset),lwork)
360 : end if
361 : else
362 0 : if (updatemode .eq. ichar('I')) then
363 : !print *,'sole merging of (incremental) T matrix', mpirank_cols, &
364 : ! n-(update_voffset+incremental_update_size-1)
365 : call qr_tmerge_pdlarfb_1dcomm_&
366 : &PRECISION &
367 : (m,mb,0,n-(update_voffset+incremental_update_size-1), &
368 : incremental_update_size,v(1,update_voffset),ldv, &
369 : t(update_voffset,update_voffset),ldt, &
370 0 : a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork)
371 :
372 : ! reset for upcoming incremental updates
373 0 : incremental_update_size = 0
374 0 : else if (updatemode .eq. ichar('M')) then
375 : ! final merge
376 : call qr_tmerge_pdlarfb_1dcomm_&
377 : &PRECISION &
378 : (m,mb,0,n-(update_voffset+update_lcols-1),update_lcols, &
379 : v(1,update_voffset),ldv, &
380 : t(update_voffset,update_voffset),ldt, &
381 0 : a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork)
382 : else
383 : ! full updatemode - nothing to update
384 : end if
385 :
386 : ! reset for upcoming incremental updates
387 0 : incremental_update_size = 0
388 : end if
389 : end do
390 :
391 0 : if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then
392 : ! finally merge all small T parts
393 : call qr_pdlarft_tree_merge_1dcomm_&
394 : &PRECISION &
395 0 : (m,mb,n,size2d,tmerge,v,ldv,t,ldt,rowidx,rev,mpicomm_rows,work,lwork)
396 : end if
397 :
398 : !print *,'stop decomposition',rowidx,colidx
399 : call obj%timer%stop("qr_pdgeqrf_2dcomm_&
400 : &PRECISION&
401 0 : &")
402 : end subroutine
403 :
404 : subroutine qr_pdgeqrf_1dcomm_&
405 0 : &PRECISION &
406 0 : (obj,a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans, &
407 0 : PQRPARAM,mpicomm,blockheuristic)
408 : use precision
409 : use elpa1_impl
410 : use elpa_abstract_impl
411 : implicit none
412 :
413 : class(elpa_abstract_impl_t), intent(inout) :: obj
414 : ! parameter setup
415 : INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3
416 :
417 : ! input variables (local)
418 : integer(kind=ik) :: lda,lwork,ldv,ldt
419 : real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)
420 :
421 : ! input variables (global)
422 : integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm
423 : #ifdef USE_ASSUMED_SIZE_QR
424 : integer(kind=ik) :: PQRPARAM(*)
425 :
426 : #else
427 : integer(kind=ik) :: PQRPARAM(:)
428 : #endif
429 : ! derived input variables
430 :
431 : ! derived further input variables from QR_PQRPARAM
432 : integer(kind=ik) :: size1d,updatemode,tmerge
433 :
434 : ! output variables (global)
435 : real(kind=C_DATATYPE_KIND) :: blockheuristic(*)
436 :
437 : ! local scalars
438 : integer(kind=ik) :: nr_blocks,remainder,current_block,aoffset,idx,updatesize
439 : real(kind=C_DATATYPE_KIND) :: pdgeqr2_size(1),pdlarfb_size(1),tmerge_tree_size(1)
440 : call obj%timer%start("qr_pdgeqrf_1dcomm_&
441 : &PRECISION&
442 0 : &")
443 0 : size1d = max(min(PQRPARAM(1),n),1)
444 0 : updatemode = PQRPARAM(2)
445 0 : tmerge = PQRPARAM(3)
446 :
447 0 : if (lwork .eq. -1) then
448 : #ifdef USE_ASSUMED_SIZE_QR
449 : call qr_pdgeqr2_1dcomm_&
450 : &PRECISION &
451 : (obj,a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, &
452 : m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4),mpicomm,blockheuristic)
453 : #else
454 : call qr_pdgeqr2_1dcomm_&
455 : &PRECISION &
456 : (obj,a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, &
457 0 : m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4:),mpicomm,blockheuristic)
458 : #endif
459 : ! reserve more space for incremental mode
460 : call qr_tmerge_pdlarfb_1dcomm_&
461 : &PRECISION &
462 : (m,mb,n,n,n,v,ldv,t,ldt, &
463 0 : a,lda,baseidx,rev,updatemode,mpicomm,pdlarfb_size,-1)
464 :
465 : call qr_pdlarft_tree_merge_1dcomm_&
466 : &PRECISION &
467 0 : (m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,tmerge_tree_size,-1)
468 :
469 0 : work(1) = max(pdlarfb_size(1),pdgeqr2_size(1),tmerge_tree_size(1))
470 : call obj%timer%stop("qr_pdgeqrf_1dcomm_&
471 : &PRECISION&
472 0 : &")
473 0 : return
474 : end if
475 :
476 0 : nr_blocks = n / size1d
477 0 : remainder = n - nr_blocks*size1d
478 :
479 0 : current_block = 0
480 0 : do while (current_block .lt. nr_blocks)
481 0 : idx = rowidx-current_block*size1d
482 0 : updatesize = n-(current_block+1)*size1d
483 0 : aoffset = 1+updatesize
484 : #ifdef USE_ASSUMED_SIZE_QR
485 : call qr_pdgeqr2_1dcomm_&
486 : &PRECISION &
487 : (obj,a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, &
488 : m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic)
489 :
490 : #else
491 : call qr_pdgeqr2_1dcomm_&
492 : &PRECISION &
493 : (obj,a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, &
494 0 : m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic)
495 : #endif
496 0 : if (updatemode .eq. ichar('M')) then
497 : ! full update + merging
498 : call qr_tmerge_pdlarfb_1dcomm_&
499 : &PRECISION &
500 : (m,mb,updatesize,current_block*size1d,size1d, &
501 : v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
502 0 : a,lda,baseidx,1,ichar('F'),mpicomm,work,lwork)
503 0 : else if (updatemode .eq. ichar('I')) then
504 0 : if (updatesize .ge. size1d) then
505 : ! incremental update + merging
506 : call qr_tmerge_pdlarfb_1dcomm_&
507 : &PRECISION &
508 : (m,mb,size1d,current_block*size1d,size1d, &
509 : v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
510 0 : a(1,aoffset-size1d),lda,baseidx,1,updatemode,mpicomm,work,lwork)
511 :
512 : else ! only remainder left
513 : ! incremental update + merging
514 : call qr_tmerge_pdlarfb_1dcomm_&
515 : &PRECISION &
516 : (m,mb,remainder,current_block*size1d,size1d, &
517 : v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
518 0 : a(1,1),lda,baseidx,1,updatemode,mpicomm,work,lwork)
519 : end if
520 : else ! full update no merging is default
521 : ! full update no merging
522 : call qr_pdlarfb_1dcomm_&
523 : &PRECISION &
524 : (m,mb,updatesize,size1d,a,lda,v(1,aoffset),ldv, &
525 0 : tau(aoffset),t(aoffset,aoffset),ldt,baseidx,idx,1,mpicomm,work,lwork)
526 : end if
527 :
528 : ! move on to next block
529 0 : current_block = current_block+1
530 : end do
531 :
532 0 : if (remainder .gt. 0) then
533 0 : aoffset = 1
534 0 : idx = rowidx-size1d*nr_blocks
535 : #ifdef USE_ASSUMED_SIZE_QR
536 : call qr_pdgeqr2_1dcomm_&
537 : &PRECISION &
538 : (obj,a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, &
539 : m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic)
540 :
541 : #else
542 : call qr_pdgeqr2_1dcomm_&
543 : &PRECISION &
544 : (obj,a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, &
545 0 : m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic)
546 : #endif
547 0 : if ((updatemode .eq. ichar('I')) .or. (updatemode .eq. ichar('M'))) then
548 : ! final merging
549 : call qr_tmerge_pdlarfb_1dcomm_&
550 : &PRECISION &
551 : (m,mb,0,size1d*nr_blocks,remainder, &
552 : v,ldv,t,ldt, &
553 0 : a,lda,baseidx,1,updatemode,mpicomm,work,lwork) ! updatemode argument does not matter
554 : end if
555 : end if
556 :
557 0 : if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then
558 : ! finally merge all small T parts
559 : call qr_pdlarft_tree_merge_1dcomm_&
560 : &PRECISION &
561 0 : (m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork)
562 : end if
563 : call obj%timer%stop("qr_pdgeqrf_1dcomm_&
564 : &PRECISION&
565 0 : &")
566 :
567 : end subroutine
568 :
569 : ! local a and tau are assumed to be positioned at the right column from a local
570 : ! perspective
571 : ! TODO: if local amount of data turns to zero the algorithm might produce wrong
572 : ! results (probably due to old buffer contents)
573 : subroutine qr_pdgeqr2_1dcomm_&
574 0 : &PRECISION &
575 0 : (obj,a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev, &
576 0 : trans,PQRPARAM,mpicomm,blockheuristic)
577 : use precision
578 : !use elpa1_impl ! check this
579 : use elpa_abstract_impl
580 : implicit none
581 :
582 : class(elpa_abstract_impl_t), intent(inout) :: obj
583 : ! parameter setup
584 : INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2 ,eps_ = 3, upmode1_ = 4
585 :
586 : ! input variables (local)
587 : integer(kind=ik) :: lda,lwork,ldv,ldt
588 : real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)
589 :
590 : ! input variables (global)
591 : integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm
592 : #ifdef USE_ASSUMED_SIZE_QR
593 : integer(kind=ik) :: PQRPARAM(*)
594 : #else
595 : integer(kind=ik) :: PQRPARAM(:)
596 : #endif
597 : ! output variables (global)
598 : real(kind=C_DATATYPE_KIND) :: blockheuristic(*)
599 :
600 : ! derived further input variables from QR_PQRPARAM
601 : integer(kind=ik) :: maxrank,hgmode,updatemode
602 :
603 : ! local scalars
604 : integer(kind=ik) :: icol,incx,idx
605 : real(kind=C_DATATYPE_KIND) :: pdlarfg_size(1),pdlarf_size(1),total_size
606 : real(kind=C_DATATYPE_KIND) :: pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1)
607 : real(kind=C_DATATYPE_KIND) :: pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1)
608 : integer(kind=ik) :: mpirank,mpiprocs,mpierr
609 : integer(kind=ik) :: rank,lastcol,actualrank,nextrank
610 : integer(kind=ik) :: update_cols,decomposition_cols
611 : integer(kind=ik) :: current_column
612 : call obj%timer%start("qr_pdgeqr2_1dcomm_&
613 : &PRECISION&
614 0 : &")
615 :
616 0 : maxrank = min(PQRPARAM(1),n)
617 0 : updatemode = PQRPARAM(2)
618 0 : hgmode = PQRPARAM(4)
619 0 : call MPI_Comm_rank(mpicomm, mpirank, mpierr)
620 0 : call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
621 0 : if (trans .eq. 1) then
622 0 : incx = lda
623 : else
624 0 : incx = 1
625 : end if
626 :
627 0 : if (lwork .eq. -1) then
628 :
629 : call qr_pdlarfg_1dcomm_&
630 : &PRECISION &
631 0 : (obj,a,incx,tau(1),pdlarfg_size(1),-1,n,rowidx,mb,hgmode,rev,mpicomm)
632 : call qr_pdlarfl_1dcomm_&
633 : &PRECISION &
634 0 : (v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,n,rowidx,mb,rev,mpicomm)
635 : #ifdef USE_ASSUMED_SIZE_QR
636 : call qr_pdlarfg2_1dcomm_ref_&
637 : &PRECISION &
638 : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM, &
639 : rev,mpicomm,actualrank)
640 :
641 : call qr_pdlarfgk_1dcomm_&
642 : &PRECISION &
643 : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank)
644 :
645 : #else
646 : call qr_pdlarfg2_1dcomm_ref_&
647 : &PRECISION &
648 : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM(:), &
649 0 : rev,mpicomm,actualrank)
650 :
651 : call qr_pdlarfgk_1dcomm_&
652 : &PRECISION &
653 : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n, &
654 0 : rowidx,mb,PQRPARAM(:),rev,mpicomm,actualrank)
655 : #endif
656 : call qr_pdlarfl2_tmatrix_1dcomm_&
657 : &PRECISION &
658 0 : (v,ldv,baseidx,a,lda,t,ldt,pdlarfl2_size(1),-1,m,n,rowidx,mb,rev,mpicomm)
659 : #ifdef DOUBLE_PRECISION_REAL
660 0 : pdlarft_size(1) = 0.0_rk8
661 : #else
662 0 : pdlarft_size(1) = 0.0_rk4
663 : #endif
664 : call qr_pdlarfb_1dcomm_&
665 : &PRECISION &
666 0 : (m,mb,n,n,a,lda,v,ldv,tau,t,ldt,baseidx,rowidx,1,mpicomm,pdlarfb_size(1),-1)
667 : #ifdef DOUBLE_PRECISION_REAL
668 0 : pdlarft_pdlarfb_size(1) = 0.0_rk8
669 : #else
670 0 : pdlarft_pdlarfb_size(1) = 0.0_rk4
671 : #endif
672 : call qr_tmerge_pdlarfb_1dcomm_&
673 : &PRECISION &
674 : (m,mb,n,n,n,v,ldv,t,ldt,a,lda,rowidx,rev,&
675 0 : updatemode,mpicomm,tmerge_pdlarfb_size(1),-1)
676 :
677 : total_size = max(pdlarfg_size(1),pdlarf_size(1),pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1),pdlarft_size(1), &
678 0 : pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1))
679 :
680 0 : work(1) = total_size
681 : call obj%timer%stop("qr_pdgeqr2_1dcomm_&
682 : &PRECISION&
683 0 : &")
684 0 : return
685 : end if
686 :
687 0 : icol = 1
688 0 : lastcol = min(rowidx,n)
689 0 : decomposition_cols = lastcol
690 0 : update_cols = n
691 0 : do while (decomposition_cols .gt. 0) ! local qr block
692 0 : icol = lastcol-decomposition_cols+1
693 0 : idx = rowidx-icol+1
694 :
695 : ! get possible rank size
696 : ! limited by number of columns and remaining rows
697 0 : rank = min(n-icol+1,maxrank,idx)
698 :
699 0 : current_column = n-icol+1-rank+1
700 :
701 0 : if (rank .eq. 1) then
702 :
703 : call qr_pdlarfg_1dcomm_&
704 : &PRECISION &
705 : (obj,a(1,current_column),incx, &
706 : tau(current_column),work,lwork, &
707 0 : m,idx,mb,hgmode,1,mpicomm)
708 : #ifdef DOUBLE_PRECISION_REAL
709 0 : v(1:ldv,current_column) = 0.0_rk8
710 : #else
711 0 : v(1:ldv,current_column) = 0.0_rk4
712 : #endif
713 : call qr_pdlarfg_copy_1dcomm_&
714 : &PRECISION &
715 : (obj,a(1,current_column),incx, &
716 : v(1,current_column),1, &
717 0 : m,baseidx,idx,mb,1,mpicomm)
718 :
719 : ! initialize t matrix part
720 0 : t(current_column,current_column) = tau(current_column)
721 :
722 0 : actualrank = 1
723 :
724 0 : else if (rank .eq. 2) then
725 : #ifdef USE_ASSUMED_SIZE_QR
726 : call qr_pdlarfg2_1dcomm_ref_&
727 : &PRECISION &
728 : (obj,a(1,current_column),lda,tau(current_column), &
729 : t(current_column,current_column),ldt,v(1,current_column),ldv, &
730 : baseidx,work,lwork,m,idx,mb,PQRPARAM,1,mpicomm,actualrank)
731 :
732 : #else
733 : call qr_pdlarfg2_1dcomm_ref_&
734 : &PRECISION &
735 : (obj,a(1,current_column),lda,tau(current_column), &
736 : t(current_column,current_column),ldt,v(1,current_column),ldv, &
737 0 : baseidx,work,lwork,m,idx,mb,PQRPARAM(:),1,mpicomm,actualrank)
738 : #endif
739 : else
740 : #ifdef USE_ASSUMED_SIZE_QR
741 : call qr_pdlarfgk_1dcomm_&
742 : &PRECISION &
743 : (obj,a(1,current_column),lda,tau(current_column), &
744 : t(current_column,current_column),ldt,v(1,current_column),ldv, &
745 : baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank)
746 :
747 : #else
748 : call qr_pdlarfgk_1dcomm_&
749 : &PRECISION &
750 : (obj,a(1,current_column),lda,tau(current_column), &
751 : t(current_column,current_column),ldt,v(1,current_column),ldv, &
752 0 : baseidx,work,lwork,m,rank,idx,mb,PQRPARAM(:),1,mpicomm,actualrank)
753 : #endif
754 : end if
755 :
756 0 : blockheuristic(actualrank) = blockheuristic(actualrank) + 1
757 :
758 : ! the blocked decomposition versions already updated their non
759 : ! decomposed parts using their information after communication
760 0 : update_cols = decomposition_cols - rank
761 0 : decomposition_cols = decomposition_cols - actualrank
762 :
763 : ! needed for incremental update
764 0 : nextrank = min(n-(lastcol-decomposition_cols+1)+1,maxrank,rowidx-(lastcol-decomposition_cols+1)+1)
765 :
766 0 : if (current_column .gt. 1) then
767 0 : idx = rowidx-icol+1
768 :
769 0 : if (updatemode .eq. ichar('I')) then
770 : ! incremental update + merging
771 : call qr_tmerge_pdlarfb_1dcomm_&
772 : &PRECISION &
773 : (m,mb,nextrank-(rank-actualrank),n-(current_column+rank-1),actualrank, &
774 : v(1,current_column+(rank-actualrank)),ldv, &
775 : t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
776 : a(1,current_column-nextrank+(rank-actualrank)),lda,baseidx,rev,updatemode,&
777 0 : mpicomm,work,lwork)
778 : else
779 : ! full update + merging
780 : call qr_tmerge_pdlarfb_1dcomm_&
781 : &PRECISION &
782 : (m,mb,update_cols,n-(current_column+rank-1),actualrank, &
783 : v(1,current_column+(rank-actualrank)),ldv, &
784 : t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
785 0 : a(1,1),lda,baseidx,rev,updatemode,mpicomm,work,lwork)
786 : end if
787 : else
788 : call qr_tmerge_pdlarfb_1dcomm_&
789 : &PRECISION &
790 : (m,mb,0,n-(current_column+rank-1),actualrank, &
791 : v(1,current_column+(rank-actualrank)), &
792 : ldv, &
793 : t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
794 0 : a,lda,baseidx,rev,updatemode,mpicomm,work,lwork)
795 : end if
796 :
797 : end do
798 : call obj%timer%stop("qr_pdgeqr2_1dcomm_&
799 : &PRECISION&
800 0 : &")
801 : end subroutine
802 :
803 : ! incx == 1: column major
804 : ! incx != 1: row major
805 : subroutine qr_pdlarfg_1dcomm_&
806 0 : &PRECISION &
807 : (obj,x,incx,tau,work,lwork,n,idx,nb,hgmode,rev,communicator)
808 :
809 : use precision
810 : !use elpa1_impl !check this
811 : use qr_utils_mod
812 : use elpa_abstract_impl
813 : implicit none
814 :
815 : class(elpa_abstract_impl_t), intent(inout) :: obj
816 : ! parameter setup
817 : INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2, eps_ = 3
818 :
819 : ! input variables (local)
820 : integer(kind=ik) :: incx,lwork,hgmode
821 : real(kind=C_DATATYPE_KIND) :: x(*),work(*)
822 :
823 : ! input variables (global)
824 : integer(kind=ik) :: communicator,nb,idx,n,rev
825 :
826 : ! output variables (global)
827 : real(kind=C_DATATYPE_KIND) :: tau
828 :
829 : ! local scalars
830 : integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top
831 : integer(kind=ik) :: sendsize,recvsize
832 : integer(kind=ik) :: local_size,local_offset,baseoffset
833 : integer(kind=ik) :: topidx,top,iproc
834 : real(kind=C_DATATYPE_KIND) :: alpha,xnorm,dot,xf
835 :
836 : ! external functions
837 : #ifdef DOUBLE_PRECISION_REAL
838 : real(kind=C_DATATYPE_KIND), external :: ddot,dlapy2,dnrm2
839 : #else
840 : real(kind=C_DATATYPE_KIND), external :: sdot,slapy2,snrm2
841 : #endif
842 : external :: dscal
843 :
844 : ! intrinsic
845 : ! intrinsic sign
846 : call obj%timer%start("qr_pdlarfg_1dcomm_&
847 : &PRECISION&
848 0 : &")
849 0 : if (idx .le. 1) then
850 0 : tau = 0.0d0
851 : call obj%timer%stop("qr_pdlarfg_1dcomm_&
852 : &PRECISION&
853 0 : &")
854 0 : return
855 : end if
856 0 : call MPI_Comm_rank(communicator, mpirank, mpierr)
857 0 : call MPI_Comm_size(communicator, mpiprocs, mpierr)
858 : ! calculate expected work size and store in work(1)
859 0 : if (hgmode .eq. ichar('s')) then
860 : ! allreduce (MPI_SUM)
861 0 : sendsize = 2
862 0 : recvsize = sendsize
863 0 : else if (hgmode .eq. ichar('x')) then
864 : ! alltoall
865 0 : sendsize = mpiprocs*2
866 0 : recvsize = sendsize
867 0 : else if (hgmode .eq. ichar('g')) then
868 : ! allgather
869 0 : sendsize = 2
870 0 : recvsize = mpiprocs*sendsize
871 : else
872 : ! no exchange at all (benchmarking)
873 0 : sendsize = 2
874 0 : recvsize = sendsize
875 : end if
876 :
877 0 : if (lwork .eq. -1) then
878 : #ifdef DOUBLE_PRECISION_REAL
879 0 : work(1) = real(sendsize + recvsize,kind=C_DATATYPE_KIND)
880 : #else
881 0 : work(1) = real(sendsize + recvsize,kind=rk4)
882 : #endif
883 :
884 : call obj%timer%stop("qr_pdlarfg_1dcomm_&
885 : &PRECISION&
886 0 : &")
887 0 : return
888 : end if
889 :
890 : ! Processor id for global index of top element
891 0 : mpirank_top = MOD((idx-1)/nb,mpiprocs)
892 0 : if (mpirank .eq. mpirank_top) then
893 0 : topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
894 0 : top = 1+(topidx-1)*incx
895 : end if
896 :
897 : call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
898 0 : local_size,baseoffset,local_offset)
899 :
900 0 : local_offset = local_offset * incx
901 :
902 : ! calculate and exchange information
903 0 : if (hgmode .eq. ichar('s')) then
904 0 : if (mpirank .eq. mpirank_top) then
905 0 : alpha = x(top)
906 : else
907 : #ifdef DOUBLE_PRECISION_REAL
908 0 : alpha = 0.0_rk8
909 : #else
910 0 : alpha = 0.0_rk4
911 : #endif
912 : end if
913 : #ifdef DOUBLE_PRECISION_REAL
914 : dot = ddot(local_size, &
915 : x(local_offset), incx, &
916 0 : x(local_offset), incx)
917 : #else
918 : dot = sdot(local_size, &
919 : x(local_offset), incx, &
920 0 : x(local_offset), incx)
921 : #endif
922 0 : work(1) = alpha
923 0 : work(2) = dot
924 : #ifdef WITH_MPI
925 :
926 : #ifdef DOUBLE_PRECISION_REAL
927 : call mpi_allreduce(work(1),work(sendsize+1), &
928 : sendsize,mpi_real8,mpi_sum, &
929 0 : communicator,mpierr)
930 : #else
931 : call mpi_allreduce(work(1),work(sendsize+1), &
932 : sendsize,mpi_real4,mpi_sum, &
933 0 : communicator,mpierr)
934 : #endif
935 :
936 : #else
937 0 : work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
938 : #endif
939 0 : alpha = work(sendsize+1)
940 0 : xnorm = sqrt(work(sendsize+2))
941 0 : else if (hgmode .eq. ichar('x')) then
942 0 : if (mpirank .eq. mpirank_top) then
943 0 : alpha = x(top)
944 : else
945 : #ifdef DOUBLE_PRECISION_REAL
946 0 : alpha = 0.0_rk8
947 : #else
948 0 : alpha = 0.0_rk4
949 : #endif
950 : end if
951 : #ifdef DOUBLE_PRECISION_REAL
952 0 : xnorm = dnrm2(local_size, x(local_offset), incx)
953 : #else
954 0 : xnorm = snrm2(local_size, x(local_offset), incx)
955 : #endif
956 0 : do iproc=0,mpiprocs-1
957 0 : work(2*iproc+1) = alpha
958 0 : work(2*iproc+2) = xnorm
959 : end do
960 : #ifdef WITH_MPI
961 :
962 : #ifdef DOUBLE_PRECISION_REAL
963 : call mpi_alltoall(work(1),2,mpi_real8, &
964 : work(sendsize+1),2,mpi_real8, &
965 0 : communicator,mpierr)
966 : #else
967 : call mpi_alltoall(work(1),2,mpi_real4, &
968 : work(sendsize+1),2,mpi_real4, &
969 0 : communicator,mpierr)
970 : #endif
971 :
972 : #else
973 0 : work(sendsize+1:sendsize+1+2-1) = work(1:2)
974 : #endif
975 : ! extract alpha value
976 0 : alpha = work(sendsize+1+mpirank_top*2)
977 :
978 : ! copy norm parts of buffer to beginning
979 0 : do iproc=0,mpiprocs-1
980 0 : work(iproc+1) = work(sendsize+1+2*iproc+1)
981 : end do
982 :
983 : #ifdef DOUBLE_PRECISION_REAL
984 0 : xnorm = dnrm2(mpiprocs, work(1), 1)
985 : #else
986 0 : xnorm = snrm2(mpiprocs, work(1), 1)
987 : #endif
988 0 : else if (hgmode .eq. ichar('g')) then
989 0 : if (mpirank .eq. mpirank_top) then
990 0 : alpha = x(top)
991 : else
992 : #ifdef DOUBLE_PRECISION_REAL
993 0 : alpha = 0.0_rk8
994 : #else
995 0 : alpha = 0.0_rk4
996 : #endif
997 : end if
998 : #ifdef DOUBLE_PRECISION_REAL
999 0 : xnorm = dnrm2(local_size, x(local_offset), incx)
1000 : #else
1001 0 : xnorm = snrm2(local_size, x(local_offset), incx)
1002 : #endif
1003 0 : work(1) = alpha
1004 0 : work(2) = xnorm
1005 :
1006 : ! allgather
1007 : #ifdef WITH_MPI
1008 :
1009 : #ifdef DOUBLE_PRECISION_REAL
1010 : call mpi_allgather(work(1),sendsize,mpi_real8, &
1011 : work(sendsize+1),sendsize,mpi_real8, &
1012 0 : communicator,mpierr)
1013 : #else
1014 : call mpi_allgather(work(1),sendsize,mpi_real4, &
1015 : work(sendsize+1),sendsize,mpi_real4, &
1016 0 : communicator,mpierr)
1017 : #endif
1018 :
1019 : #else
1020 0 : work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
1021 : #endif
1022 : ! extract alpha value
1023 0 : alpha = work(sendsize+1+mpirank_top*2)
1024 :
1025 : ! copy norm parts of buffer to beginning
1026 0 : do iproc=0,mpiprocs-1
1027 0 : work(iproc+1) = work(sendsize+1+2*iproc+1)
1028 : end do
1029 : #ifdef DOUBLE_PRECISION_REAL
1030 0 : xnorm = dnrm2(mpiprocs, work(1), 1)
1031 : #else
1032 0 : xnorm = snrm2(mpiprocs, work(1), 1)
1033 : #endif
1034 : else
1035 : ! dnrm2
1036 : #ifdef DOUBLE_PRECISION_REAL
1037 0 : xnorm = dnrm2(local_size, x(local_offset), incx)
1038 : #else
1039 0 : xnorm = snrm2(local_size, x(local_offset), incx)
1040 : #endif
1041 0 : if (mpirank .eq. mpirank_top) then
1042 0 : alpha = x(top)
1043 : else
1044 : #ifdef DOUBLE_PRECISION_REAL
1045 0 : alpha = 0.0_rk8
1046 : #else
1047 0 : alpha = 0.0_rk4
1048 : #endif
1049 : end if
1050 :
1051 : ! no exchange at all (benchmarking)
1052 : #ifdef DOUBLE_PRECISION_REAL
1053 0 : xnorm = 0.0_rk8
1054 : #else
1055 0 : xnorm = 0.0_rk4
1056 : #endif
1057 : end if
1058 :
1059 : !print *,'ref hg:', idx,xnorm,alpha
1060 : !print *,x(1:n)
1061 :
1062 : ! calculate householder information
1063 : #ifdef DOUBLE_PRECISION_REAL
1064 0 : if (xnorm .eq. 0.0_rk8) then
1065 : ! H = I
1066 :
1067 0 : tau = 0.0_rk8
1068 : #else
1069 0 : if (xnorm .eq. 0.0_rk4) then
1070 : ! H = I
1071 :
1072 0 : tau = 0.0_rk4
1073 : #endif
1074 : else
1075 : ! General case
1076 : call hh_transform_real_&
1077 : &PRECISION &
1078 0 : (obj,alpha,xnorm**2,xf,tau, .false.)
1079 0 : if (mpirank .eq. mpirank_top) then
1080 0 : x(top) = alpha
1081 : end if
1082 : #ifdef DOUBLE_PRECISION_REAL
1083 : call dscal(local_size, xf, &
1084 0 : x(local_offset), incx)
1085 : #else
1086 : call sscal(local_size, xf, &
1087 0 : x(local_offset), incx)
1088 : #endif
1089 :
1090 : ! TODO: reimplement norm rescale method of
1091 : ! original PDLARFG using mpi?
1092 :
1093 : end if
1094 :
1095 : ! useful for debugging
1096 : !print *,'hg:mpirank,idx,beta,alpha:',mpirank,idx,beta,alpha,1.0d0/(beta+alpha),tau
1097 : !print *,x(1:n)
1098 : call obj%timer%stop("qr_pdlarfg_1dcomm_&
1099 : &PRECISION&
1100 0 : &")
1101 : end subroutine
1102 :
1103 : subroutine qr_pdlarfg2_1dcomm_ref_&
1104 0 : &PRECISION &
1105 0 : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,idx,mb,PQRPARAM,rev,mpicomm,actualk)
1106 : use precision
1107 : use elpa_abstract_impl
1108 : implicit none
1109 :
1110 : class(elpa_abstract_impl_t), intent(inout) :: obj
1111 : ! parameter setup
1112 : INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4
1113 : ! input variables (local)
1114 : integer(kind=ik) :: lda,lwork,ldv,ldt
1115 : real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)
1116 :
1117 : ! input variables (global)
1118 : integer(kind=ik) :: m,idx,baseidx,mb,rev,mpicomm
1119 : #ifdef USE_ASSUMED_SIZE_QR
1120 : integer(kind=ik) :: PQRPARAM(*)
1121 : #else
1122 : integer(kind=ik) :: PQRPARAM(:)
1123 : #endif
1124 : ! output variables (global)
1125 : integer(kind=ik) :: actualk
1126 :
1127 : ! derived input variables from QR_PQRPARAM
1128 : integer(kind=ik) :: eps
1129 :
1130 : ! local scalars
1131 : real(kind=C_DATATYPE_KIND) :: dseedwork_size(1)
1132 : integer(kind=ik) :: seedwork_size,seed_size
1133 : integer(kind=ik) :: seedwork_offset,seed_offset
1134 : logical :: accurate
1135 : call obj%timer%start("qr_pdlarfg2_1dcomm_&
1136 : &PRECISION&
1137 0 : &")
1138 :
1139 : call qr_pdlarfg2_1dcomm_seed_&
1140 : &PRECISION &
1141 0 : (obj,a,lda,dseedwork_size(1),-1,work,m,mb,idx,rev,mpicomm)
1142 0 : seedwork_size = int(dseedwork_size(1))
1143 0 : seed_size = seedwork_size
1144 :
1145 0 : if (lwork .eq. -1) then
1146 0 : work(1) = seedwork_size + seed_size
1147 : call obj%timer%stop("qr_pdlarfg2_1dcomm_&
1148 : &PRECISION&
1149 0 : &")
1150 :
1151 0 : return
1152 : end if
1153 :
1154 0 : seedwork_offset = 1
1155 0 : seed_offset = seedwork_offset + seedwork_size
1156 :
1157 0 : eps = PQRPARAM(3)
1158 :
1159 : ! check for border cases (only a 2x2 matrix left)
1160 0 : if (idx .le. 1) then
1161 : #ifdef DOUBLE_PRECISION_REAL
1162 0 : tau(1:2) = 0.0_rk8
1163 0 : t(1:2,1:2) = 0.0_rk8
1164 : #else
1165 0 : tau(1:2) = 0.0_rk4
1166 0 : t(1:2,1:2) = 0.0_rk4
1167 : #endif
1168 :
1169 : call obj%timer%stop("qr_pdlarfg2_1dcomm_&
1170 : &PRECISION&
1171 0 : &")
1172 :
1173 0 : return
1174 : end if
1175 :
1176 : call qr_pdlarfg2_1dcomm_seed_&
1177 : &PRECISION &
1178 0 : (obj,a,lda,work(seedwork_offset),lwork,work(seed_offset),m,mb,idx,rev,mpicomm)
1179 :
1180 0 : if (eps .gt. 0) then
1181 : accurate = qr_pdlarfg2_1dcomm_check_&
1182 : &PRECISION &
1183 0 : (obj,work(seed_offset),eps)
1184 : else
1185 0 : accurate = .true.
1186 : end if
1187 :
1188 : call qr_pdlarfg2_1dcomm_vector_&
1189 : &PRECISION &
1190 : (obj,a(1,2),1,tau(2),work(seed_offset), &
1191 0 : m,mb,idx,0,1,mpicomm)
1192 :
1193 : call qr_pdlarfg_copy_1dcomm_&
1194 : &PRECISION &
1195 : (obj,a(1,2),1, &
1196 : v(1,2),1, &
1197 0 : m,baseidx,idx,mb,1,mpicomm)
1198 :
1199 : call qr_pdlarfg2_1dcomm_update_&
1200 : &PRECISION &
1201 0 : (obj,v(1,2),1,baseidx,a(1,1),lda,work(seed_offset),m,idx,mb,rev,mpicomm)
1202 :
1203 : ! check for 2x2 matrix case => only one householder Vector will be
1204 : ! generated
1205 0 : if (idx .gt. 2) then
1206 0 : if (accurate .eqv. .true.) then
1207 : call qr_pdlarfg2_1dcomm_vector_&
1208 : &PRECISION &
1209 : (obj,a(1,1),1,tau(1),work(seed_offset), &
1210 0 : m,mb,idx-1,1,1,mpicomm)
1211 :
1212 : call qr_pdlarfg_copy_1dcomm_&
1213 : &PRECISION &
1214 : (obj,a(1,1),1, &
1215 : v(1,1),1, &
1216 0 : m,baseidx,idx-1,mb,1,mpicomm)
1217 :
1218 : ! generate fuse element
1219 : call qr_pdlarfg2_1dcomm_finalize_tmatrix_&
1220 : &PRECISION &
1221 0 : (obj,work(seed_offset),tau,t,ldt)
1222 :
1223 0 : actualk = 2
1224 : else
1225 : #ifdef DOUBLE_PRECISION_REAL
1226 0 : t(1,1) = 0.0_rk8
1227 0 : t(1,2) = 0.0_rk8
1228 : #else
1229 0 : t(1,1) = 0.0_rk4
1230 0 : t(1,2) = 0.0_rk4
1231 : #endif
1232 0 : t(2,2) = tau(2)
1233 :
1234 0 : actualk = 1
1235 : end if
1236 : else
1237 : #ifdef DOUBLE_PRECISION_REAL
1238 0 : t(1,1) = 0.0_rk8
1239 0 : t(1,2) = 0.0_rk8
1240 : #else
1241 0 : t(1,1) = 0.0_rk4
1242 0 : t(1,2) = 0.0_rk4
1243 : #endif
1244 0 : t(2,2) = tau(2)
1245 :
1246 : ! no more vectors to create
1247 : #ifdef DOUBLE_PRECISION_REAL
1248 0 : tau(1) = 0.0_rk8
1249 : #else
1250 0 : tau(1) = 0.0_rk4
1251 : #endif
1252 :
1253 0 : actualk = 2
1254 :
1255 : !print *,'rank2: no more data'
1256 : end if
1257 : call obj%timer%stop("qr_pdlarfg2_1dcomm_&
1258 : &PRECISION&
1259 0 : &")
1260 :
1261 : end subroutine
1262 :
1263 : subroutine qr_pdlarfg2_1dcomm_seed_&
1264 0 : &PRECISION &
1265 0 : (obj,a,lda,work,lwork,seed,n,nb,idx,rev,mpicomm)
1266 : use precision
1267 : !use elpa1_impl ! check this
1268 : use qr_utils_mod
1269 : use elpa_abstract_impl
1270 : implicit none
1271 :
1272 : class(elpa_abstract_impl_t), intent(inout) :: obj
1273 : ! input variables (local)
1274 : integer(kind=ik) :: lda,lwork
1275 : real(kind=C_DATATYPE_KIND) :: a(lda,*),work(*),seed(*)
1276 :
1277 : ! input variables (global)
1278 : integer(kind=ik) :: n,nb,idx,rev,mpicomm
1279 :
1280 : ! output variables (global)
1281 :
1282 : ! external functions
1283 : #ifdef DOUBLE_PRECISION_REAL
1284 : real(kind=C_DATATYPE_KIND), external :: ddot
1285 : #else
1286 : real(kind=C_DATATYPE_KIND), external :: sdot
1287 : #endif
1288 : ! local scalars
1289 : real(kind=C_DATATYPE_KIND) :: top11,top21,top12,top22
1290 : real(kind=C_DATATYPE_KIND) :: dot11,dot12,dot22
1291 : integer(kind=ik) :: mpirank,mpiprocs,mpierr
1292 : integer(kind=ik) :: mpirank_top11,mpirank_top21
1293 : integer(kind=ik) :: top11_offset,top21_offset
1294 : integer(kind=ik) :: baseoffset
1295 : integer(kind=ik) :: local_offset1,local_size1
1296 : integer(kind=ik) :: local_offset2,local_size2
1297 :
1298 : call obj%timer%start("qr_pdlarfg2_1dcomm_seed_&
1299 : &PRECISION&
1300 0 : &")
1301 :
1302 0 : if (lwork .eq. -1) then
1303 : #ifdef DOUBLE_PRECISION_REAL
1304 0 : work(1) = 8.0_rk8
1305 : #else
1306 0 : work(1) = 8.0_rk4
1307 : #endif
1308 :
1309 : call obj%timer%stop("qr_pdlarfg2_1dcomm_seed_&
1310 : &PRECISION&
1311 0 : &")
1312 0 : return
1313 : end if
1314 0 : call MPI_Comm_rank(mpicomm, mpirank, mpierr)
1315 0 : call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
1316 : call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
1317 0 : local_size1,baseoffset,local_offset1)
1318 :
1319 : call local_size_offset_1d(n,nb,idx,idx-2,rev,mpirank,mpiprocs, &
1320 0 : local_size2,baseoffset,local_offset2)
1321 :
1322 0 : mpirank_top11 = MOD((idx-1)/nb,mpiprocs)
1323 0 : mpirank_top21 = MOD((idx-2)/nb,mpiprocs)
1324 :
1325 0 : top11_offset = local_index(idx,mpirank_top11,mpiprocs,nb,0)
1326 0 : top21_offset = local_index(idx-1,mpirank_top21,mpiprocs,nb,0)
1327 :
1328 0 : if (mpirank_top11 .eq. mpirank) then
1329 0 : top11 = a(top11_offset,2)
1330 0 : top12 = a(top11_offset,1)
1331 : else
1332 : #ifdef DOUBLE_PRECISION_REAL
1333 0 : top11 = 0.0_rk8
1334 0 : top12 = 0.0_rk8
1335 : #else
1336 0 : top11 = 0.0_rk4
1337 0 : top12 = 0.0_rk4
1338 : #endif
1339 : end if
1340 :
1341 0 : if (mpirank_top21 .eq. mpirank) then
1342 0 : top21 = a(top21_offset,2)
1343 0 : top22 = a(top21_offset,1)
1344 : else
1345 : #ifdef DOUBLE_PRECISION_REAL
1346 0 : top21 = 0.0_rk8
1347 0 : top22 = 0.0_rk8
1348 : #else
1349 0 : top21 = 0.0_rk4
1350 0 : top22 = 0.0_rk4
1351 : #endif
1352 : end if
1353 :
1354 : ! calculate 3 dot products
1355 : #ifdef DOUBLE_PRECISION_REAL
1356 0 : dot11 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1)
1357 0 : dot12 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1)
1358 0 : dot22 = ddot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1)
1359 : #else
1360 0 : dot11 = sdot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1)
1361 0 : dot12 = sdot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1)
1362 0 : dot22 = sdot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1)
1363 : #endif
1364 : ! store results in work buffer
1365 0 : work(1) = top11
1366 0 : work(2) = dot11
1367 0 : work(3) = top12
1368 0 : work(4) = dot12
1369 0 : work(5) = top21
1370 0 : work(6) = top22
1371 0 : work(7) = dot22
1372 : #ifdef DOUBLE_PRECISION_REAL
1373 0 : work(8) = 0.0_rk8! fill up buffer
1374 : #else
1375 0 : work(8) = 0.0_rk4! fill up buffer
1376 : #endif
1377 : ! exchange partial results
1378 : #ifdef WITH_MPI
1379 :
1380 : #ifdef DOUBLE_PRECISION_REAL
1381 : call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, &
1382 0 : mpicomm, mpierr)
1383 : #else
1384 : call mpi_allreduce(work, seed, 8, mpi_real4, mpi_sum, &
1385 0 : mpicomm, mpierr)
1386 : #endif
1387 :
1388 : #else
1389 0 : seed(1:8) = work(1:8)
1390 : #endif
1391 :
1392 : call obj%timer%stop("qr_pdlarfg2_1dcomm_seed_&
1393 : &PRECISION&
1394 0 : &")
1395 : end subroutine
1396 :
1397 : logical function qr_pdlarfg2_1dcomm_check_&
1398 0 : &PRECISION &
1399 : (obj,seed,eps)
1400 : use precision
1401 : use elpa_abstract_impl
1402 : implicit none
1403 :
1404 : class(elpa_abstract_impl_t), intent(inout) :: obj
1405 :
1406 : ! input variables
1407 : real(kind=C_DATATYPE_KIND) :: seed(*)
1408 : integer(kind=ik) :: eps
1409 :
1410 : ! local scalars
1411 : real(kind=C_DATATYPE_KIND) :: epsd,first,second,first_second,estimate
1412 : logical :: accurate
1413 : real(kind=C_DATATYPE_KIND) :: dot11,dot12,dot22
1414 : real(kind=C_DATATYPE_KIND) :: top11,top12,top21,top22
1415 : call obj%timer%start("qr_pdlarfg2_1dcomm_check_&
1416 : &PRECISION&
1417 0 : &")
1418 :
1419 0 : EPSD = EPS
1420 :
1421 0 : top11 = seed(1)
1422 0 : dot11 = seed(2)
1423 0 : top12 = seed(3)
1424 0 : dot12 = seed(4)
1425 :
1426 0 : top21 = seed(5)
1427 0 : top22 = seed(6)
1428 0 : dot22 = seed(7)
1429 :
1430 : ! reconstruct the whole inner products
1431 : ! (including squares of the top elements)
1432 0 : first = dot11 + top11*top11
1433 0 : second = dot22 + top22*top22 + top12*top12
1434 0 : first_second = dot12 + top11*top12
1435 :
1436 : ! zero Householder Vector (zero norm) case
1437 : #ifdef DOUBLE_PRECISION_REAL
1438 0 : if (first*second .eq. 0.0_rk8) then
1439 : #else
1440 0 : if (first*second .eq. 0.0_rk4) then
1441 : #endif
1442 : qr_pdlarfg2_1dcomm_check_&
1443 : &PRECISION &
1444 0 : = .false.
1445 : call obj%timer%stop("qr_pdlarfg2_1dcomm_check_&
1446 : &PRECISION&
1447 0 : &")
1448 :
1449 0 : return
1450 : end if
1451 :
1452 0 : estimate = abs((first_second*first_second)/(first*second))
1453 :
1454 : !print *,'estimate:',estimate
1455 :
1456 : ! if accurate the following check holds
1457 : #ifdef DOUBLE_PRECISION_REAL
1458 0 : accurate = (estimate .LE. (epsd/(1.0_rk8+epsd)))
1459 : #else
1460 0 : accurate = (estimate .LE. (epsd/(1.0_rk4+epsd)))
1461 : #endif
1462 : qr_pdlarfg2_1dcomm_check_&
1463 : &PRECISION &
1464 0 : = accurate
1465 : call obj%timer%stop("qr_pdlarfg2_1dcomm_check_&
1466 : &PRECISION&
1467 0 : &")
1468 :
1469 0 : end function
1470 :
1471 : ! id=0: first Vector
1472 : ! id=1: second Vector
1473 : subroutine qr_pdlarfg2_1dcomm_vector_&
1474 0 : &PRECISION &
1475 : (obj,x,incx,tau,seed,n,nb,idx,id,rev,mpicomm)
1476 : use precision
1477 : use elpa1_impl
1478 : use qr_utils_mod
1479 : use elpa_abstract_impl
1480 : implicit none
1481 : class(elpa_abstract_impl_t), intent(inout) :: obj
1482 : ! input variables (local)
1483 : integer(kind=ik) :: incx
1484 : real(kind=C_DATATYPE_KIND) :: x(*),seed(*),tau
1485 :
1486 : ! input variables (global)
1487 : integer(kind=ik) :: n,nb,idx,id,rev,mpicomm
1488 :
1489 : ! output variables (global)
1490 :
1491 : ! external functions
1492 : #ifdef DOUBLE_PRECISION_REAL
1493 : real(kind=C_DATATYPE_KIND), external :: dlapy2
1494 : external :: dscal
1495 : #else
1496 : real(kind=rk4), external :: slapy2
1497 : external :: sscal
1498 : #endif
1499 : ! local scalars
1500 : integer(kind=ik) :: mpirank,mpirank_top,mpiprocs,mpierr
1501 : real(kind=C_DATATYPE_KIND) :: alpha,dot,beta,xnorm
1502 : integer(kind=ik) :: local_size,baseoffset,local_offset,top,topidx
1503 : call obj%timer%start("qr_pdlarfg2_1dcomm_vector_&
1504 : &PRECISION&
1505 0 : &")
1506 :
1507 0 : call MPI_Comm_rank(mpicomm, mpirank, mpierr)
1508 0 : call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
1509 : call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
1510 0 : local_size,baseoffset,local_offset)
1511 :
1512 0 : local_offset = local_offset * incx
1513 :
1514 : ! Processor id for global index of top element
1515 0 : mpirank_top = MOD((idx-1)/nb,mpiprocs)
1516 0 : if (mpirank .eq. mpirank_top) then
1517 0 : topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
1518 0 : top = 1+(topidx-1)*incx
1519 : else
1520 0 : top = -99
1521 0 : stop
1522 : end if
1523 :
1524 0 : alpha = seed(id*5+1)
1525 0 : dot = seed(id*5+2)
1526 :
1527 0 : xnorm = sqrt(dot)
1528 : #ifdef DOUBLE_PRECISION_REAL
1529 0 : if (xnorm .eq. 0.0_rk8) then
1530 : ! H = I
1531 :
1532 0 : tau = 0.0_rk8
1533 : #else
1534 0 : if (xnorm .eq. 0.0_rk4) then
1535 : ! H = I
1536 :
1537 0 : tau = 0.0_rk4
1538 : #endif
1539 : else
1540 : ! General case
1541 : #ifdef DOUBLE_PRECISION_REAL
1542 0 : beta = sign(dlapy2(alpha, xnorm), alpha)
1543 : #else
1544 0 : beta = sign(slapy2(alpha, xnorm), alpha)
1545 : #endif
1546 0 : tau = (beta+alpha) / beta
1547 :
1548 : !print *,'hg2',tau,xnorm,alpha
1549 : #ifdef DOUBLE_PRECISION_REAL
1550 : call dscal(local_size, 1.0_rk8/(beta+alpha), &
1551 0 : x(local_offset), incx)
1552 : #else
1553 : call sscal(local_size, 1.0_rk4/(beta+alpha), &
1554 0 : x(local_offset), incx)
1555 : #endif
1556 :
1557 : ! TODO: reimplement norm rescale method of
1558 : ! original PDLARFG using mpi?
1559 :
1560 0 : if (mpirank .eq. mpirank_top) then
1561 0 : x(top) = -beta
1562 : end if
1563 :
1564 0 : seed(8) = beta
1565 : end if
1566 : call obj%timer%stop("qr_pdlarfg2_1dcomm_vector_&
1567 : &PRECISION&
1568 0 : &")
1569 :
1570 0 : end subroutine
1571 :
1572 : subroutine qr_pdlarfg2_1dcomm_update_&
1573 0 : &PRECISION &
1574 0 : (obj,v,incv,baseidx,a,lda,seed,n,idx,nb,rev,mpicomm)
1575 : use precision
1576 : use elpa1_impl
1577 : use qr_utils_mod
1578 : use elpa_abstract_impl
1579 : implicit none
1580 : class(elpa_abstract_impl_t), intent(inout) :: obj
1581 : ! input variables (local)
1582 : integer(kind=ik) :: incv,lda
1583 : real(kind=C_DATATYPE_KIND) :: v(*),a(lda,*),seed(*)
1584 :
1585 : ! input variables (global)
1586 : integer(kind=ik) :: n,baseidx,idx,nb,rev,mpicomm
1587 :
1588 : ! output variables (global)
1589 :
1590 : ! external functions
1591 : external daxpy
1592 :
1593 : ! local scalars
1594 : integer(kind=ik) :: mpirank,mpiprocs,mpierr
1595 : integer(kind=ik) :: local_size,local_offset,baseoffset
1596 : real(kind=C_DATATYPE_KIND) :: z,coeff,beta
1597 : real(kind=C_DATATYPE_KIND) :: dot11,dot12,dot22
1598 : real(kind=C_DATATYPE_KIND) :: top11,top12,top21,top22
1599 : call obj%timer%start("qr_pdlarfg2_1dcomm_update_&
1600 : &PRECISION&
1601 0 : &")
1602 :
1603 0 : call MPI_Comm_rank(mpicomm, mpirank, mpierr)
1604 0 : call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
1605 :
1606 : ! seed should be updated by previous householder generation
1607 : ! Update inner product of this column and next column Vector
1608 0 : top11 = seed(1)
1609 0 : dot11 = seed(2)
1610 0 : top12 = seed(3)
1611 0 : dot12 = seed(4)
1612 :
1613 0 : top21 = seed(5)
1614 0 : top22 = seed(6)
1615 0 : dot22 = seed(7)
1616 0 : beta = seed(8)
1617 :
1618 : call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, &
1619 0 : local_size,baseoffset,local_offset)
1620 0 : baseoffset = baseoffset * incv
1621 :
1622 : ! zero Householder Vector (zero norm) case
1623 : #ifdef DOUBLE_PRECISION_REAL
1624 0 : if (beta .eq. 0.0_rk8) then
1625 : #else
1626 0 : if (beta .eq. 0.0_rk4) then
1627 : #endif
1628 :
1629 : call obj%timer%stop("qr_pdlarfg2_1dcomm_update_&
1630 : &PRECISION&
1631 0 : &")
1632 0 : return
1633 : end if
1634 0 : z = (dot12 + top11 * top12) / beta + top12
1635 :
1636 : !print *,'hg2 update:',baseidx,idx,mpirank,local_size
1637 : #ifdef DOUBLE_PRECISION_REAL
1638 0 : call daxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1)
1639 : #else
1640 0 : call saxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1)
1641 : #endif
1642 : ! prepare a full dot22 for update
1643 0 : dot22 = dot22 + top22*top22
1644 :
1645 : ! calculate coefficient
1646 0 : COEFF = z / (top11 + beta)
1647 :
1648 : ! update inner product of next Vector
1649 0 : dot22 = dot22 - coeff * (2*dot12 - coeff*dot11)
1650 :
1651 : ! update dot12 value to represent update with first Vector
1652 : ! (needed for T matrix)
1653 0 : dot12 = dot12 - COEFF * dot11
1654 :
1655 : ! update top element of next Vector
1656 0 : top22 = top22 - coeff * top21
1657 0 : seed(6) = top22
1658 :
1659 : ! restore separated dot22 for Vector generation
1660 0 : seed(7) = dot22 - top22*top22
1661 :
1662 : !------------------------------------------------------
1663 : ! prepare elements for T matrix
1664 0 : seed(4) = dot12
1665 :
1666 : ! prepare dot matrix for fuse element of T matrix
1667 : ! replace top11 value with -beta1
1668 0 : seed(1) = beta
1669 : call obj%timer%stop("qr_pdlarfg2_1dcomm_update_&
1670 : &PRECISION&
1671 0 : &")
1672 :
1673 : end subroutine
1674 :
1675 : ! run this function after second Vector
1676 : subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix_&
1677 0 : &PRECISION &
1678 0 : (obj,seed,tau,t,ldt)
1679 : use precision
1680 : use elpa_abstract_impl
1681 : implicit none
1682 : class(elpa_abstract_impl_t), intent(inout) :: obj
1683 :
1684 : integer(kind=ik) :: ldt
1685 : real(kind=C_DATATYPE_KIND) :: seed(*),t(ldt,*),tau(*)
1686 : real(kind=C_DATATYPE_KIND) :: dot12,beta1,top21,beta2
1687 : call obj%timer%start("qr_pdlarfg2_1dcomm_finalize_tmatrix_&
1688 : &PRECISION&
1689 0 : &")
1690 :
1691 0 : beta1 = seed(1)
1692 0 : dot12 = seed(4)
1693 0 : top21 = seed(5)
1694 0 : beta2 = seed(8)
1695 :
1696 : !print *,'beta1 beta2',beta1,beta2
1697 :
1698 0 : dot12 = dot12 / beta2 + top21
1699 0 : dot12 = -(dot12 / beta1)
1700 :
1701 0 : t(1,1) = tau(1)
1702 0 : t(1,2) = dot12
1703 0 : t(2,2) = tau(2)
1704 : call obj%timer%stop("qr_pdlarfg2_1dcomm_finalize_tmatrix_&
1705 : &PRECISION&
1706 0 : &")
1707 :
1708 0 : end subroutine
1709 :
1710 : subroutine qr_pdlarfgk_1dcomm_&
1711 0 : &PRECISION &
1712 0 : (obj,a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,k,idx,mb,PQRPARAM,rev,mpicomm,actualk)
1713 : use precision
1714 : use elpa_abstract_impl
1715 : implicit none
1716 : class(elpa_abstract_impl_t), intent(inout) :: obj
1717 : ! parameter setup
1718 :
1719 : ! input variables (local)
1720 : integer(kind=ik) :: lda,lwork,ldv,ldt
1721 : real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)
1722 :
1723 : ! input variables (global)
1724 : integer(kind=ik) :: m,k,idx,baseidx,mb,rev,mpicomm
1725 : #ifdef USE_ASSUMED_SIZE_QR
1726 : integer(kind=ik) ::PQRPARAM(*)
1727 : #else
1728 : integer(kind=ik) :: PQRPARAM(:)
1729 : #endif
1730 : ! output variables (global)
1731 : integer(kind=ik) :: actualk
1732 :
1733 : ! local scalars
1734 : integer(kind=ik) :: ivector
1735 : real(kind=C_DATATYPE_KIND) :: pdlarfg_size(1),pdlarf_size(1)
1736 : real(kind=C_DATATYPE_KIND) :: pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1)
1737 : real(kind=C_DATATYPE_KIND) :: pdlarfgk_1dcomm_update_size(1)
1738 : integer(kind=ik) :: seedC_size,seedC_offset
1739 : integer(kind=ik) :: seedD_size,seedD_offset
1740 : integer(kind=ik) :: work_offset
1741 : call obj%timer%start("qr_pdlarfgk_1dcomm_&
1742 : &PRECISION&
1743 0 : &")
1744 :
1745 0 : seedC_size = k*k
1746 0 : seedC_offset = 1
1747 0 : seedD_size = k*k
1748 0 : seedD_offset = seedC_offset + seedC_size
1749 0 : work_offset = seedD_offset + seedD_size
1750 :
1751 0 : if (lwork .eq. -1) then
1752 : call qr_pdlarfg_1dcomm_&
1753 : &PRECISION &
1754 0 : (obj, a,1,tau(1),pdlarfg_size(1),-1,m,baseidx,mb,PQRPARAM(4),rev,mpicomm)
1755 :
1756 : call qr_pdlarfl_1dcomm_&
1757 : &PRECISION &
1758 0 : (v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,k,baseidx,mb,rev,mpicomm)
1759 : call qr_pdlarfgk_1dcomm_seed_&
1760 : &PRECISION &
1761 0 : (obj,a,lda,baseidx,pdlarfgk_1dcomm_seed_size(1),-1,work,work,m,k,mb,mpicomm)
1762 : #ifdef USE_ASSUMED_SIZE_QR
1763 : !call qr_pdlarfgk_1dcomm_check_&
1764 : !&PRECISION &
1765 : !(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk)
1766 : call qr_pdlarfgk_1dcomm_check_improved_&
1767 : &PRECISION &
1768 : (obj,work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk)
1769 : #else
1770 : !call qr_pdlarfgk_1dcomm_check_&
1771 : !&PRECISION &
1772 : !(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk)
1773 : call qr_pdlarfgk_1dcomm_check_improved_&
1774 : &PRECISION &
1775 0 : (obj,work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk)
1776 : #endif
1777 : call qr_pdlarfgk_1dcomm_update_&
1778 : &PRECISION &
1779 : (obj,a,lda,baseidx,pdlarfgk_1dcomm_update_size(1), &
1780 0 : -1,work,work,k,k,1,work,m,mb,rev,mpicomm)
1781 : work(1) = max(pdlarfg_size(1),pdlarf_size(1),pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1), &
1782 0 : pdlarfgk_1dcomm_update_size(1)) + real(seedC_size + seedD_size, kind=C_DATATYPE_KIND)
1783 :
1784 : call obj%timer%stop("qr_pdlarfgk_1dcomm_&
1785 : &PRECISION&
1786 0 : &")
1787 :
1788 0 : return
1789 : end if
1790 :
1791 : call qr_pdlarfgk_1dcomm_seed_&
1792 : &PRECISION &
1793 : (obj,a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), &
1794 0 : work(seedD_offset),m,k,mb,mpicomm)
1795 : #ifdef USE_ASSUMED_SIZE_QR
1796 : !call qr_pdlarfgk_1dcomm_check_&
1797 : !&PRECISION &
1798 : !(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk)
1799 : call qr_pdlarfgk_1dcomm_check_improved_&
1800 : &PRECISION &
1801 : (obj,work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk)
1802 :
1803 : #else
1804 : !call qr_pdlarfgk_1dcomm_check_&
1805 : !&PRECISION &
1806 : !(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk)
1807 : call qr_pdlarfgk_1dcomm_check_improved_&
1808 : &PRECISION &
1809 : (obj,work(seedC_offset),work(seedD_offset), &
1810 0 : k,PQRPARAM(:),work(work_offset),lwork,actualk)
1811 : #endif
1812 : !print *,'possible rank:', actualk
1813 :
1814 : ! override useful for debugging
1815 : !actualk = 1
1816 : !actualk = k
1817 : !actualk= min(actualk,2)
1818 0 : do ivector=1,actualk
1819 : call qr_pdlarfgk_1dcomm_vector_&
1820 : &PRECISION &
1821 : (obj,a(1,k-ivector+1),1,idx,tau(k-ivector+1), &
1822 : work(seedC_offset),work(seedD_offset),k, &
1823 0 : ivector,m,mb,rev,mpicomm)
1824 :
1825 : call qr_pdlarfgk_1dcomm_update_&
1826 : &PRECISION &
1827 : (obj,a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), &
1828 : work(seedD_offset),k,actualk,ivector,tau, &
1829 0 : m,mb,rev,mpicomm)
1830 :
1831 : call qr_pdlarfg_copy_1dcomm_&
1832 : &PRECISION &
1833 : (obj,a(1,k-ivector+1),1, &
1834 : v(1,k-ivector+1),1, &
1835 0 : m,baseidx,idx-ivector+1,mb,1,mpicomm)
1836 : end do
1837 :
1838 : ! generate final T matrix and convert preliminary tau values into real ones
1839 : call qr_pdlarfgk_1dcomm_generateT_&
1840 : &PRECISION &
1841 0 : (obj,work(seedC_offset),work(seedD_offset),k,actualk,tau,t,ldt)
1842 :
1843 : call obj%timer%stop("qr_pdlarfgk_1dcomm_&
1844 : &PRECISION&
1845 0 : &")
1846 : end subroutine
1847 :
1848 : subroutine qr_pdlarfgk_1dcomm_seed_&
1849 0 : &PRECISION &
1850 0 : (obj,a,lda,baseidx,work,lwork,seedC,seedD,m,k,mb,mpicomm)
1851 : use precision
1852 : use elpa1_impl
1853 : use qr_utils_mod
1854 : use elpa_abstract_impl
1855 : implicit none
1856 : class(elpa_abstract_impl_t), intent(inout) :: obj
1857 : ! parameter setup
1858 :
1859 : ! input variables (local)
1860 : integer(kind=ik) :: lda,lwork
1861 : real(kind=C_DATATYPE_KIND) :: a(lda,*), work(*)
1862 :
1863 : ! input variables (global)
1864 : integer(kind=ik) :: m,k,baseidx,mb,mpicomm
1865 : real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*)
1866 :
1867 : ! output variables (global)
1868 :
1869 : ! derived input variables from QR_PQRPARAM
1870 :
1871 : ! local scalars
1872 : integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top
1873 : integer(kind=ik) :: icol,irow,lidx,remsize
1874 : integer(kind=ik) :: remaining_rank
1875 :
1876 : integer(kind=ik) :: C_size,D_size,sendoffset,recvoffset,sendrecv_size
1877 : integer(kind=ik) :: localoffset,localsize,baseoffset
1878 : call obj%timer%start("qr_pdlarfgk_1dcomm_seed_&
1879 : &PRECISION&
1880 0 : &")
1881 :
1882 0 : call MPI_Comm_rank(mpicomm, mpirank, mpierr)
1883 0 : call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
1884 0 : C_size = k*k
1885 0 : D_size = k*k
1886 0 : sendoffset = 1
1887 0 : sendrecv_size = C_size+D_size
1888 0 : recvoffset = sendoffset + sendrecv_size
1889 :
1890 0 : if (lwork .eq. -1) then
1891 : #ifdef DOUBLE_PRECISION_REAL
1892 0 : work(1) = real(2*sendrecv_size,kind=C_DATATYPE_KIND)
1893 : #else
1894 0 : work(1) = real(2*sendrecv_size,kind=rk4)
1895 : #endif
1896 : call obj%timer%stop("qr_pdlarfgk_1dcomm_seed_&
1897 : &PRECISION&
1898 0 : &")
1899 :
1900 0 : return
1901 : end if
1902 :
1903 : ! clear buffer
1904 : #ifdef DOUBLE_PRECISION_REAL
1905 0 : work(sendoffset:sendoffset+sendrecv_size-1)=0.0_rk8
1906 : #else
1907 0 : work(sendoffset:sendoffset+sendrecv_size-1)=0.0_rk4
1908 : #endif
1909 : ! collect C part
1910 0 : do icol=1,k
1911 :
1912 0 : remaining_rank = k
1913 0 : do while (remaining_rank .gt. 0)
1914 0 : irow = k - remaining_rank + 1
1915 0 : lidx = baseidx - remaining_rank + 1
1916 :
1917 : ! determine chunk where the current top element is located
1918 0 : mpirank_top = MOD((lidx-1)/mb,mpiprocs)
1919 :
1920 : ! limit max number of remaining elements of this chunk to the block
1921 : ! distribution parameter
1922 0 : remsize = min(remaining_rank,mb)
1923 :
1924 : ! determine the number of needed elements in this chunk
1925 : call local_size_offset_1d(lidx+remsize-1,mb, &
1926 : lidx,lidx,0, &
1927 : mpirank_top,mpiprocs, &
1928 0 : localsize,baseoffset,localoffset)
1929 :
1930 : !print *,'local rank',localsize,localoffset
1931 :
1932 0 : if (mpirank .eq. mpirank_top) then
1933 : ! copy elements to buffer
1934 : work(sendoffset+(icol-1)*k+irow-1:sendoffset+(icol-1)*k+irow-1+localsize-1) &
1935 0 : = a(localoffset:localoffset+remsize-1,icol)
1936 : end if
1937 :
1938 : ! jump to next chunk
1939 0 : remaining_rank = remaining_rank - localsize
1940 : end do
1941 : end do
1942 :
1943 : ! collect D part
1944 : call local_size_offset_1d(m,mb,baseidx-k,baseidx-k,1, &
1945 : mpirank,mpiprocs, &
1946 0 : localsize,baseoffset,localoffset)
1947 :
1948 : !print *,'localsize',localsize,localoffset
1949 : #ifdef DOUBLE_PRECISION_REAL
1950 0 : if (localsize > 0) then
1951 : call dsyrk("Upper", "Trans", k, localsize, &
1952 : 1.0_rk8, a(localoffset,1), lda, &
1953 0 : 0.0_rk8, work(sendoffset+C_size), k)
1954 : else
1955 0 : work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0_rk8
1956 : end if
1957 : #else
1958 0 : if (localsize > 0) then
1959 : call ssyrk("Upper", "Trans", k, localsize, &
1960 : 1.0_rk4, a(localoffset,1), lda, &
1961 0 : 0.0_rk4, work(sendoffset+C_size), k)
1962 : else
1963 0 : work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0_rk4
1964 : end if
1965 : #endif
1966 :
1967 : ! TODO: store symmetric part more efficiently
1968 :
1969 : ! allreduce operation on results
1970 : #ifdef WITH_MPI
1971 :
1972 : #ifdef DOUBLE_PRECISION_REAL
1973 : call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, &
1974 0 : mpi_real8,mpi_sum,mpicomm,mpierr)
1975 : #else
1976 : call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, &
1977 0 : mpi_real4,mpi_sum,mpicomm,mpierr)
1978 : #endif
1979 :
1980 : #else
1981 0 : work(recvoffset:recvoffset+sendrecv_size-1) = work(sendoffset:sendoffset+sendrecv_size-1)
1982 : #endif
1983 : ! unpack result from buffer into seedC and seedD
1984 : #ifdef DOUBLE_PRECISION_REAL
1985 0 : seedC(1:k,1:k) = 0.0_rk8
1986 : #else
1987 0 : seedC(1:k,1:k) = 0.0_rk4
1988 : #endif
1989 0 : do icol=1,k
1990 0 : seedC(1:k,icol) = work(recvoffset+(icol-1)*k:recvoffset+icol*k-1)
1991 : end do
1992 : #ifdef DOUBLE_PRECISION_REAL
1993 0 : seedD(1:k,1:k) = 0.0_rk8
1994 : #else
1995 0 : seedD(1:k,1:k) = 0.0_rk4
1996 : #endif
1997 0 : do icol=1,k
1998 0 : seedD(1:k,icol) = work(recvoffset+C_size+(icol-1)*k:recvoffset+C_size+icol*k-1)
1999 : end do
2000 :
2001 : call obj%timer%stop("qr_pdlarfgk_1dcomm_seed_&
2002 : &PRECISION&
2003 0 : &")
2004 :
2005 : end subroutine
2006 :
2007 : ! k is assumed to be larger than two
2008 : subroutine qr_pdlarfgk_1dcomm_check_improved_&
2009 0 : &PRECISION &
2010 0 : (obj,seedC,seedD,k,PQRPARAM,work,lwork,possiblerank)
2011 : use precision
2012 : use elpa_abstract_impl
2013 : implicit none
2014 : class(elpa_abstract_impl_t), intent(inout) :: obj
2015 : ! input variables (global)
2016 : integer(kind=ik) :: k,lwork
2017 : #ifdef USE_ASSUMED_SIZE_QR
2018 : integer(kind=ik) :: PQRPARAM(*)
2019 :
2020 : #else
2021 : integer(kind=ik) :: PQRPARAM(:)
2022 : #endif
2023 : real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*),work(k,*)
2024 :
2025 : ! output variables (global)
2026 : integer(kind=ik) :: possiblerank
2027 :
2028 : ! derived input variables from QR_PQRPARAM
2029 : integer(kind=ik) :: eps
2030 :
2031 : ! local variables
2032 : integer(kind=ik) :: i,j,l
2033 : real(kind=C_DATATYPE_KIND) :: sum_squares,diagonal_square,epsd,diagonal_root
2034 : real(kind=C_DATATYPE_KIND) :: dreverse_matrix_work(1)
2035 :
2036 : ! external functions
2037 : #ifdef DOUBLE_PRECISION_REAL
2038 : real(kind=C_DATATYPE_KIND), external :: ddot,dlapy2,dnrm2
2039 : external :: dscal
2040 : #else
2041 : real(kind=rk4), external :: sdot,slapy2,snrm2
2042 : external :: sscal
2043 : #endif
2044 :
2045 : call obj%timer%start("qr_pdlarfgk_1dcomm_check_improved_&
2046 : &PRECISION&
2047 0 : &")
2048 :
2049 0 : if (lwork .eq. -1) then
2050 : call reverse_matrix_local_&
2051 : &PRECISION &
2052 0 : (1,k,k,work,k,dreverse_matrix_work,-1)
2053 : #ifdef DOUBLE_PRECISION_REAL
2054 0 : work(1,1) = real(k*k,kind=C_DATATYPE_KIND) + dreverse_matrix_work(1)
2055 : #else
2056 0 : work(1,1) = real(k*k,kind=rk4) + dreverse_matrix_work(1)
2057 : #endif
2058 :
2059 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
2060 : &PRECISION&
2061 0 : &")
2062 0 : return
2063 : end if
2064 :
2065 0 : eps = PQRPARAM(3)
2066 :
2067 0 : if (eps .eq. 0) then
2068 0 : possiblerank = k
2069 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
2070 : &PRECISION&
2071 0 : &")
2072 0 : return
2073 : end if
2074 : #ifdef DOUBLE_PRECISION_REAL
2075 0 : epsd = real(eps,kind=C_DATATYPE_KIND)
2076 : #else
2077 0 : epsd = real(eps,kind=rk4)
2078 : #endif
2079 :
2080 : ! build complete inner product from seedC and seedD
2081 : ! copy seedD to work
2082 0 : work(:,1:k) = seedD(:,1:k)
2083 :
2084 : ! add inner products of seedC to work
2085 : #ifdef DOUBLE_PRECISION_REAL
2086 : call dsyrk("Upper", "Trans", k, k, &
2087 : 1.0_rk8, seedC(1,1), k, &
2088 0 : 1.0_rk8, work, k)
2089 : #else
2090 : call ssyrk("Upper", "Trans", k, k, &
2091 : 1.0_rk4, seedC(1,1), k, &
2092 0 : 1.0_rk4, work, k)
2093 :
2094 : #endif
2095 :
2096 : ! TODO: optimize this part!
2097 : call reverse_matrix_local_&
2098 : &PRECISION &
2099 0 : (0,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
2100 : call reverse_matrix_local_&
2101 : &PRECISION &
2102 0 : (1,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
2103 :
2104 : ! transpose matrix
2105 0 : do i=1,k
2106 0 : do j=i+1,k
2107 0 : work(i,j) = work(j,i)
2108 : end do
2109 : end do
2110 :
2111 :
2112 : ! do cholesky decomposition
2113 0 : i = 0
2114 0 : do while ((i .lt. k))
2115 0 : i = i + 1
2116 :
2117 0 : diagonal_square = abs(work(i,i))
2118 0 : diagonal_root = sqrt(diagonal_square)
2119 :
2120 : ! zero Householder Vector (zero norm) case
2121 : #ifdef DOUBLE_PRECISION_REAL
2122 0 : if ((abs(diagonal_square) .eq. 0.0_rk8) .or. (abs(diagonal_root) .eq. 0.0_rk8)) then
2123 : #else
2124 0 : if ((abs(diagonal_square) .eq. 0.0_rk4) .or. (abs(diagonal_root) .eq. 0.0_rk4)) then
2125 : #endif
2126 0 : possiblerank = max(i-1,1)
2127 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
2128 : &PRECISION&
2129 0 : &")
2130 0 : return
2131 : end if
2132 :
2133 : ! check if relative error is bounded for each Householder Vector
2134 : ! Householder i is stable iff Househoulder i-1 is "stable" and the accuracy criterion
2135 : ! holds.
2136 : ! first Householder Vector is considered as "stable".
2137 :
2138 0 : do j=i+1,k
2139 0 : work(i,j) = work(i,j) / diagonal_root
2140 0 : do l=i+1,j
2141 0 : work(l,j) = work(l,j) - work(i,j) * work(i,l)
2142 : end do
2143 : end do
2144 : !print *,'cholesky step done'
2145 :
2146 : ! build sum of squares
2147 : #ifdef DOUBLE_PRECISION_REAL
2148 0 : if (i .eq. 1) then
2149 0 : sum_squares = 0.0_rk8
2150 : else
2151 0 : sum_squares = ddot(i-1,work(1,i),1,work(1,i),1)
2152 : end if
2153 : #else
2154 0 : if (i .eq. 1) then
2155 0 : sum_squares = 0.0_rk4
2156 : else
2157 0 : sum_squares = sdot(i-1,work(1,i),1,work(1,i),1)
2158 : end if
2159 : #endif
2160 0 : if (sum_squares .ge. (epsd * diagonal_square)) then
2161 0 : possiblerank = max(i-1,1)
2162 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
2163 : &PRECISION&
2164 0 : &")
2165 0 : return
2166 : end if
2167 : end do
2168 :
2169 0 : possiblerank = i
2170 : !print *,'possible rank', possiblerank
2171 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_&
2172 : &PRECISION&
2173 0 : &")
2174 :
2175 : end subroutine
2176 :
2177 : ! TODO: zero Householder Vector (zero norm) case
2178 : ! - check alpha values as well (from seedC)
2179 : subroutine qr_pdlarfgk_1dcomm_check_&
2180 0 : &PRECISION &
2181 0 : (obj,seedC,seedD,k,PQRPARAM,work,lwork,possiblerank)
2182 : use precision
2183 : use qr_utils_mod
2184 : use elpa_abstract_impl
2185 : implicit none
2186 : class(elpa_abstract_impl_t), intent(inout) :: obj
2187 : ! parameter setup
2188 :
2189 : ! input variables (local)
2190 :
2191 : ! input variables (global)
2192 : integer(kind=ik) :: k,lwork
2193 : #ifdef USE_ASSUMED_SIZE_QR
2194 : integer(kind=ik) :: PQRPARAM(*)
2195 : #else
2196 : integer(kind=ik) :: PQRPARAM(:)
2197 : #endif
2198 : real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*),work(k,*)
2199 :
2200 : ! output variables (global)
2201 : integer(kind=ik) :: possiblerank
2202 :
2203 : ! derived input variables from QR_PQRPARAM
2204 : integer(kind=ik) :: eps
2205 :
2206 : ! local scalars
2207 : integer(kind=ik) :: icol,isqr,iprod
2208 : real(kind=C_DATATYPE_KIND) :: epsd,sum_sqr,sum_products,diff,temp,ortho,ortho_sum
2209 : real(kind=C_DATATYPE_KIND) :: dreverse_matrix_work(1)
2210 : call obj%timer%start("qr_pdlarfgk_1dcomm_check_&
2211 : &PRECISION&
2212 0 : &")
2213 0 : if (lwork .eq. -1) then
2214 : call reverse_matrix_local_&
2215 : &PRECISION &
2216 0 : (1,k,k,work,k,dreverse_matrix_work,-1)
2217 : #ifdef DOUBLE_PRECISION_REAL
2218 0 : work(1,1) = real(k*k,kind=C_DATATYPE_KIND) + dreverse_matrix_work(1)
2219 : #else
2220 0 : work(1,1) = real(k*k,kind=rk4) + dreverse_matrix_work(1)
2221 : #endif
2222 :
2223 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
2224 : &PRECISION&
2225 0 : &")
2226 :
2227 0 : return
2228 : end if
2229 :
2230 0 : eps = PQRPARAM(3)
2231 :
2232 0 : if (eps .eq. 0) then
2233 0 : possiblerank = k
2234 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
2235 : &PRECISION&
2236 0 : &")
2237 0 : return
2238 : end if
2239 : #ifdef DOUBLE_PRECISION_REAL
2240 0 : epsd = real(eps,kind=C_DATATYPE_KIND)
2241 : #else
2242 0 : epsd = real(eps,kind=rk4)
2243 : #endif
2244 :
2245 : ! copy seedD to work
2246 0 : work(:,1:k) = seedD(:,1:k)
2247 :
2248 : ! add inner products of seedC to work
2249 : #ifdef DOUBLE_PRECISION_REAL
2250 : call dsyrk("Upper", "Trans", k, k, &
2251 : 1.0_rk8, seedC(1,1), k, &
2252 0 : 1.0_rk8, work, k)
2253 : #else
2254 : call ssyrk("Upper", "Trans", k, k, &
2255 : 1.0_rk4, seedC(1,1), k, &
2256 0 : 1.0_rk4, work, k)
2257 : #endif
2258 :
2259 : ! TODO: optimize this part!
2260 : call reverse_matrix_local_&
2261 : &PRECISION &
2262 0 : (0,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
2263 : call reverse_matrix_local_&
2264 : &PRECISION &
2265 0 : (1,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
2266 :
2267 : ! transpose matrix
2268 0 : do icol=1,k
2269 0 : do isqr=icol+1,k
2270 0 : work(icol,isqr) = work(isqr,icol)
2271 : end do
2272 : end do
2273 :
2274 : ! work contains now the full inner product of the global (sub-)matrix
2275 0 : do icol=1,k
2276 : ! zero Householder Vector (zero norm) case
2277 : #ifdef DOUBLE_PRECISION_REAL
2278 0 : if (abs(work(icol,icol)) .eq. 0.0_rk8) then
2279 : #else
2280 0 : if (abs(work(icol,icol)) .eq. 0.0_rk4) then
2281 : #endif
2282 : !print *,'too small ', icol, work(icol,icol)
2283 0 : possiblerank = max(icol,1)
2284 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
2285 : &PRECISION&
2286 0 : &")
2287 0 : return
2288 : end if
2289 :
2290 : #ifdef DOUBLE_PRECISION_REAL
2291 0 : sum_sqr = 0.0_rk8
2292 0 : do isqr=1,icol-1
2293 0 : sum_products = 0.0_rk8
2294 : #else
2295 0 : sum_sqr = 0.0_rk4
2296 0 : do isqr=1,icol-1
2297 0 : sum_products = 0.0_rk4
2298 : #endif
2299 0 : do iprod=1,isqr-1
2300 0 : sum_products = sum_products + work(iprod,isqr)*work(iprod,icol)
2301 : end do
2302 :
2303 : !print *,'divisor',icol,isqr,work(isqr,isqr)
2304 0 : temp = (work(isqr,icol) - sum_products)/work(isqr,isqr)
2305 0 : work(isqr,icol) = temp
2306 0 : sum_sqr = sum_sqr + temp*temp
2307 : end do
2308 :
2309 : ! calculate diagonal value
2310 0 : diff = work(icol,icol) - sum_sqr
2311 : #ifdef DOUBLE_PRECISION_REAL
2312 0 : if (diff .lt. 0.0_rk8) then
2313 : #else
2314 0 : if (diff .lt. 0.0_rk4) then
2315 : #endif
2316 : ! we definitely have a problem now
2317 0 : possiblerank = icol-1 ! only decompose to previous column (including)
2318 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
2319 : &PRECISION&
2320 0 : &")
2321 0 : return
2322 : end if
2323 0 : work(icol,icol) = sqrt(diff)
2324 : ! calculate orthogonality
2325 : #ifdef DOUBLE_PRECISION_REAL
2326 0 : ortho = 0.0_rk8
2327 0 : do isqr=1,icol-1
2328 0 : ortho_sum = 0.0_rk8
2329 : #else
2330 0 : ortho = 0.0_rk4
2331 0 : do isqr=1,icol-1
2332 0 : ortho_sum = 0.0_rk4
2333 : #endif
2334 0 : do iprod=isqr,icol-1
2335 0 : temp = work(isqr,iprod)*work(isqr,iprod)
2336 : !print *,'ortho ', work(iprod,iprod)
2337 0 : temp = temp / (work(iprod,iprod)*work(iprod,iprod))
2338 0 : ortho_sum = ortho_sum + temp
2339 : end do
2340 0 : ortho = ortho + ortho_sum * (work(isqr,icol)*work(isqr,icol))
2341 : end do
2342 :
2343 : ! ---------------- with division by zero ----------------------- !
2344 :
2345 : !ortho = ortho / diff;
2346 :
2347 : ! if current estimate is not accurate enough, the following check holds
2348 : !if (ortho .gt. epsd) then
2349 : ! possiblerank = icol-1 ! only decompose to previous column (including)
2350 : ! return
2351 : !end if
2352 :
2353 : ! ---------------- without division by zero ----------------------- !
2354 :
2355 : ! if current estimate is not accurate enough, the following check holds
2356 0 : if (ortho .gt. epsd * diff) then
2357 0 : possiblerank = icol-1 ! only decompose to previous column (including)
2358 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
2359 : &PRECISION&
2360 0 : &")
2361 0 : return
2362 : end if
2363 : end do
2364 :
2365 : ! if we get to this point, the accuracy condition holds for the whole block
2366 0 : possiblerank = k
2367 : call obj%timer%stop("qr_pdlarfgk_1dcomm_check_&
2368 : &PRECISION&
2369 0 : &")
2370 : end subroutine
2371 :
2372 : !sidx: seed idx
2373 : !k: max rank used during seed phase
2374 : !rank: actual rank (k >= rank)
2375 : subroutine qr_pdlarfgk_1dcomm_vector_&
2376 0 : &PRECISION &
2377 0 : (obj,x,incx,baseidx,tau,seedC,seedD,k,sidx,n,nb,rev,mpicomm)
2378 : use precision
2379 : use elpa1_impl
2380 : use qr_utils_mod
2381 : use elpa_abstract_impl
2382 : implicit none
2383 : class(elpa_abstract_impl_t), intent(inout) :: obj
2384 : ! input variables (local)
2385 : integer(kind=ik) :: incx
2386 : real(kind=C_DATATYPE_KIND) :: x(*),tau
2387 :
2388 : ! input variables (global)
2389 : integer(kind=ik) :: n,nb,baseidx,rev,mpicomm,k,sidx
2390 : real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*)
2391 :
2392 : ! output variables (global)
2393 :
2394 : ! external functions
2395 : #ifdef DOUBLE_PRECISION_REAL
2396 : real(kind=C_DATATYPE_KIND), external :: dlapy2,dnrm2
2397 : external :: dscal
2398 : #else
2399 : real(kind=rk4), external :: slapy2,snrm2
2400 : external :: sscal
2401 : #endif
2402 :
2403 : ! local scalars
2404 : integer(kind=ik) :: mpirank,mpirank_top,mpiprocs,mpierr
2405 : real(kind=C_DATATYPE_KIND) :: alpha,dot,beta,xnorm
2406 : integer(kind=ik) :: local_size,baseoffset,local_offset,top,topidx
2407 : integer(kind=ik) :: lidx
2408 : call obj%timer%start("qr_pdlarfgk_1dcomm_vector_&
2409 : &PRECISION&
2410 0 : &")
2411 0 : call MPI_Comm_rank(mpicomm, mpirank, mpierr)
2412 0 : call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
2413 0 : lidx = baseidx-sidx+1
2414 : call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, &
2415 0 : local_size,baseoffset,local_offset)
2416 :
2417 0 : local_offset = local_offset * incx
2418 :
2419 : ! Processor id for global index of top element
2420 0 : mpirank_top = MOD((lidx-1)/nb,mpiprocs)
2421 0 : if (mpirank .eq. mpirank_top) then
2422 0 : topidx = local_index((lidx),mpirank_top,mpiprocs,nb,0)
2423 0 : top = 1+(topidx-1)*incx
2424 : end if
2425 :
2426 0 : alpha = seedC(k-sidx+1,k-sidx+1)
2427 0 : dot = seedD(k-sidx+1,k-sidx+1)
2428 : ! assemble actual norm from both seed parts
2429 : #ifdef DOUBLE_PRECISION_REAL
2430 0 : xnorm = dlapy2(sqrt(dot), dnrm2(k-sidx,seedC(1,k-sidx+1),1))
2431 :
2432 0 : if (xnorm .eq. 0.0_rk8) then
2433 0 : tau = 0.0_rk8
2434 : else
2435 :
2436 : ! General case
2437 :
2438 0 : beta = sign(dlapy2(alpha, xnorm), alpha)
2439 : ! store a preliminary version of beta in tau
2440 0 : tau = beta
2441 :
2442 : ! update global part
2443 : call dscal(local_size, 1.0_rk8/(beta+alpha), &
2444 0 : x(local_offset), incx)
2445 : #else
2446 0 : xnorm = slapy2(sqrt(dot), snrm2(k-sidx,seedC(1,k-sidx+1),1))
2447 :
2448 0 : if (xnorm .eq. 0.0_rk4) then
2449 0 : tau = 0.0_rk4
2450 : else
2451 :
2452 : ! General case
2453 :
2454 0 : beta = sign(slapy2(alpha, xnorm), alpha)
2455 : ! store a preliminary version of beta in tau
2456 0 : tau = beta
2457 :
2458 : ! update global part
2459 : call sscal(local_size, 1.0_rk4/(beta+alpha), &
2460 0 : x(local_offset), incx)
2461 :
2462 : #endif
2463 : ! do not update local part here due to
2464 : ! dependency of c Vector during update process
2465 :
2466 : ! TODO: reimplement norm rescale method of
2467 : ! original PDLARFG using mpi?
2468 :
2469 0 : if (mpirank .eq. mpirank_top) then
2470 0 : x(top) = -beta
2471 : end if
2472 : end if
2473 : call obj%timer%stop("qr_pdlarfgk_1dcomm_vector_&
2474 : &PRECISION&
2475 0 : &")
2476 :
2477 0 : end subroutine
2478 :
2479 : !k: original max rank used during seed function
2480 : !rank: possible rank as from check function
2481 : ! TODO: if rank is less than k, reduce buffersize in such a way
2482 : ! that only the required entries for the next pdlarfg steps are
2483 : ! computed
2484 : subroutine qr_pdlarfgk_1dcomm_update_&
2485 0 : &PRECISION &
2486 0 : (obj,a,lda,baseidx,work,lwork,seedC,seedD,k,rank,sidx,tau,n,nb,rev,mpicomm)
2487 : use precision
2488 : use elpa1_impl
2489 : use qr_utils_mod
2490 : use elpa_abstract_impl
2491 : implicit none
2492 : class(elpa_abstract_impl_t), intent(inout) :: obj
2493 : ! parameter setup
2494 : INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4
2495 :
2496 : ! input variables (local)
2497 : integer(kind=ik) :: lda,lwork
2498 : real(kind=C_DATATYPE_KIND) :: a(lda,*),work(*)
2499 :
2500 : ! input variables (global)
2501 : integer(kind=ik) :: k,rank,sidx,n,baseidx,nb,rev,mpicomm
2502 : real(kind=C_DATATYPE_KIND) :: beta
2503 :
2504 : ! output variables (global)
2505 : real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*),tau(*)
2506 :
2507 : ! derived input variables from QR_PQRPARAM
2508 :
2509 : ! local scalars
2510 : real(kind=C_DATATYPE_KIND) :: alpha
2511 : integer(kind=ik) :: coffset,zoffset,yoffset,voffset,buffersize
2512 : integer(kind=ik) :: mpirank,mpierr,mpiprocs,mpirank_top
2513 : integer(kind=ik) :: localsize,baseoffset,localoffset,topidx
2514 : integer(kind=ik) :: lidx
2515 : call obj%timer%start("qr_pdlarfgk_1dcomm_update_&
2516 : &PRECISION&
2517 0 : &")
2518 0 : if (lwork .eq. -1) then
2519 : ! buffer for c,z,y,v
2520 0 : work(1) = 4*k
2521 : call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
2522 : &PRECISION&
2523 0 : &")
2524 :
2525 0 : return
2526 : end if
2527 :
2528 : ! nothing to update anymore
2529 0 : if (sidx .gt. rank) then
2530 : call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
2531 : &PRECISION&
2532 0 : &")
2533 0 : return
2534 : endif
2535 0 : call MPI_Comm_rank(mpicomm, mpirank, mpierr)
2536 0 : call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
2537 0 : lidx = baseidx-sidx
2538 0 : if (lidx .lt. 1) then
2539 : call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
2540 : &PRECISION&
2541 0 : &")
2542 0 : return
2543 : endif
2544 :
2545 : call local_size_offset_1d(n,nb,baseidx,lidx,rev,mpirank,mpiprocs, &
2546 0 : localsize,baseoffset,localoffset)
2547 :
2548 0 : coffset = 1
2549 0 : zoffset = coffset + k
2550 0 : yoffset = zoffset + k
2551 0 : voffset = yoffset + k
2552 0 : buffersize = k - sidx
2553 :
2554 : ! finalize tau values
2555 0 : alpha = seedC(k-sidx+1,k-sidx+1)
2556 0 : beta = tau(k-sidx+1)
2557 :
2558 : ! zero Householder Vector (zero norm) case
2559 : !print *,'k update: alpha,beta',alpha,beta
2560 : #ifdef DOUBLE_PRECISION_REAL
2561 0 : if ((beta .eq. 0.0_rk8) .or. (alpha .eq. 0.0_rk8)) then
2562 0 : tau(k-sidx+1) = 0.0_rk8
2563 0 : seedC(k,k-sidx+1) = 0.0_rk8
2564 : #else
2565 0 : if ((beta .eq. 0.0_rk4) .or. (alpha .eq. 0.0_rk4)) then
2566 0 : tau(k-sidx+1) = 0.0_rk4
2567 0 : seedC(k,k-sidx+1) = 0.0_rk4
2568 : #endif
2569 :
2570 : call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
2571 : &PRECISION&
2572 0 : &")
2573 0 : return
2574 : end if
2575 :
2576 0 : tau(k-sidx+1) = (beta+alpha) / beta
2577 :
2578 : ! ---------------------------------------
2579 : ! calculate c Vector (extra Vector or encode in seedC/seedD?
2580 0 : work(coffset:coffset+buffersize-1) = seedD(1:buffersize,k-sidx+1)
2581 : #ifdef DOUBLE_PRECISION_REAL
2582 : call dgemv("Trans", buffersize+1, buffersize, &
2583 : 1.0_rk8,seedC(1,1),k,seedC(1,k-sidx+1),1, &
2584 0 : 1.0_rk8,work(coffset),1)
2585 :
2586 : ! calculate z using tau,seedD,seedC and c Vector
2587 0 : work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize)
2588 0 : call daxpy(buffersize, 1.0_rk8/beta, work(coffset), 1, work(zoffset), 1)
2589 :
2590 : ! update A1(local copy) and generate part of householder vectors for use
2591 0 : call daxpy(buffersize, -1.0_rk8, work(zoffset),1,seedC(k-sidx+1,1),k)
2592 0 : call dscal(buffersize, 1.0_rk8/(alpha+beta), seedC(1,k-sidx+1),1)
2593 0 : call dger(buffersize, buffersize, -1.0_rk8, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k)
2594 :
2595 : ! update A global (householder Vector already generated by pdlarfgk)
2596 0 : mpirank_top = MOD(lidx/nb,mpiprocs)
2597 0 : if (mpirank .eq. mpirank_top) then
2598 : ! handle first row separately
2599 0 : topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0)
2600 0 : call daxpy(buffersize,-1.0_rk8,work(zoffset),1,a(topidx,1),lda)
2601 : end if
2602 :
2603 : call dger(localsize, buffersize,-1.0_rk8, &
2604 : a(localoffset,k-sidx+1),1,work(zoffset),1, &
2605 0 : a(localoffset,1),lda)
2606 :
2607 : ! update D (symmetric) => two buffer vectors of size rank
2608 : ! generate y Vector
2609 0 : work(yoffset:yoffset+buffersize-1) = 0._rk8
2610 0 : call daxpy(buffersize,1.0_rk8/(alpha+beta),work(zoffset),1,work(yoffset),1)
2611 :
2612 : ! generate v Vector
2613 0 : work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1)
2614 0 : call daxpy(buffersize, -0.5_rk8*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1)
2615 :
2616 : ! symmetric update of D using y and v
2617 : call dsyr2("Upper", buffersize,-1.0_rk8, &
2618 : work(yoffset),1,work(voffset),1, &
2619 0 : seedD(1,1), k)
2620 :
2621 : ! prepare T matrix inner products
2622 : ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y'
2623 : ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements
2624 0 : call dger(k-sidx,sidx,-1.0_rk8,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k)
2625 0 : seedC(k,k-sidx+1) = 1.0_rk8/(alpha+beta)
2626 : #else /* DOUBLE_PRECISION_REAL */
2627 : call sgemv("Trans", buffersize+1, buffersize, &
2628 : 1.0_rk4,seedC(1,1),k,seedC(1,k-sidx+1),1, &
2629 0 : 1.0_rk4,work(coffset),1)
2630 :
2631 : ! calculate z using tau,seedD,seedC and c Vector
2632 0 : work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize)
2633 0 : call saxpy(buffersize, 1.0_rk4/beta, work(coffset), 1, work(zoffset), 1)
2634 :
2635 : ! update A1(local copy) and generate part of householder vectors for use
2636 0 : call saxpy(buffersize, -1.0_rk4, work(zoffset),1,seedC(k-sidx+1,1),k)
2637 0 : call sscal(buffersize, 1.0_rk4/(alpha+beta), seedC(1,k-sidx+1),1)
2638 0 : call sger(buffersize, buffersize, -1.0_rk4, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k)
2639 :
2640 : ! update A global (householder Vector already generated by pdlarfgk)
2641 0 : mpirank_top = MOD(lidx/nb,mpiprocs)
2642 0 : if (mpirank .eq. mpirank_top) then
2643 : ! handle first row separately
2644 0 : topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0)
2645 0 : call saxpy(buffersize,-1.0_rk4,work(zoffset),1,a(topidx,1),lda)
2646 : end if
2647 :
2648 : call sger(localsize, buffersize,-1.0_rk4, &
2649 : a(localoffset,k-sidx+1),1,work(zoffset),1, &
2650 0 : a(localoffset,1),lda)
2651 :
2652 : ! update D (symmetric) => two buffer vectors of size rank
2653 : ! generate y Vector
2654 0 : work(yoffset:yoffset+buffersize-1) = 0._rk4
2655 0 : call saxpy(buffersize,1.0_rk4/(alpha+beta),work(zoffset),1,work(yoffset),1)
2656 :
2657 : ! generate v Vector
2658 0 : work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1)
2659 0 : call saxpy(buffersize, -0.5_rk4*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1)
2660 :
2661 : ! symmetric update of D using y and v
2662 : call ssyr2("Upper", buffersize,-1.0_rk4, &
2663 : work(yoffset),1,work(voffset),1, &
2664 0 : seedD(1,1), k)
2665 :
2666 : ! prepare T matrix inner products
2667 : ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y'
2668 : ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements
2669 0 : call sger(k-sidx,sidx,-1.0_rk4,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k)
2670 0 : seedC(k,k-sidx+1) = 1.0_rk4/(alpha+beta)
2671 : #endif /* DOUBLE_PRECISION_REAL */
2672 :
2673 : call obj%timer%stop("qr_pdlarfgk_1dcomm_update_&
2674 : &PRECISION&
2675 0 : &")
2676 : end subroutine
2677 :
2678 : subroutine qr_pdlarfgk_1dcomm_generateT_&
2679 0 : &PRECISION &
2680 0 : (obj,seedC,seedD,k,actualk,tau,t,ldt)
2681 : use precision
2682 : use elpa_abstract_impl
2683 : implicit none
2684 : class(elpa_abstract_impl_t), intent(inout) :: obj
2685 : integer(kind=ik) :: k,actualk,ldt
2686 : real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*),tau(*),t(ldt,*)
2687 :
2688 : integer(kind=ik) :: irow,icol
2689 : real(kind=C_DATATYPE_KIND) :: column_coefficient
2690 : call obj%timer%start("qr_pdlarfgk_1dcomm_generateT_&
2691 : &PRECISION&
2692 0 : &")
2693 :
2694 : !print *,'reversed on the fly T generation NYI'
2695 :
2696 0 : do icol=1,actualk-1
2697 : ! calculate inner product of householder Vector parts in seedC
2698 : ! (actually calculating more than necessary, if actualk < k)
2699 : ! => a lot of junk from row 1 to row k-actualk
2700 : #ifdef DOUBLE_PRECISION_REAL
2701 0 : call dtrmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1)
2702 : #else
2703 0 : call strmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1)
2704 : #endif
2705 : ! add scaled D parts to current column of C (will become later T rows)
2706 0 : column_coefficient = seedC(k,k-icol+1)
2707 0 : do irow=k-actualk+1,k-1
2708 0 : seedC(irow,k-icol+1) = ( seedC(irow,k-icol+1) ) + ( seedD(irow,k-icol+1) * column_coefficient * seedC(k,irow) )
2709 : end do
2710 : end do
2711 :
2712 : call qr_dlarft_kernel_&
2713 : &PRECISION &
2714 0 : (actualk,tau(k-actualk+1),seedC(k-actualk+1,k-actualk+2),k,t(k-actualk+1,k-actualk+1),ldt)
2715 : call obj%timer%stop("qr_pdlarfgk_1dcomm_generateT_&
2716 : &PRECISION&
2717 0 : &")
2718 :
2719 0 : end subroutine
2720 :
2721 : !direction=0: pack into work buffer
2722 : !direction=1: unpack from work buffer
2723 : subroutine qr_pdgeqrf_pack_unpack_&
2724 0 : &PRECISION &
2725 0 : (obj,v,ldv,work,lwork,m,n,mb,baseidx,rowidx,rev,direction,mpicomm)
2726 : use precision
2727 : use elpa1_impl
2728 : use qr_utils_mod
2729 : use elpa_abstract_impl
2730 : implicit none
2731 : class(elpa_abstract_impl_t), intent(inout) :: obj
2732 : ! input variables (local)
2733 : integer(kind=ik) :: ldv,lwork
2734 : real(kind=C_DATATYPE_KIND) :: v(ldv,*), work(*)
2735 :
2736 : ! input variables (global)
2737 : integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,direction,mpicomm
2738 :
2739 : ! output variables (global)
2740 :
2741 : ! local scalars
2742 : integer(kind=ik) :: mpierr,mpirank,mpiprocs
2743 : integer(kind=ik) :: buffersize,icol
2744 : integer(kind=ik) :: local_size,baseoffset,offset
2745 :
2746 : ! external functions
2747 : call obj%timer%start("qr_pdgeqrf_pack_unpack_&
2748 : &PRECISION&
2749 0 : &")
2750 0 : call mpi_comm_rank(mpicomm,mpirank,mpierr)
2751 0 : call mpi_comm_size(mpicomm,mpiprocs,mpierr)
2752 : call local_size_offset_1d(m,mb,baseidx,rowidx,rev,mpirank,mpiprocs, &
2753 0 : local_size,baseoffset,offset)
2754 :
2755 : !print *,'pack/unpack',local_size,baseoffset,offset
2756 :
2757 : ! rough approximate for buffer size
2758 0 : if (lwork .eq. -1) then
2759 0 : buffersize = local_size * n ! Vector elements
2760 0 : work(1) = DBLE(buffersize)
2761 : call obj%timer%stop("qr_pdgeqrf_pack_unpack_&
2762 : &PRECISION&
2763 0 : &")
2764 :
2765 0 : return
2766 : end if
2767 :
2768 0 : if (direction .eq. 0) then
2769 : ! copy v part to buffer (including zeros)
2770 0 : do icol=1,n
2771 0 : work(1+local_size*(icol-1):local_size*icol) = v(baseoffset:baseoffset+local_size-1,icol)
2772 : end do
2773 : else
2774 : ! copy v part from buffer (including zeros)
2775 0 : do icol=1,n
2776 0 : v(baseoffset:baseoffset+local_size-1,icol) = work(1+local_size*(icol-1):local_size*icol)
2777 : end do
2778 : end if
2779 : call obj%timer%stop("qr_pdgeqrf_pack_unpack_&
2780 : &PRECISION&
2781 0 : &")
2782 :
2783 0 : return
2784 :
2785 : end subroutine
2786 :
2787 : !direction=0: pack into work buffer
2788 : !direction=1: unpack from work buffer
2789 : subroutine qr_pdgeqrf_pack_unpack_tmatrix_&
2790 0 : &PRECISION &
2791 0 : (obj,tau,t,ldt,work,lwork,n,direction)
2792 : use precision
2793 : use elpa1_impl
2794 : use qr_utils_mod
2795 : use elpa_abstract_impl
2796 : implicit none
2797 : class(elpa_abstract_impl_t), intent(inout) :: obj
2798 : ! input variables (local)
2799 : integer(kind=ik) :: ldt,lwork
2800 : real(kind=C_DATATYPE_KIND) :: work(*), t(ldt,*),tau(*)
2801 :
2802 : ! input variables (global)
2803 : integer(kind=ik) :: n,direction
2804 :
2805 : ! output variables (global)
2806 :
2807 : ! local scalars
2808 : integer(kind=ik) :: icol
2809 :
2810 : ! external functions
2811 : call obj%timer%start("qr_pdgeqrf_pack_unpack_tmatrix_&
2812 : &PRECISION&
2813 0 : &")
2814 :
2815 0 : if (lwork .eq. -1) then
2816 : #ifdef DOUBLE_PRECISION_REAL
2817 0 : work(1) = real(n*n,kind=C_DATATYPE_KIND)
2818 : #else
2819 0 : work(1) = real(n*n,kind=rk4)
2820 : #endif
2821 :
2822 : call obj%timer%stop("qr_pdgeqrf_pack_unpack_tmatrix_&
2823 : &PRECISION&
2824 0 : &")
2825 0 : return
2826 : end if
2827 :
2828 0 : if (direction .eq. 0) then
2829 : ! append t matrix to buffer (including zeros)
2830 0 : do icol=1,n
2831 0 : work(1+(icol-1)*n:icol*n) = t(1:n,icol)
2832 : end do
2833 : else
2834 : ! append t matrix from buffer (including zeros)
2835 0 : do icol=1,n
2836 0 : t(1:n,icol) = work(1+(icol-1)*n:icol*n)
2837 0 : tau(icol) = t(icol,icol)
2838 : end do
2839 : end if
2840 : call obj%timer%stop("qr_pdgeqrf_pack_unpack_tmatrix_&
2841 : &PRECISION&
2842 0 : &")
2843 : end subroutine
2844 :
2845 :
2846 : #ifndef ALREADY_DEFINED
2847 : ! TODO: encode following functionality
2848 : ! - Direction? BOTTOM UP or TOP DOWN ("Up", "Down")
2849 : ! => influences all related kernels (including DLARFT / DLARFB)
2850 : ! - rank-k parameter (k=1,2,...,b)
2851 : ! => influences possible update strategies
2852 : ! => parameterize the function itself? (FUNCPTR, FUNCARG)
2853 : ! - Norm mode? Allreduce, Allgather, AlltoAll, "AllHouse", (ALLNULL = benchmarking local kernels)
2854 : ! - subblocking
2855 : ! (maximum block size bounded by data distribution along rows)
2856 : ! - blocking method (householder vectors only or compact WY?)
2857 : ! - update strategy of trailing parts (incremental, complete)
2858 : ! - difference for subblocks and normal blocks? (UPDATE and UPDATESUB)
2859 : ! o "Incremental"
2860 : ! o "Full"
2861 : ! - final T generation (recursive: subblock wise, block wise, end) (TMERGE)
2862 : ! ' (implicitly given by / influences update strategies?)
2863 : ! => alternative: during update: iterate over sub t parts
2864 : ! => advantage: smaller (cache aware T parts)
2865 : ! => disadvantage: more memory write backs
2866 : ! (number of T parts * matrix elements)
2867 : ! - partial/sub T generation (TGEN)
2868 : ! o add vectors right after creation (Vector)
2869 : ! o add set of vectors (Set)
2870 : ! - bcast strategy of householder vectors to other process columns
2871 : ! (influences T matrix generation and trailing update
2872 : ! in other process columns)
2873 : ! o no broadcast (NONE = benchmarking?,
2874 : ! or not needed due to 1D process grid)
2875 : ! o after every housegen (VECTOR)
2876 : ! o after every subblk (SUBBLOCK)
2877 : ! o after full local column block decomposition (BLOCK)
2878 : ! LOOP Housegen -> BCAST -> GENT/EXTENDT -> LOOP HouseLeft
2879 :
2880 : !subroutine qr_pqrparam_init(PQRPARAM, DIRECTION, RANK, NORMMODE, &
2881 : ! SUBBLK, UPDATE, TGEN, BCAST)
2882 : ! gmode: control communication pattern of dlarfg
2883 : ! maxrank: control max number of householder vectors per communication
2884 : ! eps: error threshold (integer)
2885 : ! update*: control update pattern in pdgeqr2_1dcomm ('incremental','full','merge')
2886 : ! merging = full update with tmatrix merging
2887 : ! tmerge*: 0: do not merge, 1: incremental merge, >1: recursive merge
2888 : ! only matters if update* == full
2889 0 : subroutine qr_pqrparam_init(obj,pqrparam,size2d,update2d,tmerge2d,size1d,update1d,tmerge1d,maxrank,update,eps,hgmode)
2890 : use precision
2891 : use elpa_abstract_impl
2892 : implicit none
2893 : class(elpa_abstract_impl_t), intent(inout) :: obj
2894 : ! input
2895 : CHARACTER :: update2d,update1d,update,hgmode
2896 : INTEGER(kind=ik) :: size2d,size1d,maxrank,eps,tmerge2d,tmerge1d
2897 :
2898 : ! output
2899 : #ifdef USE_ASSUMED_SIZE_QR
2900 : INTEGER(kind=ik) :: PQRPARAM(*)
2901 : #else
2902 : INTEGER(kind=ik) :: PQRPARAM(1:11)
2903 : #endif
2904 :
2905 0 : call obj%timer%start("qr_pqrparam_init")
2906 :
2907 0 : PQRPARAM(1) = size2d
2908 0 : PQRPARAM(2) = ichar(update2d)
2909 0 : PQRPARAM(3) = tmerge2d
2910 : ! TODO: broadcast T yes/no
2911 :
2912 0 : PQRPARAM(4) = size1d
2913 0 : PQRPARAM(5) = ichar(update1d)
2914 0 : PQRPARAM(6) = tmerge1d
2915 :
2916 0 : PQRPARAM(7) = maxrank
2917 0 : PQRPARAM(8) = ichar(update)
2918 0 : PQRPARAM(9) = eps
2919 0 : PQRPARAM(10) = ichar(hgmode)
2920 0 : call obj%timer%stop("qr_pqrparam_init")
2921 :
2922 0 : end subroutine qr_pqrparam_init
2923 : #endif /* ALREADY_DEFINED */
2924 :
2925 : subroutine qr_pdlarfg_copy_1dcomm_&
2926 0 : &PRECISION &
2927 : (obj,x,incx,v,incv,n,baseidx,idx,nb,rev,mpicomm)
2928 : use precision
2929 : use elpa1_impl
2930 : use qr_utils_mod
2931 : use elpa_abstract_impl
2932 : implicit none
2933 : class(elpa_abstract_impl_t), intent(inout) :: obj
2934 : ! input variables (local)
2935 : integer(kind=ik) :: incx,incv
2936 : real(kind=C_DATATYPE_KIND) :: x(*), v(*)
2937 :
2938 : ! input variables (global)
2939 : integer(kind=ik) :: baseidx,idx,rev,nb,n
2940 : integer(kind=ik) :: mpicomm
2941 :
2942 : ! output variables (global)
2943 :
2944 : ! local scalars
2945 : integer(kind=ik) :: mpierr,mpiprocs
2946 : integer(kind=ik) :: mpirank,mpirank_top
2947 : integer(kind=ik) :: irow,x_offset
2948 : integer(kind=ik) :: v_offset,local_size
2949 :
2950 : call obj%timer%start("qr_pdlarfg_copy_1dcomm_&
2951 : &PRECISION&
2952 0 : &")
2953 0 : call MPI_Comm_rank(mpicomm, mpirank, mpierr)
2954 0 : call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
2955 : call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, &
2956 0 : local_size,v_offset,x_offset)
2957 0 : v_offset = v_offset * incv
2958 :
2959 : !print *,'copy:',mpirank,baseidx,v_offset,x_offset,local_size
2960 :
2961 : ! copy elements
2962 0 : do irow=1,local_size
2963 0 : v((irow-1)*incv+v_offset) = x((irow-1)*incx+x_offset)
2964 : end do
2965 :
2966 : ! replace top element to build an unitary Vector
2967 0 : mpirank_top = MOD((idx-1)/nb,mpiprocs)
2968 0 : if (mpirank .eq. mpirank_top) then
2969 : #ifdef DOUBLE_PRECISION_REAL
2970 0 : v(local_size*incv) = 1.0_rk8
2971 : #else
2972 0 : v(local_size*incv) = 1.0_rk4
2973 : #endif
2974 : end if
2975 : call obj%timer%stop("qr_pdlarfg_copy_1dcomm_&
2976 : &PRECISION&
2977 0 : &")
2978 :
2979 0 : end subroutine
2980 :
2981 : ! vim: syntax=fortran
|