Line data Source code
1 : #if 0
2 : ! This file is part of ELPA.
3 : !
4 : ! The ELPA library was originally created by the ELPA consortium,
5 : ! consisting of the following organizations:
6 : !
7 : ! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8 : ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9 : ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10 : ! Informatik,
11 : ! - Technische Universität München, Lehrstuhl für Informatik mit
12 : ! Schwerpunkt Wissenschaftliches Rechnen ,
13 : ! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14 : ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15 : ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16 : ! and
17 : ! - IBM Deutschland GmbH
18 : !
19 : !
20 : ! More information can be found here:
21 : ! http://elpa.mpcdf.mpg.de/
22 : !
23 : ! ELPA is free software: you can redistribute it and/or modify
24 : ! it under the terms of the version 3 of the license of the
25 : ! GNU Lesser General Public License as published by the Free
26 : ! Software Foundation.
27 : !
28 : ! ELPA is distributed in the hope that it will be useful,
29 : ! but WITHOUT ANY WARRANTY; without even the implied warranty of
30 : ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 : ! GNU Lesser General Public License for more details.
32 : !
33 : ! You should have received a copy of the GNU Lesser General Public License
34 : ! along with ELPA. If not, see <http://www.gnu.org/licenses/>
35 : !
36 : ! ELPA reflects a substantial effort on the part of the original
37 : ! ELPA consortium, and we ask you to respect the spirit of the
38 : ! license that we chose: i.e., please contribute any changes you
39 : ! may have back to the original ELPA library distribution, and keep
40 : ! any derivatives of ELPA under the same license that we chose for
41 : ! the original distribution, the GNU Lesser General Public License.
42 : !
43 :
44 : #endif
45 :
46 : ! calculates A = A - Y*T'*Z (rev=0)
47 : ! calculates A = A - Y*T*Z (rev=1)
48 : ! T upper triangle matrix
49 : ! assuming zero entries in matrix in upper kxk block
50 :
51 : subroutine qr_pdlarfb_kernel_local_&
52 0 : &PRECISION &
53 0 : (m,n,k,a,lda,v,ldv,t,ldt,z,ldz)
54 : use precision
55 : implicit none
56 :
57 : ! input variables (local)
58 : integer(kind=ik) :: lda,ldv,ldt,ldz
59 : real(kind=REAL_DATATYPE) :: a(lda,*),v(ldv,*),t(ldt,*),z(ldz,*)
60 :
61 : ! input variables (global)
62 : integer(kind=ik) :: m,n,k
63 :
64 : ! local variables
65 : real(kind=REAL_DATATYPE) :: t11
66 : real(kind=REAL_DATATYPE) :: t12,t22,sum1,sum2
67 : real(kind=REAL_DATATYPE) :: t13,t23,t33,sum3
68 : real(kind=REAL_DATATYPE) :: sum4,t44
69 : real(kind=REAL_DATATYPE) :: y1,y2,y3,y4
70 : real(kind=REAL_DATATYPE) :: a1
71 : integer(kind=ik) :: icol,irow,v1col,v2col,v3col
72 :
73 : ! reference implementation
74 0 : if (k .eq. 1) then
75 0 : t11 = t(1,1)
76 0 : do icol=1,n
77 0 : sum1 = z(1,icol)
78 0 : a(1:m,icol) = a(1:m,icol) - t11*sum1*v(1:m,1)
79 : enddo
80 0 : return
81 0 : else if (k .eq. 2) then
82 0 : v1col = 2
83 0 : v2col = 1
84 0 : t22 = t(1,1)
85 0 : t12 = t(1,2)
86 0 : t11 = t(2,2)
87 :
88 0 : do icol=1,n
89 0 : sum1 = t11 * z(v1col,icol)
90 0 : sum2 = t12 * z(v1col,icol) + t22 * z(v2col,icol)
91 :
92 0 : do irow=1,m
93 0 : a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2
94 : end do
95 : end do
96 0 : else if (k .eq. 3) then
97 0 : v1col = 3
98 0 : v2col = 2
99 0 : v3col = 1
100 :
101 0 : t33 = t(1,1)
102 :
103 0 : t23 = t(1,2)
104 0 : t22 = t(2,2)
105 :
106 0 : t13 = t(1,3)
107 0 : t12 = t(2,3)
108 0 : t11 = t(3,3)
109 :
110 0 : do icol=1,n
111 : ! misusing variables for fetch of z parts
112 0 : y1 = z(v1col,icol)
113 0 : y2 = z(v2col,icol)
114 0 : y3 = z(v3col,icol)
115 :
116 0 : sum1 = t11 * y1!+ 0 * y2!+ 0 * y3
117 0 : sum2 = t12 * y1 + t22 * y2!+ 0 * y3
118 0 : sum3 = t13 * y1 + t23 * y2 + t33 * y3
119 :
120 0 : do irow=1,m
121 0 : a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 - v(irow,v3col) * sum3
122 : end do
123 : end do
124 0 : else if (k .eq. 4) then
125 0 : do icol=1,n
126 : ! misusing variables for fetch of z parts
127 0 : y1 = z(1,icol)
128 0 : y2 = z(2,icol)
129 0 : y3 = z(3,icol)
130 0 : y4 = z(4,icol)
131 :
132 : ! dtrmv like - starting from main diagonal and working
133 : ! upwards
134 0 : t11 = t(1,1)
135 0 : t22 = t(2,2)
136 0 : t33 = t(3,3)
137 0 : t44 = t(4,4)
138 :
139 0 : sum1 = t11 * y1
140 0 : sum2 = t22 * y2
141 0 : sum3 = t33 * y3
142 0 : sum4 = t44 * y4
143 :
144 0 : t11 = t(1,2)
145 0 : t22 = t(2,3)
146 0 : t33 = t(3,4)
147 :
148 0 : sum1 = sum1 + t11 * y2
149 0 : sum2 = sum2 + t22 * y3
150 0 : sum3 = sum3 + t33 * y4
151 :
152 0 : t11 = t(1,3)
153 0 : t22 = t(2,4)
154 :
155 0 : sum1 = sum1 + t11 * y3
156 0 : sum2 = sum2 + t22 * y4
157 :
158 0 : t11 = t(1,4)
159 0 : sum1 = sum1 + t11 * y4
160 :
161 : ! one column of V is calculated
162 : ! time to calculate A - Y * V
163 0 : do irow=1,m ! TODO: loop unrolling
164 0 : y1 = v(irow,1)
165 0 : y2 = v(irow,2)
166 0 : y3 = v(irow,3)
167 0 : y4 = v(irow,4)
168 :
169 0 : a1 = a(irow,icol)
170 :
171 0 : a1 = a1 - y1*sum1
172 0 : a1 = a1 - y2*sum2
173 0 : a1 = a1 - y3*sum3
174 0 : a1 = a1 - y4*sum4
175 :
176 0 : a(irow,icol) = a1
177 : end do
178 : end do
179 : else
180 : ! reference implementation
181 : #ifdef DOUBLE_PRECISION_REAL
182 : ! V' = T * Z'
183 0 : call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0_rk8,t,ldt,z,ldz)
184 : ! A = A - Y * V'
185 0 : call dgemm("Notrans","Notrans",m,n,k,-1.0_rk8,v,ldv,z,ldz,1.0_rk8,a,lda)
186 : #else
187 : ! V' = T * Z'
188 0 : call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0_rk4,t,ldt,z,ldz)
189 : ! A = A - Y * V'
190 0 : call dgemm("Notrans","Notrans",m,n,k,-1.0_rk4,v,ldv,z,ldz,1.0_rk4,a,lda)
191 : #endif
192 : end if
193 :
194 : end subroutine
195 :
196 : subroutine qr_pdlarft_merge_kernel_local_&
197 0 : &PRECISION &
198 0 : (oldk,k,t,ldt,yty,ldy)
199 : use precision
200 : implicit none
201 :
202 : ! input variables (local)
203 : integer(kind=ik) :: ldt,ldy
204 : real(kind=REAL_DATATYPE) :: t(ldt,*),yty(ldy,*)
205 :
206 : ! input variables (global)
207 : integer(kind=ik) :: k,oldk
208 :
209 : ! output variables (global)
210 :
211 : ! local scalars
212 : integer(kind=ik) :: icol,leftk,rightk
213 :
214 : ! local scalars for optimized versions
215 : integer(kind=ik) :: irow
216 : real(kind=REAL_DATATYPE) :: t11
217 : real(kind=REAL_DATATYPE) :: yty1,yty2,yty3,yty4,yty5,yty6,yty7,yty8
218 : real(kind=REAL_DATATYPE) :: reg01,reg02,reg03,reg04,reg05,reg06,reg07,reg08
219 : real(kind=REAL_DATATYPE) :: final01,final02,final03,final04,final05,final06,final07,final08
220 :
221 0 : if (oldk .eq. 0) return ! nothing to be done
222 :
223 0 : leftk = k
224 0 : rightk = oldk
225 :
226 : ! optimized implementations:
227 0 : if (leftk .eq. 1) then
228 0 : do icol=1,rightk
229 : ! multiply inner products with right t matrix
230 : ! (dtrmv like)
231 0 : yty1 = yty(1,1)
232 0 : t11 = t(leftk+1,leftk+icol)
233 :
234 0 : reg01 = yty1 * t11
235 :
236 0 : do irow=2,icol
237 0 : yty1 = yty(1,irow)
238 0 : t11 = t(leftk+irow,leftk+icol)
239 :
240 0 : reg01 = reg01 + yty1 * t11
241 : end do
242 :
243 : ! multiply intermediate results with left t matrix and store in final t
244 : ! matrix
245 0 : t11 = -t(1,1)
246 0 : final01 = t11 * reg01
247 0 : t(1,leftk+icol) = final01
248 : end do
249 :
250 : !print *,'efficient tmerge - leftk=1'
251 0 : else if (leftk .eq. 2) then
252 0 : do icol=1,rightk
253 : ! multiply inner products with right t matrix
254 : ! (dtrmv like)
255 0 : yty1 = yty(1,1)
256 0 : yty2 = yty(2,1)
257 :
258 0 : t11 = t(leftk+1,leftk+icol)
259 :
260 0 : reg01 = yty1 * t11
261 0 : reg02 = yty2 * t11
262 :
263 0 : do irow=2,icol
264 0 : yty1 = yty(1,irow)
265 0 : yty2 = yty(2,irow)
266 0 : t11 = t(leftk+irow,leftk+icol)
267 :
268 0 : reg01 = reg01 + yty1 * t11
269 0 : reg02 = reg02 + yty2 * t11
270 : end do
271 :
272 : ! multiply intermediate results with left t matrix and store in final t
273 : ! matrix
274 0 : yty1 = -t(1,1)
275 0 : yty2 = -t(1,2)
276 0 : yty3 = -t(2,2)
277 :
278 0 : final01 = reg02 * yty2
279 0 : final02 = reg02 * yty3
280 :
281 0 : final01 = final01 + reg01 * yty1
282 :
283 0 : t(1,leftk+icol) = final01
284 0 : t(2,leftk+icol) = final02
285 : end do
286 :
287 : !print *,'efficient tmerge - leftk=2'
288 0 : else if (leftk .eq. 4) then
289 0 : do icol=1,rightk
290 : ! multiply inner products with right t matrix
291 : ! (dtrmv like)
292 0 : yty1 = yty(1,1)
293 0 : yty2 = yty(2,1)
294 0 : yty3 = yty(3,1)
295 0 : yty4 = yty(4,1)
296 :
297 0 : t11 = t(leftk+1,leftk+icol)
298 :
299 0 : reg01 = yty1 * t11
300 0 : reg02 = yty2 * t11
301 0 : reg03 = yty3 * t11
302 0 : reg04 = yty4 * t11
303 :
304 0 : do irow=2,icol
305 0 : yty1 = yty(1,irow)
306 0 : yty2 = yty(2,irow)
307 0 : yty3 = yty(3,irow)
308 0 : yty4 = yty(4,irow)
309 :
310 0 : t11 = t(leftk+irow,leftk+icol)
311 :
312 0 : reg01 = reg01 + yty1 * t11
313 0 : reg02 = reg02 + yty2 * t11
314 0 : reg03 = reg03 + yty3 * t11
315 0 : reg04 = reg04 + yty4 * t11
316 : end do
317 :
318 : ! multiply intermediate results with left t matrix and store in final t
319 : ! matrix (start from diagonal and move upwards)
320 0 : yty1 = -t(1,1)
321 0 : yty2 = -t(2,2)
322 0 : yty3 = -t(3,3)
323 0 : yty4 = -t(4,4)
324 :
325 : ! main diagonal
326 0 : final01 = reg01 * yty1
327 0 : final02 = reg02 * yty2
328 0 : final03 = reg03 * yty3
329 0 : final04 = reg04 * yty4
330 :
331 : ! above main diagonal
332 0 : yty1 = -t(1,2)
333 0 : yty2 = -t(2,3)
334 0 : yty3 = -t(3,4)
335 :
336 0 : final01 = final01 + reg02 * yty1
337 0 : final02 = final02 + reg03 * yty2
338 0 : final03 = final03 + reg04 * yty3
339 :
340 : ! above first side diagonal
341 0 : yty1 = -t(1,3)
342 0 : yty2 = -t(2,4)
343 :
344 0 : final01 = final01 + reg03 * yty1
345 0 : final02 = final02 + reg04 * yty2
346 :
347 : ! above second side diagonal
348 0 : yty1 = -t(1,4)
349 :
350 0 : final01 = final01 + reg04 * yty1
351 :
352 : ! write back to final matrix
353 0 : t(1,leftk+icol) = final01
354 0 : t(2,leftk+icol) = final02
355 0 : t(3,leftk+icol) = final03
356 0 : t(4,leftk+icol) = final04
357 : end do
358 :
359 : !print *,'efficient tmerge - leftk=4'
360 0 : else if (leftk .eq. 8) then
361 0 : do icol=1,rightk
362 : ! multiply inner products with right t matrix
363 : ! (dtrmv like)
364 0 : yty1 = yty(1,1)
365 0 : yty2 = yty(2,1)
366 0 : yty3 = yty(3,1)
367 0 : yty4 = yty(4,1)
368 0 : yty5 = yty(5,1)
369 0 : yty6 = yty(6,1)
370 0 : yty7 = yty(7,1)
371 0 : yty8 = yty(8,1)
372 :
373 0 : t11 = t(leftk+1,leftk+icol)
374 :
375 0 : reg01 = yty1 * t11
376 0 : reg02 = yty2 * t11
377 0 : reg03 = yty3 * t11
378 0 : reg04 = yty4 * t11
379 0 : reg05 = yty5 * t11
380 0 : reg06 = yty6 * t11
381 0 : reg07 = yty7 * t11
382 0 : reg08 = yty8 * t11
383 :
384 0 : do irow=2,icol
385 0 : yty1 = yty(1,irow)
386 0 : yty2 = yty(2,irow)
387 0 : yty3 = yty(3,irow)
388 0 : yty4 = yty(4,irow)
389 0 : yty5 = yty(5,irow)
390 0 : yty6 = yty(6,irow)
391 0 : yty7 = yty(7,irow)
392 0 : yty8 = yty(8,irow)
393 :
394 0 : t11 = t(leftk+irow,leftk+icol)
395 :
396 0 : reg01 = reg01 + yty1 * t11
397 0 : reg02 = reg02 + yty2 * t11
398 0 : reg03 = reg03 + yty3 * t11
399 0 : reg04 = reg04 + yty4 * t11
400 0 : reg05 = reg05 + yty5 * t11
401 0 : reg06 = reg06 + yty6 * t11
402 0 : reg07 = reg07 + yty7 * t11
403 0 : reg08 = reg08 + yty8 * t11
404 : end do
405 :
406 : ! multiply intermediate results with left t matrix and store in final t
407 : ! matrix (start from diagonal and move upwards)
408 0 : yty1 = -t(1,1)
409 0 : yty2 = -t(2,2)
410 0 : yty3 = -t(3,3)
411 0 : yty4 = -t(4,4)
412 0 : yty5 = -t(5,5)
413 0 : yty6 = -t(6,6)
414 0 : yty7 = -t(7,7)
415 0 : yty8 = -t(8,8)
416 :
417 : ! main diagonal
418 0 : final01 = reg01 * yty1
419 0 : final02 = reg02 * yty2
420 0 : final03 = reg03 * yty3
421 0 : final04 = reg04 * yty4
422 0 : final05 = reg05 * yty5
423 0 : final06 = reg06 * yty6
424 0 : final07 = reg07 * yty7
425 0 : final08 = reg08 * yty8
426 :
427 : ! above main diagonal
428 0 : yty1 = -t(1,2)
429 0 : yty2 = -t(2,3)
430 0 : yty3 = -t(3,4)
431 0 : yty4 = -t(4,5)
432 0 : yty5 = -t(5,6)
433 0 : yty6 = -t(6,7)
434 0 : yty7 = -t(7,8)
435 :
436 0 : final01 = final01 + reg02 * yty1
437 0 : final02 = final02 + reg03 * yty2
438 0 : final03 = final03 + reg04 * yty3
439 0 : final04 = final04 + reg05 * yty4
440 0 : final05 = final05 + reg06 * yty5
441 0 : final06 = final06 + reg07 * yty6
442 0 : final07 = final07 + reg08 * yty7
443 :
444 : ! above first side diagonal
445 0 : yty1 = -t(1,3)
446 0 : yty2 = -t(2,4)
447 0 : yty3 = -t(3,5)
448 0 : yty4 = -t(4,6)
449 0 : yty5 = -t(5,7)
450 0 : yty6 = -t(6,8)
451 :
452 0 : final01 = final01 + reg03 * yty1
453 0 : final02 = final02 + reg04 * yty2
454 0 : final03 = final03 + reg05 * yty3
455 0 : final04 = final04 + reg06 * yty4
456 0 : final05 = final05 + reg07 * yty5
457 0 : final06 = final06 + reg08 * yty6
458 :
459 : !above second side diagonal
460 :
461 0 : yty1 = -t(1,4)
462 0 : yty2 = -t(2,5)
463 0 : yty3 = -t(3,6)
464 0 : yty4 = -t(4,7)
465 0 : yty5 = -t(5,8)
466 :
467 0 : final01 = final01 + reg04 * yty1
468 0 : final02 = final02 + reg05 * yty2
469 0 : final03 = final03 + reg06 * yty3
470 0 : final04 = final04 + reg07 * yty4
471 0 : final05 = final05 + reg08 * yty5
472 :
473 : ! i think you got the idea by now
474 :
475 0 : yty1 = -t(1,5)
476 0 : yty2 = -t(2,6)
477 0 : yty3 = -t(3,7)
478 0 : yty4 = -t(4,8)
479 :
480 0 : final01 = final01 + reg05 * yty1
481 0 : final02 = final02 + reg06 * yty2
482 0 : final03 = final03 + reg07 * yty3
483 0 : final04 = final04 + reg08 * yty4
484 :
485 : ! .....
486 :
487 0 : yty1 = -t(1,6)
488 0 : yty2 = -t(2,7)
489 0 : yty3 = -t(3,8)
490 :
491 0 : final01 = final01 + reg06 * yty1
492 0 : final02 = final02 + reg07 * yty2
493 0 : final03 = final03 + reg08 * yty3
494 :
495 : ! .....
496 :
497 0 : yty1 = -t(1,7)
498 0 : yty2 = -t(2,8)
499 :
500 0 : final01 = final01 + reg07 * yty1
501 0 : final02 = final02 + reg08 * yty2
502 :
503 : ! .....
504 :
505 0 : yty1 = -t(1,8)
506 :
507 0 : final01 = final01 + reg08 * yty1
508 :
509 : ! write back to final matrix
510 0 : t(1,leftk+icol) = final01
511 0 : t(2,leftk+icol) = final02
512 0 : t(3,leftk+icol) = final03
513 0 : t(4,leftk+icol) = final04
514 0 : t(5,leftk+icol) = final05
515 0 : t(6,leftk+icol) = final06
516 0 : t(7,leftk+icol) = final07
517 0 : t(8,leftk+icol) = final08
518 : end do
519 :
520 : !print *,'efficient tmerge - leftk=8'
521 : else
522 : ! reference implementation
523 0 : do icol=1,rightk
524 0 : t(1:leftk,leftk+icol) = yty(1:leftk,icol)
525 : end do
526 : #ifdef DOUBLE_PRECISION_REAL
527 : ! -T1 * Y1'*Y2
528 0 : call dtrmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0_rk8,t(1,1),ldt,t(1,leftk+1),ldt)
529 : ! (-T1 * Y1'*Y2) * T2
530 0 : call dtrmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0_rk8,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt)
531 : #else
532 : ! -T1 * Y1'*Y2
533 0 : call strmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0_rk4,t(1,1),ldt,t(1,leftk+1),ldt)
534 : ! (-T1 * Y1'*Y2) * T2
535 0 : call strmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0_rk4,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt)
536 :
537 : #endif
538 : end if
539 :
540 : end subroutine
541 :
542 :
543 : ! yty structure
544 : ! Y1'*Y2 Y1'*Y3 Y1'*Y4 ...
545 : ! 0 Y2'*Y3 Y2'*Y4 ...
546 : ! 0 0 Y3'*Y4 ...
547 : ! 0 0 0 ...
548 :
549 : subroutine qr_tmerge_set_kernel_&
550 0 : &PRECISION &
551 0 : (k,blocksize,t,ldt,yty,ldy)
552 : use precision
553 : implicit none
554 :
555 : ! input variables (local)
556 : integer(kind=ik) :: ldt,ldy
557 : real(kind=REAL_DATATYPE) :: t(ldt,*),yty(ldy,*)
558 :
559 : ! input variables (global)
560 : integer(kind=ik) :: k,blocksize
561 :
562 : ! output variables (global)
563 :
564 : ! local scalars
565 : integer(kind=ik) :: nr_blocks,current_block
566 : integer(kind=ik) :: remainder,oldk
567 : integer(kind=ik) :: yty_column,toffset
568 :
569 0 : if (k .le. blocksize) return ! nothing to merge
570 :
571 0 : nr_blocks = k / blocksize
572 0 : remainder = k - nr_blocks*blocksize
573 :
574 : ! work in "negative" direction:
575 : ! start with latest T matrix part and add older ones
576 0 : toffset = 1
577 0 : yty_column = 1
578 :
579 0 : if (remainder .gt. 0) then
580 : call qr_pdlarft_merge_kernel_local_&
581 : &PRECISION &
582 0 : (blocksize,remainder,t(toffset,toffset),ldt,yty(1,yty_column),ldy)
583 0 : current_block = 1
584 0 : oldk = remainder+blocksize
585 0 : yty_column = yty_column + blocksize
586 : else
587 : call qr_pdlarft_merge_kernel_local_&
588 : &PRECISION &
589 0 : (blocksize,blocksize,t(toffset,toffset),ldt,yty(1,yty_column),ldy)
590 0 : current_block = 2
591 0 : oldk = 2*blocksize
592 0 : yty_column = yty_column + blocksize
593 : end if
594 :
595 0 : do while (current_block .lt. nr_blocks)
596 : call qr_pdlarft_merge_kernel_local_&
597 : &PRECISION &
598 0 : (blocksize,oldk,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)
599 0 : current_block = current_block + 1
600 0 : oldk = oldk + blocksize
601 0 : yty_column = yty_column + blocksize
602 : end do
603 :
604 : end subroutine
605 : ! yty structure
606 : ! Y1'*Y2 Y1'*Y3 Y1'*Y4 ...
607 : ! 0 Y2'*Y3 Y2'*Y4 ...
608 : ! 0 0 Y3'*Y4 ...
609 : ! 0 0 0 ...
610 :
611 : subroutine qr_tmerge_tree_kernel_&
612 0 : &PRECISION &
613 0 : (k,blocksize,treeorder,t,ldt,yty,ldy)
614 : use precision
615 : implicit none
616 :
617 : ! input variables (local)
618 : integer(kind=ik) :: ldt,ldy
619 : real(kind=REAL_DATATYPE) :: t(ldt,*),yty(ldy,*)
620 :
621 : ! input variables (global)
622 : integer(kind=ik) :: k,blocksize,treeorder
623 :
624 : ! output variables (global)
625 :
626 : ! local scalars
627 : integer temp_blocksize,nr_sets,current_set,setsize,nr_blocks
628 : integer remainder,max_treeorder,remaining_size
629 : integer toffset,yty_column
630 : integer toffset_start,yty_column_start
631 : integer yty_end,total_remainder,yty_remainder
632 :
633 0 : if (treeorder .eq. 0) return ! no merging
634 :
635 0 : if (treeorder .eq. 1) then
636 : call qr_tmerge_set_kernel_&
637 : &PRECISION &
638 0 : (k,blocksize,t,ldt,yty,ldy)
639 0 : return
640 : end if
641 :
642 0 : nr_blocks = k / blocksize
643 0 : max_treeorder = min(nr_blocks,treeorder)
644 :
645 0 : if (max_treeorder .eq. 1) then
646 : call qr_tmerge_set_kernel_&
647 : &PRECISION &
648 0 : (k,blocksize,t,ldt,yty,ldy)
649 0 : return
650 : end if
651 :
652 : ! work in "negative" direction: from latest set to oldest set
653 : ! implementation differs from rev=0 version due to issues with
654 : ! calculating the remainder parts
655 : ! compared to the rev=0 version we split remainder parts directly from
656 : ! parts which can be easily merged in a recursive way
657 :
658 0 : yty_end = (k / blocksize) * blocksize
659 0 : if (yty_end .eq. k) then
660 0 : yty_end = yty_end - blocksize
661 : end if
662 :
663 : !print *,'tree',yty_end,k,blocksize
664 :
665 0 : yty_column_start = 1
666 0 : toffset_start = 1
667 :
668 : ! is there a remainder block?
669 0 : nr_blocks = k / blocksize
670 0 : remainder = k - nr_blocks * blocksize
671 0 : if (remainder .eq. 0) then
672 : !print *,'no initial remainder'
673 :
674 : ! set offsets to the very beginning as there is no remainder part
675 0 : yty_column_start = 1
676 0 : toffset_start = 1
677 0 : total_remainder = 0
678 0 : remaining_size = k
679 0 : yty_remainder = 0
680 : else
681 : !print *,'starting with initial remainder'
682 : ! select submatrix and make remainder block public
683 0 : yty_column_start = 1 + blocksize
684 0 : toffset_start = 1 + remainder
685 0 : total_remainder = remainder
686 0 : remaining_size = k - remainder
687 0 : yty_remainder = 1
688 : end if
689 :
690 : ! from now on it is a clean set of blocks with sizes of multiple of
691 : ! blocksize
692 :
693 0 : temp_blocksize = blocksize
694 :
695 : !-------------------------------
696 0 : do while (remaining_size .gt. 0)
697 0 : nr_blocks = remaining_size / temp_blocksize
698 0 : max_treeorder = min(nr_blocks,treeorder)
699 :
700 0 : if (max_treeorder .eq. 1) then
701 0 : remainder = 0
702 0 : nr_sets = 0
703 0 : setsize = 0
704 :
705 0 : if (yty_remainder .gt. 0) then
706 0 : yty_column = yty_remainder
707 : !print *,'final merging with remainder',temp_blocksize,k,remaining_size,yty_column
708 : call qr_tmerge_set_kernel_&
709 : &PRECISION &
710 0 : (k,temp_blocksize,t,ldt,yty(1,yty_column),ldy)
711 : else
712 : !print *,'no remainder - no merging needed',temp_blocksize,k,remaining_size
713 : endif
714 :
715 0 : remaining_size = 0
716 :
717 0 : return ! done
718 : else
719 0 : nr_sets = nr_blocks / max_treeorder
720 0 : setsize = max_treeorder*temp_blocksize
721 0 : remainder = remaining_size - nr_sets*setsize
722 : end if
723 :
724 0 : if (remainder .gt. 0) then
725 0 : if (remainder .gt. temp_blocksize) then
726 0 : toffset = toffset_start
727 0 : yty_column = yty_column_start
728 :
729 : !print *,'set merging', toffset, yty_column,remainder
730 : call qr_tmerge_set_kernel_&
731 : &PRECISION &
732 0 : (remainder,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)
733 0 : if (total_remainder .gt. 0) then
734 : ! merge with existing global remainder part
735 : !print *,'single+set merging',yty_remainder,total_remainder,remainder
736 : call qr_pdlarft_merge_kernel_local_&
737 : &PRECISION &
738 0 : (remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy)
739 0 : yty_remainder = yty_remainder + remainder
740 0 : toffset_start = toffset_start + remainder
741 :
742 : !print *,'single+set merging (new offsets)',yty_remainder,yty_column_start,toffset_start
743 :
744 0 : yty_column_start = yty_column_start + remainder
745 : else
746 : ! create new remainder part
747 : !print *,'new remainder+set',yty_remainder
748 0 : yty_remainder = yty_column_start + remainder - temp_blocksize
749 0 : yty_column_start = yty_column_start + remainder
750 0 : toffset_start = toffset_start + remainder
751 : !print *,'new remainder+set (new offsets)',yty_remainder,yty_column_start,toffset_start
752 : end if
753 :
754 : else
755 0 : if (total_remainder .gt. 0) then
756 : ! merge with existing global remainder part
757 : !print *,'single merging',yty_remainder,total_remainder,remainder
758 : call qr_pdlarft_merge_kernel_local_&
759 : &PRECISION &
760 0 : (remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy)
761 0 : yty_remainder = yty_remainder + remainder
762 0 : toffset_start = toffset_start + remainder
763 :
764 : !print *,'single merging (new offsets)',yty_remainder,yty_column_start,toffset_start
765 :
766 0 : yty_column_start = yty_column_start + remainder
767 : else
768 : ! create new remainder part
769 : !print *,'new remainder',yty_remainder
770 0 : yty_remainder = yty_column_start
771 0 : yty_column_start = yty_column_start + temp_blocksize
772 0 : toffset_start = toffset_start + remainder
773 : !print *,'new remainder (new offsets)',yty_remainder,yty_column_start,toffset_start
774 : end if
775 : end if
776 :
777 0 : total_remainder = total_remainder + remainder
778 0 : remaining_size = remaining_size - remainder
779 : end if
780 :
781 0 : current_set = 0
782 0 : do while (current_set .lt. nr_sets)
783 0 : toffset = toffset_start + current_set * setsize
784 0 : yty_column = yty_column_start + current_set * setsize
785 :
786 : !print *,'recursive merging', toffset, yty_column,setsize
787 : call qr_tmerge_set_kernel_&
788 : &PRECISION &
789 0 : (setsize,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)
790 :
791 0 : current_set = current_set + 1
792 : end do
793 :
794 : !print *,'increasing blocksize', temp_blocksize, setsize
795 0 : yty_column_start = yty_column_start + (setsize - temp_blocksize)
796 0 : temp_blocksize = setsize
797 : end do
798 : end subroutine
799 :
800 :
801 : ! yty should not contain the inner products vi'*vi
802 :
803 : subroutine qr_dlarft_kernel_&
804 0 : &PRECISION &
805 0 : (n,tau,yty,ldy,t,ldt)
806 : use precision
807 : implicit none
808 :
809 : ! input variables
810 : integer(kind=ik) :: n,ldy,ldt
811 : real(kind=REAL_DATATYPE) :: tau(*),yty(ldy,*)
812 :
813 : ! output variables
814 : real(kind=REAL_DATATYPE) :: t(ldt,*)
815 :
816 : ! local variables
817 : integer(kind=ik) :: icol
818 :
819 : ! DEBUG: clear buffer first
820 : !t(1:n,1:n) = 0.0d0
821 :
822 : ! T1 = tau1
823 : ! | tauk Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' |
824 : ! | 0 Tk-1 |
825 0 : t(n,n) = tau(n)
826 0 : do icol=n-1,1,-1
827 0 : t(icol,icol+1:n) = -tau(icol)*yty(icol,icol:n-1)
828 : #ifdef DOUBLE_PRECISION_REAL
829 0 : call dtrmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt)
830 : #else
831 0 : call strmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt)
832 : #endif
833 0 : t(icol,icol) = tau(icol)
834 : end do
835 0 : end subroutine
836 :
837 : ! vim: syntax=fortran
|