#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

! Pack a filled row group (i.e. an array of consecutive rows)

subroutine pack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
rows, n_offset, row_count)
  use cuda_c_kernel
  use cuda_functions
  use precision
  use iso_c_binding
  implicit none
  integer(kind=c_intptr_t)      :: row_group_dev, a_dev

  integer(kind=ik), intent(in)  :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
  integer(kind=ik), intent(in)  :: n_offset, row_count
#if REALCASE == 1
  real(kind=C_DATATYPE_KIND)    :: rows(:,:)
#endif
#if COMPLEXCASE == 1
  complex(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
  integer(kind=ik)              :: max_idx
  logical                       :: successCUDA

  ! Use many blocks for higher GPU occupancy
  max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

  ! Use one kernel call to pack the entire row group
  ! call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)

  call launch_my_pack_gpu_kernel_&
  &MATH_DATATYPE&
  &_&
  &PRECISION &
  (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev)

  ! Issue one single transfer call for all rows (device to host)
  ! rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)

  successCUDA = cuda_memcpy(loc(rows(1, 1)), row_group_dev, row_count * l_nev * size_of_&
  &PRECISION&
  &_&
  &MATH_DATATYPE&
  &, cudaMemcpyDeviceToHost)
  if (.not.(successCUDA)) then
    print *,"pack_row_group_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION&
    &: error in cudaMemcpy"
    stop 1
  endif

end subroutine
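
! For illustration only: a minimal CUDA sketch of the gather such a pack
! kernel performs (hypothetical code, not the kernel shipped in
! cuda_c_kernel). It assumes the column-major device layouts
! a(stripe_width, a_dim2, stripe_count) and row_group(l_nev, row_count)
! implied by the index arithmetic above:
!
!   __global__ void pack_sketch(int n_offset, int max_idx, int stripe_width,
!                               int a_dim2, int l_nev, const double *a,
!                               double *row_group)
!   {
!     int col = blockIdx.x * blockDim.x + threadIdx.x;  // 0 .. max_idx-1
!     int row = blockIdx.y;                             // 0 .. row_count-1
!     if (col >= max_idx) return;
!     int stripe    = col / stripe_width;    // stripe holding this column
!     int in_stripe = col % stripe_width;    // offset inside that stripe
!     row_group[(size_t)row * l_nev + col] =
!       a[in_stripe + (size_t)(n_offset + row) * stripe_width
!                   + (size_t)stripe * stripe_width * a_dim2];
!   }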


! Unpack a filled row group (i.e. an array of consecutive rows)
subroutine unpack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
a_dim2, l_nev, rows, n_offset, row_count)
  use cuda_c_kernel
  use precision
  use iso_c_binding
  use cuda_functions
  implicit none
  integer(kind=c_intptr_t)                  :: row_group_dev, a_dev
  integer(kind=ik), intent(in)              :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
  integer(kind=ik), intent(in)              :: n_offset, row_count
#if REALCASE == 1
  real(kind=C_DATATYPE_KIND), intent(in)    :: rows(:, :)
#endif
#if COMPLEXCASE == 1
  complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif

  integer(kind=ik)                          :: max_idx
  logical                                   :: successCUDA

  ! Use many blocks for higher GPU occupancy
  max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

  ! Issue one single transfer call for all rows (host to device)
  ! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)

  successCUDA = cuda_memcpy(row_group_dev, loc(rows(1, 1)), row_count * l_nev * &
  size_of_&
  &PRECISION&
  &_&
  &MATH_DATATYPE&
  &, cudaMemcpyHostToDevice)
  if (.not.(successCUDA)) then
    print *,"unpack_row_group_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION&
    &: error in cudaMemcpy"
    stop 1
  endif

  ! Use one kernel call to unpack the entire row group
  ! call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)

  call launch_my_unpack_gpu_kernel_&
  &MATH_DATATYPE&
  &_&
  &PRECISION &
  (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, &
  row_group_dev, a_dev)

end subroutine
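
! For illustration only: the unpack direction is the exact inverse scatter
! (again a hypothetical sketch, not the shipped kernel). With the same index
! arithmetic as in the pack sketch above, only the assignment is reversed:
!
!   a[in_stripe + (size_t)(n_offset + row) * stripe_width
!               + (size_t)stripe * stripe_width * a_dim2]
!     = row_group[(size_t)row * l_nev + col];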

! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
! occurs when the queue is full or when the next row belongs to another group
subroutine unpack_and_prepare_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
(row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, next_unpack_idx, force)

  use iso_c_binding
  use precision
  implicit none
#if REALCASE == 1
  real(kind=C_DATATYPE_KIND)      :: row_group(:,:)
#endif
#if COMPLEXCASE == 1
  complex(kind=C_DATATYPE_KIND)   :: row_group(:,:)
#endif
  integer(kind=c_intptr_t)        :: row_group_dev, a_dev
  integer(kind=ik), intent(in)    :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
  integer(kind=ik), intent(inout) :: row_group_size
  integer(kind=ik), intent(in)    :: nblk
  integer(kind=ik), intent(inout) :: unpack_idx
  integer(kind=ik), intent(in)    :: next_unpack_idx
  logical, intent(in)             :: force

  if (row_group_size == 0) then
    ! Nothing to flush, just prepare for the upcoming row
    row_group_size = 1
  else
    if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
      ! A flush and a reset must be performed
      call unpack_row_group_&
      &MATH_DATATYPE&
      &_gpu_&
      &PRECISION&
      (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
      a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size)
      row_group_size = 1
    else
      ! Just prepare for the upcoming row
      row_group_size = row_group_size + 1
    endif
  endif
  ! Always update the index for the upcoming row
  unpack_idx = next_unpack_idx
end subroutine
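
! For illustration (assuming unpack_idx counts rows from 1): with nblk = 4 and
! force = .false., successive calls with next_unpack_idx = 5, 6, 7, 8, 9 grow
! the group to size 4; the call with next_unpack_idx = 9 then flushes rows
! 5..8 (n_offset = unpack_idx - row_group_size = 8 - 4 = 4) and starts a new
! group of size 1 for row 9.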

! The host wrapper for computing the dot products between consecutive HH reflectors (see the kernel below)
subroutine compute_hh_dot_products_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
& (bcast_buffer_dev, hh_dot_dev, nbw, n)
  use cuda_c_kernel
  use precision
  use iso_c_binding
  implicit none
  integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_dot_dev
  integer(kind=ik), value  :: nbw, n

  if (n .le. 1) return
  call launch_compute_hh_dotp_gpu_kernel_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  & (bcast_buffer_dev, hh_dot_dev, nbw, n)

end subroutine
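
! For illustration only: a hypothetical sketch of the reduction pattern such a
! dot-product kernel uses (the shipped kernel additionally handles the
! one-element shift between consecutive reflectors and the "tau" element in
! position 0; see the kernel notes at the end of this file). One thread block
! of power-of-2 size computes one dot product via shared memory:
!
!   __global__ void hh_dotp_sketch(const double *hh, double *hh_dot,
!                                  int nbw, int n)
!   {
!     extern __shared__ double sh[];
!     int tid  = threadIdx.x;  // block size: power of 2, at most 128
!     int pair = blockIdx.x;   // one block per reflector pair, 0 .. n-2
!     double acc = 0.0;
!     for (int k = tid; k < nbw; k += blockDim.x)
!       acc += hh[pair * nbw + k] * hh[(pair + 1) * nbw + k];
!     sh[tid] = acc;
!     __syncthreads();
!     for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction
!       if (tid < s) sh[tid] += sh[tid + s];
!       __syncthreads();
!     }
!     if (tid == 0) hh_dot[pair] = sh[0];
!   }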

! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
subroutine extract_hh_tau_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
& (bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
  use cuda_c_kernel
  use precision
  use iso_c_binding
  implicit none
  integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
  integer(kind=ik), value  :: nbw, n
  logical, value           :: is_zero
  integer(kind=ik)         :: val_is_zero

  if (is_zero) then
    val_is_zero = 1
  else
    val_is_zero = 0
  endif

  call launch_extract_hh_tau_gpu_kernel_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  & (bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
end subroutine
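
! For illustration only: a hypothetical sketch of what such an extraction
! kernel does, assuming reflectors of length nbw stored contiguously with
! "tau" in position 0 (see the kernel notes below): save tau, then overwrite
! position 0 with 1.0, or with 0.0 when the reflector is zero:
!
!   __global__ void extract_tau_sketch(double *hh, double *hh_tau,
!                                      int nbw, int n, int is_zero)
!   {
!     int i = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per reflector
!     if (i >= n) return;
!     hh_tau[i] = hh[(size_t)i * nbw];            // save tau
!     hh[(size_t)i * nbw] = is_zero ? 0.0 : 1.0;  // replace it in the reflector
!   }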

! ---------------------------------------------
! Reduction and HH dot-product support kernels
! ---------------------------------------------

! Reset a reduction block
! Limitation: the thread-block size must be a divisor of the reduction block's size
! Reset 2 reduction blocks without an explicit synchronization at the end
! Limitation: the thread-block size must be a divisor of the reduction block's size
! Perform a reduction on an initialized, 128-element shared block
! Compute the dot product between 2 consecutive HH vectors
! Limitation 1: the size of the thread block must be at most 128 and a power of 2
! Limitation 2: the size of the warp must be equal to 32
!
! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on the case)
! Having "tau" as the first element in an HH reflector reduces space requirements, but causes undesired branching in the kernels
!
! -------------------------------------------
! Fortran back-transformation support kernels
! -------------------------------------------
!
! This is the simplest and slowest available backtransformation kernel
!
! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply
! 2 Householder reflectors per iteration; a sketch of the basic update follows below
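!
! For illustration only: a hypothetical sketch of the per-reflector update
! performed by the simple variant, x := x - tau * v * (v^T x), with one
! thread block per eigenvector column and a shared-memory reduction for
! v^T x (v has length nb with v[0] == 1; blockDim.x is a power of 2):
!
!   // q: nb x ncols, column-major; launched with gridDim.x == ncols
!   __global__ void apply_hh_sketch(double *q, const double *v,
!                                   double tau, int nb)
!   {
!     extern __shared__ double sh[];
!     double *x = q + (size_t)blockIdx.x * nb;  // this block's column
!     int t = threadIdx.x;
!     double acc = 0.0;
!     for (int k = t; k < nb; k += blockDim.x)  // partial dot products
!       acc += v[k] * x[k];
!     sh[t] = acc;
!     __syncthreads();
!     for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction
!       if (t < s) sh[t] += sh[t + s];
!       __syncthreads();
!     }
!     double dot = sh[0];                       // v^T x, visible to all threads
!     for (int k = t; k < nb; k += blockDim.x)  // rank-1 update of the column
!       x[k] -= tau * v[k] * dot;
!   }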
!
! ---------------------------------
! Row packing and unpacking kernels
! ---------------------------------
!
! The row group packing kernel

! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note:
! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits the best performance)
! - "compute_hh_trafo_kernel" is the Fortran equivalent of the C kernel
! - "compute_hh_trafo_single_kernel" is the reference Fortran kernel