1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
// Appends to `source` the OpenCL C text of the kernel `cg_vector_update`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel, per element i (each work-item starts at get_global_id(0) and steps by get_global_size(0)):
//   result[i] += alpha * p[i];
//   r[i]      -= alpha * Ap[i];
//   p[i]       = r[i] (updated) + beta * p[i] (old);
// and every work-item accumulates the square of the updated r[i]. The per-work-item sums are then combined
// inside the work-group through `shared_array`, and work-item 0 stores the group total in
// inner_prod_buffer[get_group_id(0)].
31 template<
typename StringT>
// --- kernel signature ---
34 source.append(
"__kernel void cg_vector_update( \n");
35 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
36 source.append(
" "); source.append(numeric_string); source.append(
" alpha, \n");
37 source.append(
" __global "); source.append(numeric_string); source.append(
" * p, \n");
38 source.append(
" __global "); source.append(numeric_string); source.append(
" * r, \n");
39 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
40 source.append(
" "); source.append(numeric_string); source.append(
" beta, \n");
41 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
42 source.append(
" unsigned int size, \n");
43 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
44 source.append(
"{ \n");
// --- fused vector updates; p[i] and r[i] are read once into locals and written back once ---
45 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
46 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
47 source.append(
" "); source.append(numeric_string); source.append(
" value_p = p[i]; \n");
48 source.append(
" "); source.append(numeric_string); source.append(
" value_r = r[i]; \n");
50 source.append(
" result[i] += alpha * value_p; \n");
51 source.append(
" value_r -= alpha * Ap[i]; \n");
52 source.append(
" value_p = value_r + beta * value_p; \n");
54 source.append(
" p[i] = value_p; \n");
55 source.append(
" r[i] = value_r; \n");
56 source.append(
" inner_prod_contrib += value_r * value_r; \n");
57 source.append(
" } \n");
// --- work-group reduction: the stride starts at half the local size and is halved each pass ---
// NOTE(review): a halving stride only folds in every entry when the local size is a power of two — confirm launch config.
60 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
61 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
62 source.append(
" { \n");
63 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
64 source.append(
" if (get_local_id(0) < stride) \n");
65 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
// NOTE(review): no append that closes the reduction loop's '{' is visible before the write-back below;
// confirm against the complete file that the emitted kernel's braces balance.
// --- work-item 0 publishes the group's partial sum ---
69 source.append(
" if (get_local_id(0) == 0) \n ");
70 source.append(
" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
72 source.append(
"} \n");
// Appends to `source` the OpenCL C text of the kernel `cg_csr_prod`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel: for each row (work-items start at get_global_id(0) and step by get_global_size(0)) it sums
// elements[i] * p[column_indices[i]] for i in [row_indices[row], row_indices[row+1]), stores the sum in Ap[row],
// and accumulates two per-work-item scalars: sum*sum and p[row]*sum. Both scalars are reduced inside the
// work-group through the two local arrays; work-item 0 then writes them to
// inner_prod_buffer[buffer_size + group] and inner_prod_buffer[2*buffer_size + group].
75 template<
typename StringT>
// --- kernel signature ---
78 source.append(
"__kernel void cg_csr_prod( \n");
79 source.append(
" __global const unsigned int * row_indices, \n");
80 source.append(
" __global const unsigned int * column_indices, \n");
81 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
82 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
83 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
84 source.append(
" unsigned int size, \n");
85 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
86 source.append(
" unsigned int buffer_size, \n");
87 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
88 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
89 source.append(
"{ \n");
// --- row-wise product plus the two running inner-product contributions ---
90 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
91 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
92 source.append(
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
93 source.append(
" { \n");
94 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = ("); source.append(numeric_string); source.append(
")0; \n");
95 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
96 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
97 source.append(
" dot_prod += elements[i] * p[column_indices[i]]; \n");
98 source.append(
" Ap[row] = dot_prod; \n");
99 source.append(
" inner_prod_ApAp += dot_prod * dot_prod; \n");
100 source.append(
" inner_prod_pAp += p[row] * dot_prod; \n");
101 source.append(
" } \n");
// --- work-group reduction of both scalars; the stride is halved each pass ---
104 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
105 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
106 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
107 source.append(
" { \n");
108 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
109 source.append(
" if (get_local_id(0) < stride) { \n");
110 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
111 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
112 source.append(
" } ");
113 source.append(
" } ");
// --- work-item 0 writes the group totals at offsets buffer_size and 2*buffer_size ---
116 source.append(
" if (get_local_id(0) == 0) { \n ");
117 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
118 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
119 source.append(
" } \n");
121 source.append(
"} \n \n");
// Appends to `source` the OpenCL C text of the kernel `cg_coo_prod`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel: each work-group handles the entry range [group_boundaries[g], group_boundaries[g+1]) in
// chunks of get_local_size(0) entries. For an entry, coords[].x is compared as the row and coords[].y indexes p.
// Within a chunk, products that share a row are summed in `inter_results` (keyed by `shared_rows`) with a
// stride-doubling pass; the last work-item's partial result is carried into the next chunk by work-item 0.
// Whenever a row's final sum is known it is stored in Ap[row] and the work-item accumulates Ap_entry*Ap_entry
// and p[row]*Ap_entry. These two scalars are finally reduced per work-group and written to
// inner_prod_buffer[buffer_size + group] and inner_prod_buffer[2*buffer_size + group].
125 template<
typename StringT>
// --- kernel signature ---
128 source.append(
"__kernel void cg_coo_prod( \n");
129 source.append(
" __global const uint2 * coords, \n");
130 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
131 source.append(
" __global const uint * group_boundaries, \n");
132 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
133 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
134 source.append(
" unsigned int size, \n");
135 source.append(
" __local unsigned int * shared_rows, \n");
136 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
137 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
138 source.append(
" unsigned int buffer_size, \n");
139 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
140 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
141 source.append(
"{ \n");
142 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
143 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
// --- this group's entry range; k_end is the number of local-size chunks needed to cover it (0 if empty) ---
146 source.append(
" uint2 tmp; \n");
147 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
148 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
149 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
150 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");
152 source.append(
" uint local_index = 0; \n");
154 source.append(
" for (uint k = 0; k < k_end; ++k) { \n");
155 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
// out-of-range work-items use a zero coordinate pair and a zero product
157 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
158 source.append(
" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
// carry-over from the previous chunk (k > 0): work-item 0 either continues the row held by the last
// local slot, or finalizes that row by writing Ap and updating the inner-product contributions
161 source.append(
" if (get_local_id(0) == 0 && k > 0) { \n");
162 source.append(
" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
163 source.append(
" val += inter_results[get_local_size(0)-1]; \n");
164 source.append(
" else {\n");
165 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_size(0)-1]; \n");
166 source.append(
" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
167 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
168 source.append(
" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
169 source.append(
" } \n");
170 source.append(
" } \n");
// publish this chunk's rows and products to local memory (barrier first: the old values were just read above)
173 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
174 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
175 source.append(
" inter_results[get_local_id(0)] = val; \n");
176 source.append(
" "); source.append(numeric_string); source.append(
" left = 0; \n");
177 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// stride-doubling pass: add the value `stride` slots to the left only if it belongs to the same row;
// the read and the write are separated by barriers
179 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
180 source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
181 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
182 source.append(
" inter_results[get_local_id(0)] += left; \n");
183 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
184 source.append(
" } \n");
// a work-item whose right neighbour holds a different row owns the finished sum for its row
// (the last local slot is excluded here; it is handled by the carry-over or by the tail check after the loop)
187 source.append(
" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
188 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
189 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
190 source.append(
" Ap[tmp.x] = Ap_entry; \n");
191 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
192 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
193 source.append(
" } \n");
195 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
196 source.append(
" } \n");
// tail: the work-item holding the very last entry of the group's range writes that row's result
198 source.append(
" if (local_index + 1 == group_end) {\n");
199 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
200 source.append(
" Ap[tmp.x] = Ap_entry; \n");
201 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
202 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
203 source.append(
" } \n");
// --- work-group reduction of both scalars; the stride is halved each pass ---
206 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
207 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
208 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
209 source.append(
" { \n");
210 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
211 source.append(
" if (get_local_id(0) < stride) { \n");
212 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
213 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
214 source.append(
" } ");
215 source.append(
" } ");
// --- work-item 0 writes the group totals at offsets buffer_size and 2*buffer_size ---
218 source.append(
" if (get_local_id(0) == 0) { \n ");
219 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
220 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
221 source.append(
" } \n");
223 source.append(
"} \n \n");
// Appends to `source` the OpenCL C text of the kernel `cg_ell_prod`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel: for each row (work-items start at get_global_id(0) and step by get_global_size(0)) it walks
// `items_per_row` entries located at offset = row + item_id * internal_row_num, adds
// p[coords[offset]] * elements[offset] for every non-zero element, stores the sum in Ap[row], and accumulates
// sum*sum and p[row]*sum. Both scalars are reduced per work-group and written to
// inner_prod_buffer[buffer_size + group] and inner_prod_buffer[2*buffer_size + group].
// The kernel parameter `aligned_items_per_row` is declared but not referenced in the emitted body.
228 template<
typename StringT>
// --- kernel signature ---
231 source.append(
"__kernel void cg_ell_prod( \n");
232 source.append(
" __global const unsigned int * coords, \n");
233 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
234 source.append(
" unsigned int internal_row_num, \n");
235 source.append(
" unsigned int items_per_row, \n");
236 source.append(
" unsigned int aligned_items_per_row, \n");
237 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
238 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
239 source.append(
" unsigned int size, \n");
240 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
241 source.append(
" unsigned int buffer_size, \n");
242 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
243 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
244 source.append(
"{ \n");
245 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
246 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
247 source.append(
" uint glb_id = get_global_id(0); \n");
248 source.append(
" uint glb_sz = get_global_size(0); \n");
// --- row-wise product; consecutive entries of one row are internal_row_num apart ---
250 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
251 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
253 source.append(
" uint offset = row; \n");
254 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
255 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[offset]; \n");
// zero-valued entries contribute nothing and their coords[] entry is not dereferenced into p
256 source.append(
" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(
")0; \n");
257 source.append(
" } \n");
259 source.append(
" Ap[row] = sum; \n");
260 source.append(
" inner_prod_ApAp += sum * sum; \n");
261 source.append(
" inner_prod_pAp += p[row] * sum; \n");
262 source.append(
" } \n");
// --- work-group reduction of both scalars; the stride is halved each pass ---
265 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
266 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
267 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
268 source.append(
" { \n");
269 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
270 source.append(
" if (get_local_id(0) < stride) { \n");
271 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
272 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
273 source.append(
" } ");
274 source.append(
" } ");
// --- work-item 0 writes the group totals at offsets buffer_size and 2*buffer_size ---
277 source.append(
" if (get_local_id(0) == 0) { \n ");
278 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
279 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
280 source.append(
" } \n");
281 source.append(
"} \n \n");
// Appends to `source` the OpenCL C text of the kernel `cg_sliced_ell_prod`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel: work-groups iterate over blocks (starting at get_group_id(0), stepping by get_num_groups(0)).
// Inside a block, work-item `local_id` handles row = block_idx * local_size + local_id and reads
// columns_per_block[block_idx] entries at index = block_start[block_idx] + item_id * local_size + local_id,
// adding p[column_indices[index]] * elements[index] for every non-zero element. If row < size the sum is stored
// in Ap[row] and sum*sum and p[row]*sum are accumulated. Both scalars are reduced per work-group and written to
// inner_prod_buffer[buffer_size + group] and inner_prod_buffer[2*buffer_size + group].
284 template<
typename StringT>
// --- kernel signature ---
287 source.append(
"__kernel void cg_sliced_ell_prod( \n");
288 source.append(
" __global const unsigned int * columns_per_block, \n");
289 source.append(
" __global const unsigned int * column_indices, \n");
290 source.append(
" __global const unsigned int * block_start, \n");
291 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
292 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
293 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
294 source.append(
" unsigned int size, \n");
295 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
296 source.append(
" unsigned int buffer_size, \n");
297 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
298 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
299 source.append(
"{ \n");
300 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
301 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
302 source.append(
" uint local_id = get_local_id(0); \n");
303 source.append(
" uint local_size = get_local_size(0); \n");
// --- block loop ---
// NOTE(review): the bound is inclusive (block_idx <= size / local_size), so when size is an exact multiple of
// local_size the index size / local_size is still read from block_start[] and columns_per_block[] — confirm
// the host-side buffers cover that index.
305 source.append(
" for (uint block_idx = get_group_id(0); block_idx <= size / local_size; block_idx += get_num_groups(0)) { \n");
306 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
308 source.append(
" uint row = block_idx * local_size + local_id; \n");
309 source.append(
" uint offset = block_start[block_idx]; \n");
310 source.append(
" uint num_columns = columns_per_block[block_idx]; \n");
311 source.append(
" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
312 source.append(
" uint index = offset + item_id * local_size + local_id; \n");
313 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[index]; \n");
// zero-valued entries contribute nothing and their column index is not dereferenced into p
314 source.append(
" sum += val ? (p[column_indices[index]] * val) : 0; \n");
315 source.append(
" } \n");
// rows at or beyond `size` compute a sum but do not store it
317 source.append(
" if (row < size) {\n");
318 source.append(
" Ap[row] = sum; \n");
319 source.append(
" inner_prod_ApAp += sum * sum; \n");
320 source.append(
" inner_prod_pAp += p[row] * sum; \n");
321 source.append(
" } \n");
322 source.append(
" } \n");
// --- work-group reduction of both scalars; the stride is halved each pass ---
325 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
326 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
327 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
328 source.append(
" { \n");
329 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
330 source.append(
" if (get_local_id(0) < stride) { \n");
331 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
332 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
333 source.append(
" } ");
334 source.append(
" } ");
// --- work-item 0 writes the group totals at offsets buffer_size and 2*buffer_size ---
337 source.append(
" if (get_local_id(0) == 0) { \n ");
338 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
339 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
340 source.append(
" } \n");
341 source.append(
"} \n \n");
// Appends to `source` the OpenCL C text of the kernel `cg_hyb_prod`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel: for each row (work-items start at get_global_id(0) and step by get_global_size(0)) the sum is
// built from two parts:
//   1. `items_per_row` entries of ell_elements/ell_coords at offset = row + item_id * internal_row_num
//      (zero-valued entries are skipped), and
//   2. the entries csr_elements[i] * p[csr_cols[i]] for i in [csr_rows[row], csr_rows[row + 1]).
// The sum is stored in Ap[row]; sum*sum and p[row]*sum are accumulated, reduced per work-group, and written to
// inner_prod_buffer[buffer_size + group] and inner_prod_buffer[2*buffer_size + group].
// The kernel parameter `aligned_items_per_row` is declared but not referenced in the emitted body.
344 template<
typename StringT>
// --- kernel signature (note: ell_coords is declared as signed `int`, the CSR index arrays as `uint`) ---
347 source.append(
"__kernel void cg_hyb_prod( \n");
348 source.append(
" const __global int* ell_coords, \n");
349 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
350 source.append(
" const __global uint* csr_rows, \n");
351 source.append(
" const __global uint* csr_cols, \n");
352 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
353 source.append(
" unsigned int internal_row_num, \n");
354 source.append(
" unsigned int items_per_row, \n");
355 source.append(
" unsigned int aligned_items_per_row, \n");
356 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
357 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
358 source.append(
" unsigned int size, \n");
359 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
360 source.append(
" unsigned int buffer_size, \n");
361 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
362 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
363 source.append(
"{ \n");
364 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
365 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
366 source.append(
" uint glb_id = get_global_id(0); \n");
367 source.append(
" uint glb_sz = get_global_size(0); \n");
369 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
370 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
// part 1: ELL-stored entries of this row
372 source.append(
" uint offset = row; \n");
373 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
374 source.append(
" "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
375 source.append(
" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
376 source.append(
" } \n");
// part 2: CSR-stored entries of this row
378 source.append(
" uint col_begin = csr_rows[row]; \n");
379 source.append(
" uint col_end = csr_rows[row + 1]; \n");
381 source.append(
" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
382 source.append(
" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
383 source.append(
" } \n");
385 source.append(
" Ap[row] = sum; \n");
386 source.append(
" inner_prod_ApAp += sum * sum; \n");
387 source.append(
" inner_prod_pAp += p[row] * sum; \n");
388 source.append(
" } \n");
// --- work-group reduction of both scalars; the stride is halved each pass ---
391 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
392 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
393 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
394 source.append(
" { \n");
395 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
396 source.append(
" if (get_local_id(0) < stride) { \n");
397 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
398 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
399 source.append(
" } ");
400 source.append(
" } ");
// --- work-item 0 writes the group totals at offsets buffer_size and 2*buffer_size ---
403 source.append(
" if (get_local_id(0) == 0) { \n ");
404 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
405 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
406 source.append(
" } \n");
407 source.append(
"} \n \n");
// Appends to `source` the OpenCL C text of the kernel `bicgstab_update_s`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel, in three stages:
//   1. Every work-group sums inner_prod_buffer[local_id] and inner_prod_buffer[local_id + 3 * chunk_size]
//      over its local ids (two local arrays) and sets alpha = first sum / second sum.
//   2. For each element i (work-items start at get_global_id(0), step by get_global_size(0)):
//      s[i] = r[i] - alpha * Ap[i], while accumulating s[i] * s[i].
//   3. The accumulated squares are reduced per work-group and work-item 0 writes the total to
//      inner_prod_buffer[get_group_id(0) + chunk_offset].
414 template<
typename StringT>
// --- kernel signature ---
417 source.append(
"__kernel void bicgstab_update_s( \n");
418 source.append(
" __global "); source.append(numeric_string); source.append(
" * s, \n");
419 source.append(
" __global "); source.append(numeric_string); source.append(
" const * r, \n");
420 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
421 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
422 source.append(
" unsigned int chunk_size, \n");
423 source.append(
" unsigned int chunk_offset, \n");
424 source.append(
" unsigned int size, \n");
425 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array, \n");
426 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_Ap_in_r0) \n");
427 source.append(
"{ \n");
429 source.append(
" "); source.append(numeric_string); source.append(
" alpha = 0; \n");
// --- stage 1: each work-group reduces the two buffer segments itself to obtain alpha ---
// NOTE(review): the loads are indexed by get_local_id(0) only, so this presumably expects the two buffer
// segments to hold at least get_local_size(0) entries each — confirm against the host-side launch code.
432 source.append(
" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0)]; \n");
433 source.append(
" shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
434 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
435 source.append(
" { \n");
436 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
437 source.append(
" if (get_local_id(0) < stride) { \n");
438 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
439 source.append(
" shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride]; \n");
440 source.append(
" } ");
441 source.append(
" } ");
// barrier so that every work-item sees the final slot-0 values before computing alpha
444 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
445 source.append(
" alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");
// --- stage 2: s = r - alpha * Ap, accumulating the squared entries of s ---
447 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
448 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
// the value loaded from s[i] here is overwritten by the assignment that follows
449 source.append(
" "); source.append(numeric_string); source.append(
" value_s = s[i]; \n");
450 source.append(
" \n");
451 source.append(
" value_s = r[i] - alpha * Ap[i]; \n");
452 source.append(
" inner_prod_contrib += value_s * value_s; \n");
453 source.append(
" \n");
454 source.append(
" s[i] = value_s; \n");
455 source.append(
" } \n");
// barrier before shared_array is reused: its slot 0 was read above to compute alpha
456 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// --- stage 3: work-group reduction of the squared-entry sums; the stride is halved each pass ---
459 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
460 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
461 source.append(
" { \n");
462 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
463 source.append(
" if (get_local_id(0) < stride) \n");
464 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
465 source.append(
" } ");
// --- work-item 0 writes the group total at offset chunk_offset ---
468 source.append(
" if (get_local_id(0) == 0) \n ");
469 source.append(
" inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
471 source.append(
"} \n");
// Appends to `source` the OpenCL C text of the kernel `bicgstab_vector_update`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel, per element i (work-items start at get_global_id(0) and step by get_global_size(0)):
//   result[i]  += alpha * p[i] + omega * s[i];
//   residual[i] = s[i] - omega * As[i];
//   p[i]        = residual[i] (new) + beta * (p[i] (old) - omega * Ap[i]);
// and every work-item accumulates residual[i] (new) * r0star[i]. The sums are reduced per work-group through
// `shared_array` and work-item 0 writes the total to inner_prod_buffer[get_group_id(0)].
477 template<
typename StringT>
// --- kernel signature ---
480 source.append(
"__kernel void bicgstab_vector_update( \n");
481 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
482 source.append(
" "); source.append(numeric_string); source.append(
" alpha, \n");
483 source.append(
" __global "); source.append(numeric_string); source.append(
" * p, \n");
484 source.append(
" "); source.append(numeric_string); source.append(
" omega, \n");
485 source.append(
" __global "); source.append(numeric_string); source.append(
" const * s, \n");
486 source.append(
" __global "); source.append(numeric_string); source.append(
" * residual, \n");
487 source.append(
" __global "); source.append(numeric_string); source.append(
" const * As, \n");
488 source.append(
" "); source.append(numeric_string); source.append(
" beta, \n");
489 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
490 source.append(
" __global "); source.append(numeric_string); source.append(
" const * r0star, \n");
491 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
492 source.append(
" unsigned int size, \n");
493 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
494 source.append(
"{ \n");
// --- fused vector updates; all seven operands are read into locals first, then three vectors are written ---
495 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r_r0star = 0; \n");
496 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
497 source.append(
" "); source.append(numeric_string); source.append(
" value_result = result[i]; \n");
498 source.append(
" "); source.append(numeric_string); source.append(
" value_p = p[i]; \n");
499 source.append(
" "); source.append(numeric_string); source.append(
" value_s = s[i]; \n");
// the value loaded from residual[i] is replaced below before it is used
500 source.append(
" "); source.append(numeric_string); source.append(
" value_residual = residual[i]; \n");
501 source.append(
" "); source.append(numeric_string); source.append(
" value_As = As[i]; \n");
502 source.append(
" "); source.append(numeric_string); source.append(
" value_Ap = Ap[i]; \n");
503 source.append(
" "); source.append(numeric_string); source.append(
" value_r0star = r0star[i]; \n");
504 source.append(
" \n");
505 source.append(
" value_result += alpha * value_p + omega * value_s; \n");
506 source.append(
" value_residual = value_s - omega * value_As; \n");
507 source.append(
" value_p = value_residual + beta * (value_p - omega * value_Ap); \n");
508 source.append(
" \n");
509 source.append(
" result[i] = value_result; \n");
510 source.append(
" residual[i] = value_residual; \n");
511 source.append(
" p[i] = value_p; \n");
512 source.append(
" inner_prod_r_r0star += value_residual * value_r0star; \n");
513 source.append(
" } \n");
// --- work-group reduction; the stride is halved each pass ---
516 source.append(
" shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
517 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
518 source.append(
" { \n");
519 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
520 source.append(
" if (get_local_id(0) < stride) \n");
521 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
522 source.append(
" } ");
// --- work-item 0 publishes the group's partial sum ---
525 source.append(
" if (get_local_id(0) == 0) \n ");
526 source.append(
" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
528 source.append(
"} \n");
// Appends to `source` the OpenCL C text of the kernel `bicgstab_csr_prod`.
// `numeric_string` is spliced in wherever the kernel text needs its scalar type name.
//
// Emitted kernel: for each row (work-items start at get_global_id(0) and step by get_global_size(0)) it sums
// elements[i] * p[column_indices[i]] for i in [row_indices[row], row_indices[row+1]), stores the sum in Ap[row],
// and accumulates three per-work-item scalars: sum*sum, p[row]*sum and r0star[row]*sum. The three scalars are
// reduced per work-group through three local arrays; work-item 0 then writes them to
// inner_prod_buffer[buffer_size + group], inner_prod_buffer[2*buffer_size + group] and
// inner_prod_buffer[buffer_offset + group].
532 template<
typename StringT>
// --- kernel signature ---
535 source.append(
"__kernel void bicgstab_csr_prod( \n");
536 source.append(
" __global const unsigned int * row_indices, \n");
537 source.append(
" __global const unsigned int * column_indices, \n");
538 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
539 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
540 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
541 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
542 source.append(
" unsigned int size, \n");
543 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
544 source.append(
" unsigned int buffer_size, \n");
545 source.append(
" unsigned int buffer_offset, \n");
546 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
547 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
548 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
549 source.append(
"{ \n");
// --- row-wise product plus the three running inner-product contributions ---
550 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
551 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
552 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
553 source.append(
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
554 source.append(
" { \n");
555 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = ("); source.append(numeric_string); source.append(
")0; \n");
556 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
557 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
558 source.append(
" dot_prod += elements[i] * p[column_indices[i]]; \n");
559 source.append(
" Ap[row] = dot_prod; \n");
560 source.append(
" inner_prod_ApAp += dot_prod * dot_prod; \n");
561 source.append(
" inner_prod_pAp += p[row] * dot_prod; \n");
562 source.append(
" inner_prod_r0Ap += r0star[row] * dot_prod; \n");
563 source.append(
" } \n");
// --- work-group reduction of the three scalars; the stride is halved each pass ---
566 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
567 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
568 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
569 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
570 source.append(
" { \n");
571 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
572 source.append(
" if (get_local_id(0) < stride) { \n");
573 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
574 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
575 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
576 source.append(
" } ");
577 source.append(
" } ");
// --- work-item 0 writes the group totals at offsets buffer_size, 2*buffer_size and buffer_offset ---
580 source.append(
" if (get_local_id(0) == 0) { \n ");
581 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
582 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
583 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
584 source.append(
" } \n");
586 source.append(
"} \n \n");
// Appends the OpenCL source text of the kernel `bicgstab_coo_prod` to `source`.
// `numeric_string` is spliced into the text wherever the scalar type is named.
//
// What the emitted kernel does:
//  * forms Ap = A * p, where each entry of `coords` supplies a row (.x) and a
//    column (.y) and `elements` supplies the matching value,
//  * accumulates per work-item partial sums of Ap*Ap, p*Ap and r0star*Ap,
//  * reduces those three sums over the work-group in __local memory and writes
//    one value per work-group into `inner_prod_buffer`.
590 template<
typename StringT>
// --- kernel signature ---
593 source.append(
"__kernel void bicgstab_coo_prod( \n");
594 source.append(
" __global const uint2 * coords, \n");
595 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
596 source.append(
" __global const uint * group_boundaries, \n");
597 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
598 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
599 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
600 source.append(
" unsigned int size, \n");
601 source.append(
" __local unsigned int * shared_rows, \n");
602 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
603 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
604 source.append(
" unsigned int buffer_size, \n");
605 source.append(
" unsigned int buffer_offset, \n");
606 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
607 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
608 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
609 source.append(
"{ \n");
// Per-work-item partial sums for the three inner products.
610 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
611 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
612 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
615 source.append(
" uint2 tmp; \n");
616 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
// Each work-group handles the entries [group_start, group_end) taken from
// group_boundaries. k_end is the number of passes of width get_local_size(0)
// needed to cover that range (rounded up; 0 for an empty range).
617 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
618 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
619 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");
621 source.append(
" uint local_index = 0; \n");
623 source.append(
" for (uint k = 0; k < k_end; ++k) { \n");
624 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
// Work-items past group_end use coordinates (0,0) and a zero value instead of
// reading the arrays.
626 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
627 source.append(
" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
// Carry-over from the previous pass, handled by work-item 0 only: the last
// slot of the previous pass is either folded into this pass (same row) or
// written out as a finished row of Ap and added to the three partial sums.
630 source.append(
" if (get_local_id(0) == 0 && k > 0) { \n");
631 source.append(
" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
632 source.append(
" val += inter_results[get_local_size(0)-1]; \n");
633 source.append(
" else {\n");
634 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_size(0)-1]; \n");
635 source.append(
" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
636 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
637 source.append(
" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
638 source.append(
" inner_prod_r0Ap += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
639 source.append(
" } \n");
640 source.append(
" } \n");
// Publish this pass's row index and value to __local memory; the barrier in
// front keeps work-item 0's reads above ahead of these overwrites.
643 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
644 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
645 source.append(
" inter_results[get_local_id(0)] = val; \n");
646 source.append(
" "); source.append(numeric_string); source.append(
" left = 0; \n");
647 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// Row-wise running sum with doubling stride: each work-item adds the value
// `stride` slots to its left when that slot holds the same row index. The read
// and the write are separated by barriers.
649 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
650 source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
651 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
652 source.append(
" inter_results[get_local_id(0)] += left; \n");
653 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
654 source.append(
" } \n");
// A work-item whose right neighbour holds a different row index writes its
// running sum to Ap. The last work-item of the group is excluded here; its slot
// is picked up by the carry-over step of the next pass or by the block after
// the loop.
657 source.append(
" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
658 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
659 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
660 source.append(
" Ap[tmp.x] = Ap_entry; \n");
661 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
662 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
663 source.append(
" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
664 source.append(
" } \n");
666 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
667 source.append(
" } \n");
// After the last pass: the work-item that processed entry group_end - 1 writes
// the final row of this work-group's range.
669 source.append(
" if (local_index + 1 == group_end) {\n");
670 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
671 source.append(
" Ap[tmp.x] = Ap_entry; \n");
672 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
673 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
674 source.append(
" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
675 source.append(
" } \n");
// Work-group reduction of the three partial sums; the stride is halved each
// step with a barrier in front of every step.
678 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
679 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
680 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
681 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
682 source.append(
" { \n");
683 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
684 source.append(
" if (get_local_id(0) < stride) { \n");
685 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
686 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
687 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
// The next two closing braces are emitted without a trailing newline, so the
// kernel text that follows continues on the same line of generated source.
688 source.append(
" } ");
689 source.append(
" } ");
// Work-item 0 stores the group totals: Ap*Ap at buffer_size + group,
// p*Ap at 2*buffer_size + group, r0star*Ap at buffer_offset + group.
692 source.append(
" if (get_local_id(0) == 0) { \n ");
693 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
694 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
695 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
696 source.append(
" } \n");
698 source.append(
"} \n \n");
// Appends the OpenCL source text of the kernel `bicgstab_ell_prod` to `source`.
// `numeric_string` is spliced into the text wherever the scalar type is named.
//
// What the emitted kernel does:
//  * for every row, sums p[coords[offset]] * elements[offset] over
//    `items_per_row` slots and stores the result in Ap[row],
//  * accumulates per work-item partial sums of Ap*Ap, p*Ap and r0star*Ap,
//  * reduces those sums over the work-group in __local memory and writes one
//    value per work-group into `inner_prod_buffer`.
// The kernel parameter `aligned_items_per_row` is declared but never referenced
// in the kernel body.
703 template<
typename StringT>
// --- kernel signature ---
706 source.append(
"__kernel void bicgstab_ell_prod( \n");
707 source.append(
" __global const unsigned int * coords, \n");
708 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
709 source.append(
" unsigned int internal_row_num, \n");
710 source.append(
" unsigned int items_per_row, \n");
711 source.append(
" unsigned int aligned_items_per_row, \n");
712 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
713 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
714 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
715 source.append(
" unsigned int size, \n");
716 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
717 source.append(
" unsigned int buffer_size, \n");
718 source.append(
" unsigned int buffer_offset, \n");
719 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
720 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
721 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
722 source.append(
"{ \n");
// Per-work-item partial sums for the three inner products.
723 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
724 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
725 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
726 source.append(
" uint glb_id = get_global_id(0); \n");
727 source.append(
" uint glb_sz = get_global_size(0); \n");
// Rows are distributed over work-items: start at the global id and advance by
// the global size until `size` is reached.
729 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
730 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
// Slot item_id of a row is read at index row + item_id * internal_row_num.
// Slots whose stored value is zero contribute nothing and p is not read for them.
732 source.append(
" uint offset = row; \n");
733 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
734 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[offset]; \n");
735 source.append(
" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(
")0; \n");
736 source.append(
" } \n");
738 source.append(
" Ap[row] = sum; \n");
739 source.append(
" inner_prod_ApAp += sum * sum; \n");
740 source.append(
" inner_prod_pAp += p[row] * sum; \n");
741 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
742 source.append(
" } \n");
// Work-group reduction of the three partial sums; the stride is halved each
// step with a barrier in front of every step.
745 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
746 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
747 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
748 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
749 source.append(
" { \n");
750 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
751 source.append(
" if (get_local_id(0) < stride) { \n");
752 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
753 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
754 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
// The next two closing braces are emitted without a trailing newline, so the
// kernel text that follows continues on the same line of generated source.
755 source.append(
" } ");
756 source.append(
" } ");
// Work-item 0 stores the group totals: Ap*Ap at buffer_size + group,
// p*Ap at 2*buffer_size + group, r0star*Ap at buffer_offset + group.
759 source.append(
" if (get_local_id(0) == 0) { \n ");
760 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
761 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
762 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
763 source.append(
" } \n");
764 source.append(
"} \n \n");
// Appends the OpenCL source text of the kernel `bicgstab_sliced_ell_prod` to
// `source`. `numeric_string` is spliced in wherever the scalar type is named.
//
// What the emitted kernel does:
//  * walks the matrix in blocks of get_local_size(0) rows; work-item `local_id`
//    of a group handles row block_idx * local_size + local_id,
//  * sums p[column_indices[index]] * elements[index] over the block's
//    columns_per_block[block_idx] slots and stores the result in Ap[row],
//  * accumulates partial sums of Ap*Ap, p*Ap and r0star*Ap, reduces them over
//    the work-group and writes one value per group into `inner_prod_buffer`.
767 template<
typename StringT>
// --- kernel signature ---
770 source.append(
"__kernel void bicgstab_sliced_ell_prod( \n");
771 source.append(
" __global const unsigned int * columns_per_block, \n");
772 source.append(
" __global const unsigned int * column_indices, \n");
773 source.append(
" __global const unsigned int * block_start, \n");
774 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
775 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
776 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
777 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
778 source.append(
" unsigned int size, \n");
779 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
780 source.append(
" unsigned int buffer_size, \n");
781 source.append(
" unsigned int buffer_offset, \n");
782 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
783 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
784 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
785 source.append(
"{ \n");
// Per-work-item partial sums for the three inner products.
786 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
787 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
788 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
789 source.append(
" uint local_id = get_local_id(0); \n");
790 source.append(
" uint local_size = get_local_size(0); \n");
// Blocks are distributed over work-groups (start at the group id, advance by
// the number of groups).
// NOTE(review): the bound is `<= size / local_size`. When size is an exact
// multiple of local_size this also visits block index size / local_size, and
// block_start[] / columns_per_block[] are read for it before the `row < size`
// guard below. Confirm that both arrays have an entry at that index.
792 source.append(
" for (uint block_idx = get_group_id(0); block_idx <= size / local_size; block_idx += get_num_groups(0)) { \n");
793 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
795 source.append(
" uint row = block_idx * local_size + local_id; \n");
796 source.append(
" uint offset = block_start[block_idx]; \n");
797 source.append(
" uint num_columns = columns_per_block[block_idx]; \n");
// Slot item_id of this work-item's row is read at
// offset + item_id * local_size + local_id. Zero-valued slots contribute
// nothing and p is not read for them. These reads are not guarded by
// `row < size`.
798 source.append(
" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
799 source.append(
" uint index = offset + item_id * local_size + local_id; \n");
800 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[index]; \n");
801 source.append(
" sum += val ? (p[column_indices[index]] * val) : 0; \n");
802 source.append(
" } \n");
// Only rows inside the matrix are written and counted in the inner products.
804 source.append(
" if (row < size) {\n");
805 source.append(
" Ap[row] = sum; \n");
806 source.append(
" inner_prod_ApAp += sum * sum; \n");
807 source.append(
" inner_prod_pAp += p[row] * sum; \n");
808 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
809 source.append(
" } \n");
810 source.append(
" } \n");
// Work-group reduction of the three partial sums; the stride is halved each
// step with a barrier in front of every step.
813 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
814 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
815 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
816 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
817 source.append(
" { \n");
818 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
819 source.append(
" if (get_local_id(0) < stride) { \n");
820 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
821 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
822 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
// The next two closing braces are emitted without a trailing newline, so the
// kernel text that follows continues on the same line of generated source.
823 source.append(
" } ");
824 source.append(
" } ");
// Work-item 0 stores the group totals: Ap*Ap at buffer_size + group,
// p*Ap at 2*buffer_size + group, r0star*Ap at buffer_offset + group.
827 source.append(
" if (get_local_id(0) == 0) { \n ");
828 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
829 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
830 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
831 source.append(
" } \n");
832 source.append(
"} \n \n");
// Appends the OpenCL source text of the kernel `bicgstab_hyb_prod` to `source`.
// `numeric_string` is spliced into the text wherever the scalar type is named.
//
// What the emitted kernel does:
//  * for every row, adds two contributions into one sum: a fixed-width part
//    read from ell_coords / ell_elements and a variable-length part read from
//    csr_rows / csr_cols / csr_elements; the sum is stored in Ap[row],
//  * accumulates partial sums of Ap*Ap, p*Ap and r0star*Ap, reduces them over
//    the work-group and writes one value per group into `inner_prod_buffer`.
// The kernel parameter `aligned_items_per_row` is declared but never referenced
// in the kernel body.
// NOTE(review): ell_coords is declared `int` here while the other kernels in
// this file declare their index arrays as unsigned; confirm this is intended.
835 template<
typename StringT>
// --- kernel signature ---
838 source.append(
"__kernel void bicgstab_hyb_prod( \n");
839 source.append(
" const __global int* ell_coords, \n");
840 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
841 source.append(
" const __global uint* csr_rows, \n");
842 source.append(
" const __global uint* csr_cols, \n");
843 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
844 source.append(
" unsigned int internal_row_num, \n");
845 source.append(
" unsigned int items_per_row, \n");
846 source.append(
" unsigned int aligned_items_per_row, \n");
847 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
848 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
849 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
850 source.append(
" unsigned int size, \n");
851 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
852 source.append(
" unsigned int buffer_size, \n");
853 source.append(
" unsigned int buffer_offset, \n");
854 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
855 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
856 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
857 source.append(
"{ \n");
// Per-work-item partial sums for the three inner products.
858 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
859 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
860 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
861 source.append(
" uint glb_id = get_global_id(0); \n");
862 source.append(
" uint glb_sz = get_global_size(0); \n");
// Rows are distributed over work-items: start at the global id and advance by
// the global size until `size` is reached.
864 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
865 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
// Fixed-width part: slot item_id of a row is read at
// row + item_id * internal_row_num; zero-valued slots contribute nothing.
867 source.append(
" uint offset = row; \n");
868 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
869 source.append(
" "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
870 source.append(
" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
871 source.append(
" } \n");
// Variable-length part: entries csr_rows[row] .. csr_rows[row + 1] - 1.
873 source.append(
" uint col_begin = csr_rows[row]; \n");
874 source.append(
" uint col_end = csr_rows[row + 1]; \n");
876 source.append(
" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
877 source.append(
" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
878 source.append(
" } \n");
880 source.append(
" Ap[row] = sum; \n");
881 source.append(
" inner_prod_ApAp += sum * sum; \n");
882 source.append(
" inner_prod_pAp += p[row] * sum; \n");
883 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
884 source.append(
" } \n");
// Work-group reduction of the three partial sums; the stride is halved each
// step with a barrier in front of every step.
887 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
888 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
889 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
890 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
891 source.append(
" { \n");
892 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
893 source.append(
" if (get_local_id(0) < stride) { \n");
894 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
895 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
896 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
// The next two closing braces are emitted without a trailing newline, so the
// kernel text that follows continues on the same line of generated source.
897 source.append(
" } ");
898 source.append(
" } ");
// Work-item 0 stores the group totals: Ap*Ap at buffer_size + group,
// p*Ap at 2*buffer_size + group, r0star*Ap at buffer_offset + group.
901 source.append(
" if (get_local_id(0) == 0) { \n ");
902 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
903 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
904 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
905 source.append(
" } \n");
906 source.append(
"} \n \n");
// Appends the OpenCL source text of the kernel `gmres_gram_schmidt_1` to
// `source`. `numeric_string` is spliced in wherever the scalar type is named.
//
// What the emitted kernel does: for vector k of `krylov_basis` (stored at
// offset k * internal_size) it forms the dot products with vectors 0 .. k-1,
// at most 7 vectors per pass of the outer `while` loop. Each work-group reduces
// its partial dot products in `shared_array` (slice j starts at j * chunk_size)
// and work-item 0 writes them to
// vi_in_vk_buffer[group + (k_base + j) * chunk_size].
912 template <
typename StringType>
// --- kernel signature ---
915 source.append(
"__kernel void gmres_gram_schmidt_1( \n");
916 source.append(
" __global "); source.append(numeric_string); source.append(
" const * krylov_basis, \n");
917 source.append(
" unsigned int size, \n");
918 source.append(
" unsigned int internal_size, \n");
919 source.append(
" unsigned int k, \n");
920 source.append(
" __global "); source.append(numeric_string); source.append(
" * vi_in_vk_buffer, \n");
921 source.append(
" unsigned int chunk_size, \n");
922 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
923 source.append(
"{ \n");
// Private accumulators for up to 7 dot products per pass.
925 source.append(
" "); source.append(numeric_string); source.append(
" vi_in_vk[7]; \n");
926 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = 0; \n");
// Outer loop: vectors k_base .. k_base + vecs_in_iteration - 1 are handled in
// each pass, with vecs_in_iteration capped at 7 to match vi_in_vk[7].
928 source.append(
" unsigned int k_base = 0; \n");
929 source.append(
" while (k_base < k) { \n");
930 source.append(
" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
931 source.append(
" vi_in_vk[0] = 0;\n");
932 source.append(
" vi_in_vk[1] = 0;\n");
933 source.append(
" vi_in_vk[2] = 0;\n");
934 source.append(
" vi_in_vk[3] = 0;\n");
935 source.append(
" vi_in_vk[4] = 0;\n");
936 source.append(
" vi_in_vk[5] = 0;\n");
937 source.append(
" vi_in_vk[6] = 0;\n");
// Each element of vector k is loaded once and multiplied with the matching
// element of every vector in the current batch.
938 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
939 source.append(
" value_vk = krylov_basis[i + k * internal_size]; \n");
940 source.append(
" \n");
941 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
942 source.append(
" vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
943 source.append(
" } \n");
// Work-group reduction, one slice of shared_array per vector in the batch.
// NOTE(review): the indexing get_local_id(0) + j*chunk_size + stride stays
// inside slice j only if chunk_size >= get_local_size(0) - confirm with caller.
// NOTE(review): there is no barrier between the last reduction step and the
// stores to shared_array at the top of the next `while` pass, whereas
// gmres_gram_schmidt_2 issues a barrier at the end of each pass. Confirm
// whether one is needed here when k > 7.
946 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
947 source.append(
" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
948 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
949 source.append(
" { \n");
950 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
951 source.append(
" if (get_local_id(0) < stride) { \n");
952 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
953 source.append(
" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
// The next two closing braces are emitted without a trailing newline.
954 source.append(
" } ");
955 source.append(
" } ");
// Work-item 0 writes this group's partial dot products. The store statement is
// emitted without a trailing newline, so `k_base += ...` follows on the same
// line of generated source; it is a separate statement outside the `for`.
958 source.append(
" if (get_local_id(0) == 0) \n ");
959 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
960 source.append(
" vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
962 source.append(
" k_base += vecs_in_iteration; \n");
963 source.append(
" } \n");
965 source.append(
"} \n");
// Appends the OpenCL source text of the kernel `gmres_gram_schmidt_2` to
// `source`. `numeric_string` is spliced in wherever the scalar type is named.
//
// What the emitted kernel does, in passes of at most 7 vectors:
//  * every work-group sums get_local_size(0) entries of vi_in_vk_buffer for
//    each vector in the batch (slice j of shared_array starts at j*chunk_size),
//  * subtracts (sum_j * vector k_base+j) from vector k of krylov_basis in place,
//  * work-group 0 stores the sums into R_buffer[(k_base + j) + k*krylov_dim].
// After the last pass the squared entries of the updated vector k are reduced
// per work-group and written to inner_prod_buffer[chunk_size + group].
969 template <
typename StringType>
// --- kernel signature ---
972 source.append(
"__kernel void gmres_gram_schmidt_2( \n");
973 source.append(
" __global "); source.append(numeric_string); source.append(
" * krylov_basis, \n");
974 source.append(
" unsigned int size, \n");
975 source.append(
" unsigned int internal_size, \n");
976 source.append(
" unsigned int k, \n");
977 source.append(
" __global "); source.append(numeric_string); source.append(
" const * vi_in_vk_buffer, \n");
978 source.append(
" unsigned int chunk_size, \n");
979 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_buffer, \n");
980 source.append(
" unsigned int krylov_dim, \n");
981 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
982 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
983 source.append(
"{ \n");
985 source.append(
" "); source.append(numeric_string); source.append(
" vk_dot_vk = 0; \n");
986 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = 0; \n");
// Outer loop: vectors k_base .. k_base + vecs_in_iteration - 1 per pass,
// capped at 7.
988 source.append(
" unsigned int k_base = 0; \n");
989 source.append(
" while (k_base < k) { \n");
990 source.append(
" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
// Load one entry per work-item and vector from vi_in_vk_buffer, then reduce
// over the work-group; slot j*chunk_size holds the total for vector j.
// NOTE(review): presumably these are the per-group partial results written by
// gmres_gram_schmidt_1; only the first get_local_size(0) entries of each chunk
// are summed - confirm the launch configuration guarantees that is all of them.
993 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
994 source.append(
" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
995 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
996 source.append(
" { \n");
997 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
998 source.append(
" if (get_local_id(0) < stride) { \n");
999 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1000 source.append(
" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
// The next two closing braces are emitted without a trailing newline.
1001 source.append(
" } ");
1002 source.append(
" } ");
// Barrier so that every work-item sees the finished totals before reading them.
1003 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// Subtract the batch's contributions from vector k. The vector is re-read and
// re-written in every pass; vk_dot_vk is only accumulated in the final pass
// (k_base + vecs_in_iteration == k), when the entry has its final value.
1006 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1007 source.append(
" value_vk = krylov_basis[i + k * internal_size]; \n");
1008 source.append(
" \n");
1009 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1010 source.append(
" value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
1011 source.append(
" vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0; \n");
1012 source.append(
" krylov_basis[i + k * internal_size] = value_vk; \n");
1013 source.append(
" } \n");
// All work-items of work-group 0 store the totals into R_buffer (there is no
// local-id guard). The store is emitted without a trailing newline; the barrier
// that follows is a separate statement executed by every work-item and keeps
// shared_array intact until all reads of this pass are done.
1016 source.append(
" if (get_group_id(0) == 0) \n");
1017 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1018 source.append(
" R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
1019 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1021 source.append(
" k_base += vecs_in_iteration; \n");
1022 source.append(
" } \n");
// Work-group reduction of vk_dot_vk.
1025 source.append(
" shared_array[get_local_id(0)] = vk_dot_vk; \n");
1026 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1027 source.append(
" { \n");
1028 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1029 source.append(
" if (get_local_id(0) < stride) \n");
1030 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1031 source.append(
" } ");
// Work-item 0 writes the group's partial result.
1034 source.append(
" if (get_local_id(0) == 0) \n ");
1035 source.append(
" inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
1037 source.append(
"} \n");
// Appends the OpenCL source text of the kernel `gmres_normalize_vk` to `source`.
// `numeric_string` is spliced into the text wherever the scalar type is named.
//
// What the emitted kernel does:
//  * every work-group sums inner_prod_buffer[chunk_size + local_id] over its
//    work-items and takes the square root of the total as norm_vk,
//  * divides vk[i + vk_offset] by norm_vk in place for i < size,
//  * accumulates residual[i] * (scaled vk entry), reduces it per work-group and
//    writes the result to r_dot_vk_buffer[group + chunk_offset],
//  * the work-item with global id 0 stores norm_vk in R_buffer[R_offset].
// NOTE(review): norm_vk is used as a divisor without a zero check.
1040 template <
typename StringType>
// --- kernel signature ---
1043 source.append(
"__kernel void gmres_normalize_vk( \n");
1044 source.append(
" __global "); source.append(numeric_string); source.append(
" * vk, \n");
1045 source.append(
" unsigned int vk_offset, \n");
1046 source.append(
" __global "); source.append(numeric_string); source.append(
" const * residual, \n");
1047 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_buffer, \n");
1048 source.append(
" unsigned int R_offset, \n");
1049 source.append(
" __global "); source.append(numeric_string); source.append(
" const * inner_prod_buffer, \n");
1050 source.append(
" unsigned int chunk_size, \n");
1051 source.append(
" __global "); source.append(numeric_string); source.append(
" * r_dot_vk_buffer, \n");
1052 source.append(
" unsigned int chunk_offset, \n");
1053 source.append(
" unsigned int size, \n");
1054 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
1055 source.append(
"{ \n");
1057 source.append(
" "); source.append(numeric_string); source.append(
" norm_vk = 0; \n");
// First reduction: sum the entries of inner_prod_buffer starting at chunk_size,
// one entry per work-item. Every work-group repeats this same reduction.
1060 source.append(
" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
1061 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1062 source.append(
" { \n");
1063 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1064 source.append(
" if (get_local_id(0) < stride) \n");
1065 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1066 source.append(
" } ");
// Barrier so that every work-item reads the finished total from slot 0.
1069 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1070 source.append(
" norm_vk = sqrt(shared_array[0]); \n");
// Scale vk in place and accumulate the dot product of residual with the
// scaled vector.
1072 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
1073 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1074 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = vk[i + vk_offset] / norm_vk; \n");
1075 source.append(
" \n");
1076 source.append(
" inner_prod_contrib += residual[i] * value_vk; \n");
1077 source.append(
" \n");
1078 source.append(
" vk[i + vk_offset] = value_vk; \n");
1079 source.append(
" } \n");
// Barrier before shared_array is overwritten for the second reduction.
1080 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// Second reduction: per-work-group sum of inner_prod_contrib.
1083 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
1084 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1085 source.append(
" { \n");
1086 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1087 source.append(
" if (get_local_id(0) < stride) \n");
1088 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1089 source.append(
" } ");
// Two independent guarded stores follow. The first is emitted without a
// trailing newline, so the second `if` starts on the same line of generated
// source.
1092 source.append(
" if (get_local_id(0) == 0) \n ");
1093 source.append(
" r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
1094 source.append(
" if (get_global_id(0) == 0) \n ");
1095 source.append(
" R_buffer[R_offset] = norm_vk; \n");
1097 source.append(
"} \n");
// Appends the OpenCL source text of the kernel `gmres_update_result` to
// `source`. `numeric_string` is spliced in wherever the scalar type is named.
//
// What the emitted kernel does, for every i < size:
//   result[i] += coefficients[0] * residual[i]
//              + sum_{j=1}^{k-1} coefficients[j] * krylov_basis[i + (j-1)*internal_size]
// The kernel uses no __local memory and no barriers.
1101 template <
typename StringType>
// --- kernel signature ---
1104 source.append(
"__kernel void gmres_update_result( \n");
1105 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
1106 source.append(
" __global "); source.append(numeric_string); source.append(
" const * residual, \n");
1107 source.append(
" __global "); source.append(numeric_string); source.append(
" const * krylov_basis, \n");
1108 source.append(
" unsigned int size, \n");
1109 source.append(
" unsigned int internal_size, \n");
1110 source.append(
" __global "); source.append(numeric_string); source.append(
" const * coefficients, \n");
1111 source.append(
" unsigned int k) \n");
1112 source.append(
"{ \n");
// Entries are distributed over work-items: start at the global id and advance
// by the global size.
1114 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
// coefficients[0] scales the residual; coefficients[j] for j >= 1 scales the
// basis vector stored at offset (j-1) * internal_size.
1115 source.append(
" "); source.append(numeric_string); source.append(
" value_result = result[i] + coefficients[0] * residual[i]; \n");
1116 source.append(
" \n");
1117 source.append(
" for (unsigned int j = 1; j < k; ++j) \n");
1118 source.append(
" value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
1119 source.append(
" \n");
1120 source.append(
" result[i] = value_result; \n");
1121 source.append(
" } \n");
1123 source.append(
"} \n");
// Appends the OpenCL source text of the kernel `gmres_csr_prod` to `source`.
// `numeric_string` is spliced into the text wherever the scalar type is named.
//
// The emitted kernel is a thin wrapper: it calls `cg_csr_prod` with `p` advanced
// by offset_p and `Ap` advanced by offset_Ap, passing all other arguments
// through unchanged.
// NOTE(review): `cg_csr_prod` is not emitted by this block; it presumably has to
// be present earlier in the same OpenCL program text - verify at the point
// where the program is assembled.
1127 template <
typename StringType>
// --- kernel signature ---
1130 source.append(
"__kernel void gmres_csr_prod( \n");
1131 source.append(
" __global const unsigned int * row_indices, \n");
1132 source.append(
" __global const unsigned int * column_indices, \n");
1133 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1134 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1135 source.append(
" unsigned int offset_p, \n");
1136 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1137 source.append(
" unsigned int offset_Ap, \n");
1138 source.append(
" unsigned int size, \n");
1139 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1140 source.append(
" unsigned int buffer_size, \n");
1141 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1142 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1143 source.append(
"{ \n");
// Forward to cg_csr_prod with the two vector pointers offset.
1144 source.append(
" cg_csr_prod(row_indices, column_indices, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1145 source.append(
"} \n \n");
// Appends the OpenCL source text of the kernel `gmres_coo_prod` to `source`.
// `numeric_string` is spliced into the text wherever the scalar type is named.
//
// The emitted kernel is a thin wrapper: it calls `cg_coo_prod` with `p` advanced
// by offset_p and `Ap` advanced by offset_Ap, passing all other arguments
// through unchanged.
// NOTE(review): `cg_coo_prod` is not emitted by this block; it presumably has to
// be present earlier in the same OpenCL program text - verify at the point
// where the program is assembled.
1149 template <
typename StringType>
// --- kernel signature ---
1152 source.append(
"__kernel void gmres_coo_prod( \n");
1153 source.append(
" __global const uint2 * coords, \n");
1154 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1155 source.append(
" __global const uint * group_boundaries, \n");
1156 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1157 source.append(
" unsigned int offset_p, \n");
1158 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1159 source.append(
" unsigned int offset_Ap, \n");
1160 source.append(
" unsigned int size, \n");
1161 source.append(
" __local unsigned int * shared_rows, \n");
1162 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
1163 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1164 source.append(
" unsigned int buffer_size, \n");
1165 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1166 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1167 source.append(
"{ \n");
// Forward to cg_coo_prod with the two vector pointers offset.
1168 source.append(
" cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1169 source.append(
"} \n \n");
// Appends the OpenCL source text of the kernel `gmres_ell_prod` to `source`.
// `numeric_string` is spliced into the text wherever the scalar type is named.
//
// The emitted kernel is a thin wrapper: it calls `cg_ell_prod` with `p` advanced
// by offset_p and `Ap` advanced by offset_Ap, passing all other arguments
// (including aligned_items_per_row) through unchanged.
// NOTE(review): `cg_ell_prod` is not emitted by this block; it presumably has to
// be present earlier in the same OpenCL program text - verify at the point
// where the program is assembled.
1174 template <
typename StringType>
// --- kernel signature ---
1177 source.append(
"__kernel void gmres_ell_prod( \n");
1178 source.append(
" __global const unsigned int * coords, \n");
1179 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1180 source.append(
" unsigned int internal_row_num, \n");
1181 source.append(
" unsigned int items_per_row, \n");
1182 source.append(
" unsigned int aligned_items_per_row, \n");
1183 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1184 source.append(
" unsigned int offset_p, \n");
1185 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1186 source.append(
" unsigned int offset_Ap, \n");
1187 source.append(
" unsigned int size, \n");
1188 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1189 source.append(
" unsigned int buffer_size, \n");
1190 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1191 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1192 source.append(
"{ \n");
// Forward to cg_ell_prod with the two vector pointers offset.
1193 source.append(
" cg_ell_prod(coords, elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1194 source.append(
"} \n \n");
// Appends to `source` the OpenCL text of the kernel `gmres_sliced_ell_prod`.
// Every value-typed kernel parameter is declared with the type name passed in `numeric_string`.
// NOTE(review): the C++ function signature and its braces are not present in this listing;
// the index at the end of the listing names
// generate_sliced_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string),
// which is presumably this function - confirm against the original header.
1197 template <
typename StringType>
// Kernel parameter list: __global inputs (columns_per_block, column_indices, block_start,
// elements, p), the __global output Ap with element offsets for p and Ap, and __local scratch buffers.
1200 source.append(
"__kernel void gmres_sliced_ell_prod( \n");
1201 source.append(
" __global const unsigned int * columns_per_block, \n");
1202 source.append(
" __global const unsigned int * column_indices, \n");
1203 source.append(
" __global const unsigned int * block_start, \n");
1204 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1205 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1206 source.append(
" unsigned int offset_p, \n");
1207 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1208 source.append(
" unsigned int offset_Ap, \n");
1209 source.append(
" unsigned int size, \n");
1210 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1211 source.append(
" unsigned int buffer_size, \n");
1212 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1213 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1214 source.append(
"{ \n");
// The kernel body is a single forwarding call: p and Ap are advanced by offset_p / offset_Ap,
// all other arguments are passed through unchanged.
// cg_sliced_ell_prod is not defined in this excerpt; presumably it is emitted into the same
// program source before this kernel - TODO confirm.
1215 source.append(
" cg_sliced_ell_prod(columns_per_block, column_indices, block_start, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1216 source.append(
"} \n \n");
// Appends to `source` the OpenCL text of the kernel `gmres_hyb_prod`.
// Every value-typed kernel parameter is declared with the type name passed in `numeric_string`.
// NOTE(review): the C++ function signature and its braces are not present in this listing;
// the index at the end of the listing names
// generate_hyb_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string),
// which is presumably this function - confirm against the original header.
1219 template <
typename StringType>
// Kernel parameter list: __global inputs split into an "ell_" pair and a "csr_" triple, three
// unsigned int layout parameters, the input p, the __global output Ap with element offsets for
// p and Ap, and __local scratch buffers.
// NOTE(review): ell_coords is declared as `int*` here, whereas the gmres_ell_prod kernel in this
// file declares its coords parameter as `unsigned int *` - confirm the difference is intentional.
1222 source.append(
"__kernel void gmres_hyb_prod( \n");
1223 source.append(
" const __global int* ell_coords, \n");
1224 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
1225 source.append(
" const __global uint* csr_rows, \n");
1226 source.append(
" const __global uint* csr_cols, \n");
1227 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
1228 source.append(
" unsigned int internal_row_num, \n");
1229 source.append(
" unsigned int items_per_row, \n");
1230 source.append(
" unsigned int aligned_items_per_row, \n");
1231 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1232 source.append(
" unsigned int offset_p, \n");
1233 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1234 source.append(
" unsigned int offset_Ap, \n");
1235 source.append(
" unsigned int size, \n");
1236 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1237 source.append(
" unsigned int buffer_size, \n");
1238 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1239 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1240 source.append(
"{ \n");
// The kernel body is a single forwarding call: p and Ap are advanced by offset_p / offset_Ap,
// all other arguments are passed through unchanged.
// cg_hyb_prod is not defined in this excerpt; presumably it is emitted into the same program
// source before this kernel - TODO confirm.
1241 source.append(
" cg_hyb_prod(ell_coords, ell_elements, csr_rows, csr_cols, csr_elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1242 source.append(
"} \n \n");
// Fragment of the per-context program setup, templated on the numeric type NumericT.
// NOTE(review): most of the original body is missing from this listing (the enclosing
// declaration, the calls that fill `source`, the definition of `prog_name`, and the `#endif`
// that closes the block below); the index at the end of the listing names
// `static void init(viennacl::ocl::context &ctx)`, which is presumably the enclosing function -
// confirm against the original header.
1252 template<
typename NumericT>
// Per-cl_context flag; set to true at the end of this fragment, after the program is added.
1262 static std::map<cl_context, bool> init_done;
// Reserve capacity up front before the kernel source text is appended.
1269 source.reserve(1024);
// Presumably prepends the double-precision extension pragma when NumericT requires it -
// the helper is defined elsewhere; TODO confirm.
1271 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
// Optional build trace, compiled in only when VIENNACL_BUILD_INFO is defined.
1299 #ifdef VIENNACL_BUILD_INFO
1300 std::cout <<
"Creating program " << prog_name << std::endl;
// Hand the assembled source to the context under the name prog_name, then record that this
// context handle has been processed.
1302 ctx.add_program(source, prog_name);
1303 init_done[ctx.handle().get()] =
true;
Main kernel class for generating specialized OpenCL kernels for fast iterative solvers.
void generate_sliced_ell_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
Some helper routines for reading/writing/printing scheduler expressions.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
static std::string program_name()
void generate_ell_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
void generate_hyb_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_gmres_normalize_vk(StringType &source, std::string const &numeric_string)
void generate_pipelined_gmres_gram_schmidt_stage2(StringType &source, std::string const &numeric_string)
void generate_compressed_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
static void apply(viennacl::ocl::context const &)
void generate_ell_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
const OCL_TYPE & get() const
void generate_pipelined_gmres_gram_schmidt_stage1(StringType &source, std::string const &numeric_string)
void generate_compressed_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_bicgstab_vector_update(StringT &source, std::string const &numeric_string)
void generate_coordinate_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_coordinate_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
Proxy classes for vectors.
void generate_hyb_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
void generate_hyb_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_bicgstab_update_s(StringT &source, std::string const &numeric_string)
void generate_pipelined_cg_vector_update(StringT &source, std::string const &numeric_string)
void generate_pipelined_gmres_update_result(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
void generate_compressed_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_sliced_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)