ViennaCL - The Vienna Computing Library 1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
iterative.hpp
#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP

#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/utils.hpp"

namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{
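/** @file iterative.hpp
 *  @brief OpenCL kernel sources for pipelined iterative solvers (CG, BiCGStab, GMRES).
 *
 *  Each generate_*() routine below appends the OpenCL C source of one kernel to a string;
 *  numeric_string is spliced in as the scalar type, e.g. "float" or "double". Compiling the
 *  assembled source is left to the caller. A minimal usage sketch (illustrative only; the
 *  program name is arbitrary and the calls assume the usual viennacl::ocl context API):
 *
 *    std::string src;
 *    generate_pipelined_cg_vector_update(src, "float");
 *    viennacl::ocl::context & ctx = viennacl::ocl::current_context();
 *    ctx.add_program(src, "pipelined_cg_kernels");
 *    viennacl::ocl::kernel & k = ctx.get_program("pipelined_cg_kernels").get_kernel("cg_vector_update");
 */

/** @brief Generates the fused vector update of the pipelined CG method:
 *         result += alpha * p; r -= alpha * Ap; p = r + beta * p.
 *         Each work group also writes its partial sum of <r, r> to inner_prod_buffer. */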
template<typename StringT>
void generate_pipelined_cg_vector_update(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_vector_update( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * r, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_r = r[i]; \n");
  source.append(" \n");
  source.append(" result[i] += alpha * value_p; \n");
  source.append(" value_r -= alpha * Ap[i]; \n");
  source.append(" value_p = value_r + beta * value_p; \n");
  source.append(" \n");
  source.append(" p[i] = value_p; \n");
  source.append(" r[i] = value_r; \n");
  source.append(" inner_prod_contrib += value_r * value_r; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");

  source.append("} \n");
}
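/** @brief Generates the pipelined CG product kernel Ap = A * p for a sparse matrix in CSR format,
 *         fused with per-work-group partial sums of <Ap, Ap> and <p, Ap>, which are written to
 *         inner_prod_buffer at offsets buffer_size and 2 * buffer_size, respectively. */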
template<typename StringT>
void generate_compressed_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_csr_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" dot_prod += elements[i] * p[column_indices[i]]; \n");
  source.append(" Ap[row] = dot_prod; \n");
  source.append(" inner_prod_ApAp += dot_prod * dot_prod; \n");
  source.append(" inner_prod_pAp += p[row] * dot_prod; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}
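/** @brief Same as the CSR kernel above, but for a sparse matrix in COO (coordinate) format:
 *         entries are processed in work-group-sized chunks and partial row sums are combined
 *         via a segmented parallel reduction, with carries across chunk boundaries. */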
template<typename StringT>
void generate_coordinate_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_coo_prod( \n");
  source.append(" __global const uint2 * coords, \n"); //(row_index, column_index)
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const uint * group_boundaries, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local unsigned int * shared_rows, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");

  source.append(" uint2 tmp; \n");
  source.append(" "); source.append(numeric_string); source.append(" val; \n");
  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

  source.append(" uint local_index = 0; \n");

  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append(" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");

  //check for carry from previous loop run:
  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
  source.append(" val += inter_results[get_local_size(0)-1]; \n");
  source.append(" else {\n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
  source.append(" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" } \n");
  source.append(" } \n");

  //segmented parallel reduction begin
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append(" inter_results[get_local_id(0)] = val; \n");
  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  //segmented parallel reduction end

  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n"); //for k

  source.append(" if (local_index + 1 == group_end) {\n"); //write results of last active entry (this may not necessarily be the case already)
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}
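/** @brief Same fused product and partial inner products for the ELL format, where each row
 *         stores at most items_per_row entries, laid out with stride internal_row_num. */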
template<typename StringT>
void generate_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_ell_prod( \n");
  source.append(" __global const unsigned int * coords, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
  source.append(" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
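/** @brief Same fused product and partial inner products for the sliced ELL format, where rows
 *         are grouped into blocks of the work group size, each with its own column count. */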
template<typename StringT>
void generate_sliced_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_sliced_ell_prod( \n");
  source.append(" __global const unsigned int * columns_per_block, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * block_start, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint local_id = get_local_id(0); \n");
  source.append(" uint local_size = get_local_size(0); \n");

  source.append(" for (uint block_idx = get_group_id(0); block_idx <= size / local_size; block_idx += get_num_groups(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint row = block_idx * local_size + local_id; \n");
  source.append(" uint offset = block_start[block_idx]; \n");
  source.append(" uint num_columns = columns_per_block[block_idx]; \n");
  source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
  source.append(" uint index = offset + item_id * local_size + local_id; \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
  source.append(" sum += val ? (p[column_indices[index]] * val) : 0; \n");
  source.append(" } \n");

  source.append(" if (row < size) {\n");
  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
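/** @brief Same fused product and partial inner products for the HYB format: an ELL part for
 *         the regular entries plus a CSR part for the per-row remainders. */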
template<typename StringT>
void generate_hyb_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_hyb_prod( \n");
  source.append(" const __global int* ell_coords, \n");
  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
  source.append(" const __global uint* csr_rows, \n");
  source.append(" const __global uint* csr_cols, \n");
  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
  source.append(" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
  source.append(" } \n");

  source.append(" uint col_begin = csr_rows[row]; \n");
  source.append(" uint col_end = csr_rows[row + 1]; \n");

  source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
  source.append(" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
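/////////////////////////////////////////// BiCGStab ///////////////////////////////////////////

/** @brief Generates the BiCGStab kernel computing s = r - alpha * Ap. The work group first
 *         reduces the partial sums in inner_prod_buffer to obtain alpha = <r, r0*> / <Ap, r0*>,
 *         then writes its partial sum of <s, s> to inner_prod_buffer at chunk_offset. */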
template<typename StringT>
void generate_pipelined_bicgstab_update_s(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_update_s( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * s, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * r, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int chunk_size, \n");
  source.append(" unsigned int chunk_offset, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_Ap_in_r0) \n");
  source.append("{ \n");

  source.append(" "); source.append(numeric_string); source.append(" alpha = 0; \n");

  // parallel reduction in work group to compute <r, r0> / <Ap, r0>
  source.append(" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0)]; \n");
  source.append(" shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // compute alpha from reduced values:
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");

  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
  source.append(" \n");
  source.append(" value_s = r[i] - alpha * Ap[i]; \n");
  source.append(" inner_prod_contrib += value_s * value_s; \n");
  source.append(" \n");
  source.append(" s[i] = value_s; \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");

  source.append("} \n");

}
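/** @brief Generates the fused BiCGStab vector update:
 *         result += alpha * p + omega * s; r = s - omega * As; p = r + beta * (p - omega * Ap).
 *         Each work group writes its partial sum of <r, r0*> to inner_prod_buffer. */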
template<typename StringT>
void generate_pipelined_bicgstab_vector_update(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_vector_update( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" "); source.append(numeric_string); source.append(" omega, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * s, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * residual, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * As, \n");
  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * r0star, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r_r0star = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_result = result[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_residual = residual[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_As = As[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_Ap = Ap[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_r0star = r0star[i]; \n");
  source.append(" \n");
  source.append(" value_result += alpha * value_p + omega * value_s; \n");
  source.append(" value_residual = value_s - omega * value_As; \n");
  source.append(" value_p = value_residual + beta * (value_p - omega * value_Ap); \n");
  source.append(" \n");
  source.append(" result[i] = value_result; \n");
  source.append(" residual[i] = value_residual; \n");
  source.append(" p[i] = value_p; \n");
  source.append(" inner_prod_r_r0star += value_residual * value_r0star; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");

  source.append("} \n");
}
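/** @brief BiCGStab variant of the CSR product kernel: computes Ap = A * p together with partial
 *         sums of <Ap, Ap>, <p, Ap>, and <r0*, Ap>; the latter is written at buffer_offset. */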
template<typename StringT>
void generate_compressed_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_csr_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
  source.append(" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" dot_prod += elements[i] * p[column_indices[i]]; \n");
  source.append(" Ap[row] = dot_prod; \n");
  source.append(" inner_prod_ApAp += dot_prod * dot_prod; \n");
  source.append(" inner_prod_pAp += p[row] * dot_prod; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * dot_prod; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}
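/** @brief BiCGStab variant of the COO product kernel (segmented parallel reduction as in the
 *         CG version), additionally accumulating <r0*, Ap>. */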
template<typename StringT>
void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_coo_prod( \n");
  source.append(" __global const uint2 * coords, \n"); //(row_index, column_index)
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const uint * group_boundaries, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local unsigned int * shared_rows, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");

  source.append(" uint2 tmp; \n");
  source.append(" "); source.append(numeric_string); source.append(" val; \n");
  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

  source.append(" uint local_index = 0; \n");

  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append(" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");

  //check for carry from previous loop run:
  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
  source.append(" val += inter_results[get_local_size(0)-1]; \n");
  source.append(" else {\n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
  source.append(" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" } \n");
  source.append(" } \n");

  //segmented parallel reduction begin
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append(" inter_results[get_local_id(0)] = val; \n");
  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  //segmented parallel reduction end

  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n"); //for k

  source.append(" if (local_index + 1 == group_end) {\n"); //write results of last active entry (this may not necessarily be the case already)
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}
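/** @brief BiCGStab variant of the ELL product kernel, additionally accumulating <r0*, Ap>. */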
template<typename StringT>
void generate_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_ell_prod( \n");
  source.append(" __global const unsigned int * coords, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
  source.append(" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
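/** @brief BiCGStab variant of the sliced ELL product kernel, additionally accumulating <r0*, Ap>. */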
template<typename StringT>
void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_sliced_ell_prod( \n");
  source.append(" __global const unsigned int * columns_per_block, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * block_start, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
  source.append(" uint local_id = get_local_id(0); \n");
  source.append(" uint local_size = get_local_size(0); \n");

  source.append(" for (uint block_idx = get_group_id(0); block_idx <= size / local_size; block_idx += get_num_groups(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint row = block_idx * local_size + local_id; \n");
  source.append(" uint offset = block_start[block_idx]; \n");
  source.append(" uint num_columns = columns_per_block[block_idx]; \n");
  source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
  source.append(" uint index = offset + item_id * local_size + local_id; \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
  source.append(" sum += val ? (p[column_indices[index]] * val) : 0; \n");
  source.append(" } \n");

  source.append(" if (row < size) {\n");
  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
  source.append(" } \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
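/** @brief BiCGStab variant of the HYB product kernel, additionally accumulating <r0*, Ap>. */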
template<typename StringT>
void generate_hyb_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_hyb_prod( \n");
  source.append(" const __global int* ell_coords, \n");
  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
  source.append(" const __global uint* csr_rows, \n");
  source.append(" const __global uint* csr_cols, \n");
  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
  source.append(" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
  source.append(" } \n");

  source.append(" uint col_begin = csr_rows[row]; \n");
  source.append(" uint col_end = csr_rows[row + 1]; \n");

  source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
  source.append(" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
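//////////////////////////////////////////// GMRES ////////////////////////////////////////////

/** @brief Generates the first Gram-Schmidt stage of pipelined GMRES: computes the inner products
 *         <v_i, v_k> of the current Krylov vector v_k with up to seven previous basis vectors
 *         per pass, writing per-work-group partial sums to vi_in_vk_buffer. */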
912 template <typename StringType>
913 void generate_pipelined_gmres_gram_schmidt_stage1(StringType & source, std::string const & numeric_string)
914 {
915  source.append("__kernel void gmres_gram_schmidt_1( \n");
916  source.append(" __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
917  source.append(" unsigned int size, \n");
918  source.append(" unsigned int internal_size, \n");
919  source.append(" unsigned int k, \n");
920  source.append(" __global "); source.append(numeric_string); source.append(" * vi_in_vk_buffer, \n");
921  source.append(" unsigned int chunk_size, \n");
922  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
923  source.append("{ \n");
924 
925  source.append(" "); source.append(numeric_string); source.append(" vi_in_vk[7]; \n");
926  source.append(" "); source.append(numeric_string); source.append(" value_vk = 0; \n");
927 
928  source.append(" unsigned int k_base = 0; \n");
929  source.append(" while (k_base < k) { \n");
930  source.append(" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
931  source.append(" vi_in_vk[0] = 0;\n");
932  source.append(" vi_in_vk[1] = 0;\n");
933  source.append(" vi_in_vk[2] = 0;\n");
934  source.append(" vi_in_vk[3] = 0;\n");
935  source.append(" vi_in_vk[4] = 0;\n");
936  source.append(" vi_in_vk[5] = 0;\n");
937  source.append(" vi_in_vk[6] = 0;\n");
938  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
939  source.append(" value_vk = krylov_basis[i + k * internal_size]; \n");
940  source.append(" \n");
941  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
942  source.append(" vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
943  source.append(" } \n");
944 
945  // parallel reduction in work group
946  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
947  source.append(" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
948  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
949  source.append(" { \n");
950  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
951  source.append(" if (get_local_id(0) < stride) { \n");
952  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
953  source.append(" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
954  source.append(" } ");
955  source.append(" } ");
956 
957  // write results to result array
958  source.append(" if (get_local_id(0) == 0) \n ");
959  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
960  source.append(" vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
961 
962  source.append(" k_base += vecs_in_iteration; \n");
963  source.append(" } \n");
964 
965  source.append("} \n");
966 
967 }
968 
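gmres_gram_schmidt_1 below forms the inner products <v_i, v_k> of the current Krylov vector against up to seven previous basis vectors per sweep, leaving one partial result per work group in vi_in_vk_buffer. A serial reference for the quantity being computed, assuming a row-pitched basis where vector j starts at j * internal_size; the chunking and per-group reduction are omitted and all names are illustrative:

#include <cstddef>
#include <vector>

// Serial reference for what gmres_gram_schmidt_1 computes. The kernel
// additionally processes at most seven previous vectors per sweep and
// reduces per work group.
template<typename T>
std::vector<T> inner_products_with_vk(std::vector<T> const & krylov_basis,
                                      std::size_t size, std::size_t internal_size,
                                      std::size_t k)
{
  std::vector<T> vi_in_vk(k, T(0));
  for (std::size_t i = 0; i < size; ++i)
  {
    T value_vk = krylov_basis[i + k * internal_size];
    for (std::size_t j = 0; j < k; ++j)
      vi_in_vk[j] += value_vk * krylov_basis[i + j * internal_size];
  }
  return vi_in_vk;
}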
969 template <typename StringType>
970 void generate_pipelined_gmres_gram_schmidt_stage2(StringType & source, std::string const & numeric_string)
971 {
972  source.append("__kernel void gmres_gram_schmidt_2( \n");
973  source.append(" __global "); source.append(numeric_string); source.append(" * krylov_basis, \n");
974  source.append(" unsigned int size, \n");
975  source.append(" unsigned int internal_size, \n");
976  source.append(" unsigned int k, \n");
977  source.append(" __global "); source.append(numeric_string); source.append(" const * vi_in_vk_buffer, \n");
978  source.append(" unsigned int chunk_size, \n");
979  source.append(" __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
980  source.append(" unsigned int krylov_dim, \n");
981  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
982  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
983  source.append("{ \n");
984 
985  source.append(" "); source.append(numeric_string); source.append(" vk_dot_vk = 0; \n");
986  source.append(" "); source.append(numeric_string); source.append(" value_vk = 0; \n");
987 
988  source.append(" unsigned int k_base = 0; \n");
989  source.append(" while (k_base < k) { \n");
990  source.append(" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
991 
992  // parallel reduction in work group for <v_i, v_k>
993  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
994  source.append(" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
995  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
996  source.append(" { \n");
997  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
998  source.append(" if (get_local_id(0) < stride) { \n");
999  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1000  source.append(" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1001  source.append(" } ");
1002  source.append(" } ");
1003  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1004 
1005  // v_k -= <v_i, v_k> v_i:
1006  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1007  source.append(" value_vk = krylov_basis[i + k * internal_size]; \n");
1008  source.append(" \n");
1009  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1010  source.append(" value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
1011  source.append(" vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0; \n");
1012  source.append(" krylov_basis[i + k * internal_size] = value_vk; \n");
1013  source.append(" } \n");
1014 
1015  // write to R: (to avoid thread divergence, all threads write the same value)
1016  source.append(" if (get_group_id(0) == 0) \n");
1017  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1018  source.append(" R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
1019  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1020 
1021  source.append(" k_base += vecs_in_iteration; \n");
1022  source.append(" } \n");
1023 
1024  // parallel reduction in work group for <v_k, v_k>
1025  source.append(" shared_array[get_local_id(0)] = vk_dot_vk; \n");
1026  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1027  source.append(" { \n");
1028  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1029  source.append(" if (get_local_id(0) < stride) \n");
1030  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1031  source.append(" } ");
1032 
1033  // write results to result array
1034  source.append(" if (get_local_id(0) == 0) \n ");
1035  source.append(" inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
1036 
1037  source.append("} \n");
1038 }
1039 
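gmres_gram_schmidt_2 then subtracts the projections, v_k -= <v_i, v_k> v_i, records the coefficients as column k of R, and on the last chunk accumulates <v_k, v_k> for the subsequent normalization. A serial sketch of the same update, with vi_in_vk standing for the already-reduced inner products (names illustrative):

#include <cstddef>
#include <vector>

// Serial reference for the projection step of gmres_gram_schmidt_2:
// orthogonalize v_k against the previous basis vectors, store the
// coefficients as column k of R, and return <v_k, v_k>.
template<typename T>
T orthogonalize_vk(std::vector<T> & krylov_basis, std::vector<T> & R,
                   std::vector<T> const & vi_in_vk,
                   std::size_t size, std::size_t internal_size,
                   std::size_t k, std::size_t krylov_dim)
{
  T vk_dot_vk = 0;
  for (std::size_t i = 0; i < size; ++i)
  {
    T value_vk = krylov_basis[i + k * internal_size];
    for (std::size_t j = 0; j < k; ++j)
      value_vk -= vi_in_vk[j] * krylov_basis[i + j * internal_size];
    vk_dot_vk += value_vk * value_vk;
    krylov_basis[i + k * internal_size] = value_vk;
  }
  for (std::size_t j = 0; j < k; ++j)
    R[j + k * krylov_dim] = vi_in_vk[j];  // column k of R
  return vk_dot_vk;
}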
1040 template <typename StringType>
1041 void generate_pipelined_gmres_normalize_vk(StringType & source, std::string const & numeric_string)
1042 {
1043  source.append("__kernel void gmres_normalize_vk( \n");
1044  source.append(" __global "); source.append(numeric_string); source.append(" * vk, \n");
1045  source.append(" unsigned int vk_offset, \n");
1046  source.append(" __global "); source.append(numeric_string); source.append(" const * residual, \n");
1047  source.append(" __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
1048  source.append(" unsigned int R_offset, \n");
1049  source.append(" __global "); source.append(numeric_string); source.append(" const * inner_prod_buffer, \n");
1050  source.append(" unsigned int chunk_size, \n");
1051  source.append(" __global "); source.append(numeric_string); source.append(" * r_dot_vk_buffer, \n");
1052  source.append(" unsigned int chunk_offset, \n");
1053  source.append(" unsigned int size, \n");
1054  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
1055  source.append("{ \n");
1056 
1057  source.append(" "); source.append(numeric_string); source.append(" norm_vk = 0; \n");
1058 
1059  // parallel reduction in work group to compute <vk, vk>
1060  source.append(" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
1061  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1062  source.append(" { \n");
1063  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1064  source.append(" if (get_local_id(0) < stride) \n");
1065  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1066  source.append(" } ");
1067 
 1068  // compute the norm of v_k from the reduced values:
1069  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1070  source.append(" norm_vk = sqrt(shared_array[0]); \n");
1071 
1072  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
1073  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1074  source.append(" "); source.append(numeric_string); source.append(" value_vk = vk[i + vk_offset] / norm_vk; \n");
1075  source.append(" \n");
1076  source.append(" inner_prod_contrib += residual[i] * value_vk; \n");
1077  source.append(" \n");
1078  source.append(" vk[i + vk_offset] = value_vk; \n");
1079  source.append(" } \n");
1080  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1081 
1082  // parallel reduction in work group
1083  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
1084  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1085  source.append(" { \n");
1086  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1087  source.append(" if (get_local_id(0) < stride) \n");
1088  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1089  source.append(" } ");
1090 
1091  // write results to result array
1092  source.append(" if (get_local_id(0) == 0) \n ");
1093  source.append(" r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
1094  source.append(" if (get_global_id(0) == 0) \n ");
1095  source.append(" R_buffer[R_offset] = norm_vk; \n");
1096 
1097  source.append("} \n");
1098 
1099 }
1100 
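gmres_normalize_vk reduces the partial <v_k, v_k> values left in inner_prod_buffer, scales v_k to unit length, stores the norm as R_buffer[R_offset], and computes per-group partials of <residual, v_k> in the same pass. A serial reference, assuming the reduced norm square is already at hand (names illustrative):

#include <cmath>
#include <cstddef>
#include <vector>

// Serial reference for gmres_normalize_vk: scale v_k to unit length and
// form <residual, v_k> in the same pass. norm_sq is the already-reduced
// <v_k, v_k>.
template<typename T>
T normalize_and_dot(std::vector<T> & vk, std::vector<T> const & residual,
                    std::size_t size, T norm_sq, T & norm_vk_out)
{
  T norm_vk = std::sqrt(norm_sq);
  T r_dot_vk = 0;
  for (std::size_t i = 0; i < size; ++i)
  {
    T value_vk = vk[i] / norm_vk;
    r_dot_vk += residual[i] * value_vk;
    vk[i] = value_vk;
  }
  norm_vk_out = norm_vk;  // written to R_buffer[R_offset] by the kernel
  return r_dot_vk;
}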
1101 template <typename StringType>
1102 void generate_pipelined_gmres_update_result(StringType & source, std::string const & numeric_string)
1103 {
1104  source.append("__kernel void gmres_update_result( \n");
1105  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1106  source.append(" __global "); source.append(numeric_string); source.append(" const * residual, \n");
1107  source.append(" __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
1108  source.append(" unsigned int size, \n");
1109  source.append(" unsigned int internal_size, \n");
1110  source.append(" __global "); source.append(numeric_string); source.append(" const * coefficients, \n");
1111  source.append(" unsigned int k) \n");
1112  source.append("{ \n");
1113 
1114  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1115  source.append(" "); source.append(numeric_string); source.append(" value_result = result[i] + coefficients[0] * residual[i]; \n");
1116  source.append(" \n");
1117  source.append(" for (unsigned int j = 1; j < k; ++j) \n");
1118  source.append(" value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
1119  source.append(" \n");
1120  source.append(" result[i] = value_result; \n");
1121  source.append(" } \n");
1122 
1123  source.append("} \n");
1124 }
1125 
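gmres_update_result applies the correction x += c_0 * r + sum over j = 1..k-1 of c_j * v_{j-1}; note the off-by-one, as the residual acts as the zeroth direction while krylov_basis holds the remaining ones. A serial equivalent (names illustrative):

#include <cstddef>
#include <vector>

// Serial reference for gmres_update_result: the residual is the zeroth
// search direction, followed by the stored Krylov basis vectors.
template<typename T>
void update_result(std::vector<T> & result,
                   std::vector<T> const & residual,
                   std::vector<T> const & krylov_basis,
                   std::vector<T> const & coefficients,
                   std::size_t size, std::size_t internal_size, std::size_t k)
{
  for (std::size_t i = 0; i < size; ++i)
  {
    T value = result[i] + coefficients[0] * residual[i];
    for (std::size_t j = 1; j < k; ++j)
      value += coefficients[j] * krylov_basis[i + (j - 1) * internal_size];
    result[i] = value;
  }
}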
1126 
1127 template <typename StringType>
1128 void generate_compressed_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1129 {
1130  source.append("__kernel void gmres_csr_prod( \n");
1131  source.append(" __global const unsigned int * row_indices, \n");
1132  source.append(" __global const unsigned int * column_indices, \n");
1133  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1134  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1135  source.append(" unsigned int offset_p, \n");
1136  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1137  source.append(" unsigned int offset_Ap, \n");
1138  source.append(" unsigned int size, \n");
1139  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1140  source.append(" unsigned int buffer_size, \n");
1141  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1142  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1143  source.append("{ \n");
1144  source.append(" cg_csr_prod(row_indices, column_indices, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1145  source.append("} \n \n");
1146 
1147 }
1148 
1149 template <typename StringType>
1150 void generate_coordinate_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1151 {
1152  source.append("__kernel void gmres_coo_prod( \n");
1153  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
1154  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1155  source.append(" __global const uint * group_boundaries, \n");
1156  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1157  source.append(" unsigned int offset_p, \n");
1158  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1159  source.append(" unsigned int offset_Ap, \n");
1160  source.append(" unsigned int size, \n");
1161  source.append(" __local unsigned int * shared_rows, \n");
1162  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
1163  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1164  source.append(" unsigned int buffer_size, \n");
1165  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1166  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1167  source.append("{ \n");
1168  source.append(" cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1169  source.append("} \n \n");
1170 
1171 }
1172 
1173 
1174 template <typename StringType>
1175 void generate_ell_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1176 {
1177  source.append("__kernel void gmres_ell_prod( \n");
1178  source.append(" __global const unsigned int * coords, \n");
1179  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1180  source.append(" unsigned int internal_row_num, \n");
1181  source.append(" unsigned int items_per_row, \n");
1182  source.append(" unsigned int aligned_items_per_row, \n");
1183  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1184  source.append(" unsigned int offset_p, \n");
1185  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1186  source.append(" unsigned int offset_Ap, \n");
1187  source.append(" unsigned int size, \n");
1188  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1189  source.append(" unsigned int buffer_size, \n");
1190  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1191  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1192  source.append("{ \n");
1193  source.append(" cg_ell_prod(coords, elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1194  source.append("} \n \n");
1195 }
1196 
1197 template <typename StringType>
1198 void generate_sliced_ell_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1199 {
1200  source.append("__kernel void gmres_sliced_ell_prod( \n");
1201  source.append(" __global const unsigned int * columns_per_block, \n");
1202  source.append(" __global const unsigned int * column_indices, \n");
1203  source.append(" __global const unsigned int * block_start, \n");
1204  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1205  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1206  source.append(" unsigned int offset_p, \n");
1207  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1208  source.append(" unsigned int offset_Ap, \n");
1209  source.append(" unsigned int size, \n");
1210  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1211  source.append(" unsigned int buffer_size, \n");
1212  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1213  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1214  source.append("{ \n");
1215  source.append(" cg_sliced_ell_prod(columns_per_block, column_indices, block_start, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1216  source.append("} \n \n");
1217 }
1218 
1219 template <typename StringType>
1220 void generate_hyb_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1221 {
1222  source.append("__kernel void gmres_hyb_prod( \n");
1223  source.append(" const __global int* ell_coords, \n");
1224  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
1225  source.append(" const __global uint* csr_rows, \n");
1226  source.append(" const __global uint* csr_cols, \n");
1227  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
1228  source.append(" unsigned int internal_row_num, \n");
1229  source.append(" unsigned int items_per_row, \n");
1230  source.append(" unsigned int aligned_items_per_row, \n");
1231  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1232  source.append(" unsigned int offset_p, \n");
1233  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1234  source.append(" unsigned int offset_Ap, \n");
1235  source.append(" unsigned int size, \n");
1236  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1237  source.append(" unsigned int buffer_size, \n");
1238  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1239  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1240  source.append("{ \n");
1241  source.append(" cg_hyb_prod(ell_coords, ell_elements, csr_rows, csr_cols, csr_elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1242  source.append("} \n \n");
1243 }
1244 
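All five gmres_*_prod kernels above are thin wrappers: they add offset_p / offset_Ap to the vector pointers, so that individual vectors inside the pitched Krylov-basis buffer are addressed, and then forward to the corresponding cg_*_prod kernel, which consequently has to be compiled into the same program (cf. the concatenation order in init() below). A hypothetical sketch of how such offsets could be derived on the host; the k / k+1 layout is an assumption for illustration, not taken from this header:

#include <utility>

// Hypothetical helper (not part of ViennaCL): element offsets of vector k
// (SpMV input) and vector k+1 (SpMV output) inside a pitched Krylov-basis
// buffer, as one might pass via offset_p / offset_Ap.
inline std::pair<unsigned int, unsigned int>
krylov_offsets(unsigned int k, unsigned int internal_size)
{
  return std::make_pair(k * internal_size,          // offset_p
                        (k + 1) * internal_size);   // offset_Ap
}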
1245 
1246 
1247 
1249 
 1250 // main kernel class
 1251 /** @brief Main kernel class for generating specialized OpenCL kernels for fast iterative solvers. */
 1252 template<typename NumericT>
 1253 struct iterative
 1254 {
1255  static std::string program_name()
1256  {
1257  return viennacl::ocl::type_to_string<NumericT>::apply() + "_iterative";
1258  }
1259 
1260  static void init(viennacl::ocl::context & ctx)
1261  {
1262  static std::map<cl_context, bool> init_done;
1263  if (!init_done[ctx.handle().get()])
1264  {
 1265  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
 1266  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
1267 
1268  std::string source;
1269  source.reserve(1024);
1270 
1271  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1272 
1273  generate_pipelined_cg_vector_update(source, numeric_string);
1274  generate_compressed_matrix_pipelined_cg_prod(source, numeric_string);
1275  generate_coordinate_matrix_pipelined_cg_prod(source, numeric_string);
1276  generate_ell_matrix_pipelined_cg_prod(source, numeric_string);
1277  generate_sliced_ell_matrix_pipelined_cg_prod(source, numeric_string);
1278  generate_hyb_matrix_pipelined_cg_prod(source, numeric_string);
1279 
1280  generate_pipelined_bicgstab_update_s(source, numeric_string);
1281  generate_pipelined_bicgstab_vector_update(source, numeric_string);
 1282  generate_compressed_matrix_pipelined_bicgstab_prod(source, numeric_string);
 1283  generate_coordinate_matrix_pipelined_bicgstab_prod(source, numeric_string);
 1284  generate_ell_matrix_pipelined_bicgstab_prod(source, numeric_string);
 1285  generate_sliced_ell_matrix_pipelined_bicgstab_prod(source, numeric_string);
 1286  generate_hyb_matrix_pipelined_bicgstab_prod(source, numeric_string);
1287 
1288  generate_pipelined_gmres_gram_schmidt_stage1(source, numeric_string);
1289  generate_pipelined_gmres_gram_schmidt_stage2(source, numeric_string);
1290  generate_pipelined_gmres_normalize_vk(source, numeric_string);
1291  generate_pipelined_gmres_update_result(source, numeric_string);
1292  generate_compressed_matrix_pipelined_gmres_prod(source, numeric_string);
1293  generate_coordinate_matrix_pipelined_gmres_prod(source, numeric_string);
1294  generate_ell_matrix_pipelined_gmres_prod(source, numeric_string);
1295  generate_sliced_ell_matrix_pipelined_gmres_prod(source, numeric_string);
1296  generate_hyb_matrix_pipelined_gmres_prod(source, numeric_string);
1297 
1298  std::string prog_name = program_name();
1299  #ifdef VIENNACL_BUILD_INFO
1300  std::cout << "Creating program " << prog_name << std::endl;
1301  #endif
1302  ctx.add_program(source, prog_name);
1303  init_done[ctx.handle().get()] = true;
1304  } //if
1305  } //init
1306 };
1307 
1308 } // namespace kernels
1309 } // namespace opencl
1310 } // namespace linalg
1311 } // namespace viennacl
1312 #endif
1313 
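Typical host-side usage of the kernel class, as a sketch: init() compiles all kernels above once per context, after which individual kernels can be fetched by name (cg_vector_update is one of the kernel names generated above; error handling omitted):

#include "viennacl/ocl/backend.hpp"
#include "viennacl/linalg/opencl/kernels/iterative.hpp"

// Sketch: compile the iterative-solver kernels for float once for the
// current context, then fetch one of the generated kernels by name.
void setup_iterative_kernels()
{
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();
  viennacl::linalg::opencl::kernels::iterative<float>::init(ctx);

  viennacl::ocl::kernel & k
    = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<float>::program_name(),
                     "cg_vector_update");
  (void)k; // in real code, set arguments and enqueue via viennacl::ocl::enqueue()
}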