ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
spai.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
3 
7 #include "viennacl/ocl/utils.hpp"
8 
11 namespace viennacl
12 {
13 namespace linalg
14 {
15 namespace opencl
16 {
17 namespace kernels
18 {
19 
21 
/** @brief Appends the OpenCL source of the kernel 'assemble_blocks' and its two device helpers to 'source'.
*
* Generated device code:
*  - get_element(): scans the entries of one row (row_indices[row] up to row_indices[row+1]) and returns the
*    stored value whose column index equals 'col'. It returns 0 if no such entry exists and gives up as soon as a
*    column index larger than 'col' is met (presumably relies on column indices being sorted per row - confirm
*    against the caller).
*  - block_assembly(): fills one block, writing get_element(set_I[j], set_J[i]) to com_A_I_J[i*row_n + j].
*  - assemble_blocks: loops over the blocks and calls block_assembly() for every block with nonzero dimensions
*    and a positive g_is_update flag.
*
* The return type of get_element() follows 'numeric_string'. It used to be hard-coded to 'float', which silently
* narrowed the matrix entries whenever 'numeric_string' named a wider type such as 'double'.
*
* @param source          String the generated OpenCL code is appended to
* @param numeric_string  Name of the OpenCL scalar type used for the matrix entries
*/
template<typename StringT>
void generate_spai_assemble_blocks(StringT & source, std::string const & numeric_string)
{
  // get_element(): return type must match the element type to avoid precision loss
  source.append(numeric_string); source.append(" get_element(__global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int row, \n");
  source.append(" unsigned int col) \n");
  source.append("{ \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
  source.append(" if (column_indices[i] == col) \n");
  source.append(" return elements[i]; \n");
  source.append(" if (column_indices[i] > col) \n");
  source.append(" return 0; \n");
  source.append(" } \n");
  source.append(" return 0; \n");
  source.append("} \n");

  // block_assembly(): one block, dimensions taken from matrix_dimensions[2*matrix_ind (+1)]
  source.append("void block_assembly(__global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const unsigned int * matrix_dimensions, \n");
  source.append(" __global const unsigned int * set_I, \n");
  source.append(" __global const unsigned int * set_J, \n");
  source.append(" unsigned int matrix_ind, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J) \n");
  source.append("{ \n");
  source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
  source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");

  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
  // inner loop runs over the rows of the block
  source.append(" for (unsigned int j = 0; j < row_n; j++){ \n");
  source.append(" com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  // kernel entry point: each work-item strides over the blocks by the global size
  source.append("__kernel void assemble_blocks( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const unsigned int * set_I, \n");
  source.append(" __global const unsigned int * set_J, \n");
  source.append(" __global const unsigned int * i_ind, \n");
  source.append(" __global const unsigned int * j_ind, \n");
  source.append(" __global const unsigned int * block_ind, \n");
  source.append(" __global const unsigned int * matrix_dimensions, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J, \n");
  source.append(" __global unsigned int * g_is_update, \n");
  source.append(" unsigned int block_elems_num) \n");
  source.append("{ \n");
  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
  source.append(" block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");
}
82 
/** @brief Emits the OpenCL program text of the kernel 'block_bv_assembly' together with its two device helpers.
*
* The text is appended to 'source'; 'numeric_string' is inserted wherever the emitted code names the type of
* the vector entries.
*
* @param source          Receives the generated OpenCL code (appended at its end)
* @param numeric_string  Type name substituted for the vector entries
*/
template<typename StringT>
void generate_spai_block_bv_assembly(StringT & source, std::string const & numeric_string)
{
  // assemble_bv(): element-wise copy of col_n entries from g_bv into g_bv_r
  source.append(" void assemble_bv(__global ");
  source.append(numeric_string);
  source.append(" * g_bv_r, __global ");
  source.append(numeric_string);
  source.append(" * g_bv, unsigned int col_n){ \n"
                " for (unsigned int i = 0; i < col_n; ++i){ \n"
                " g_bv_r[i] = g_bv[ i]; \n"
                " } \n"
                " } \n");

  // assemble_bv_block(): g_bv goes first, g_bv_u is placed right behind it (offset col_n)
  source.append(" void assemble_bv_block(__global ");
  source.append(numeric_string);
  source.append(" * g_bv_r, __global ");
  source.append(numeric_string);
  source.append(" * g_bv, unsigned int col_n, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * g_bv_u, unsigned int col_n_u) \n"
                " { \n"
                " assemble_bv(g_bv_r, g_bv, col_n); \n"
                " assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n"
                " } \n");

  // kernel signature; the '//__local ...' parameter is emitted as an OpenCL line comment
  source.append(" __kernel void block_bv_assembly(__global ");
  source.append(numeric_string);
  source.append(" * g_bv, \n"
                " __global unsigned int * start_bv_ind, \n"
                " __global unsigned int * matrix_dimensions, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * g_bv_u, \n"
                " __global unsigned int * start_bv_u_ind, \n"
                " __global unsigned int * matrix_dimensions_u, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * g_bv_r, \n"
                " __global unsigned int * start_bv_r_ind, \n"
                " __global unsigned int * matrix_dimensions_r, \n"
                " __global unsigned int * g_is_update, \n"
                " //__local ");
  source.append(numeric_string);

  // kernel body: blocks with nonzero dimensions and a positive update flag are assembled
  source.append(" * local_gb, \n"
                " unsigned int block_elems_num) \n"
                " { \n"
                " for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"
                " if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"
                " assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n"
                " } \n"
                " } \n"
                " } \n");
}
119 
/** @brief Emits the OpenCL program text of the kernel 'block_least_squares' and the device functions it relies on.
*
* Device functions written to 'source', in this order: custom_dot_prod_ls(), backwardSolve(),
* apply_q_trans_vec_ls(), ls() and finally the kernel itself. ls() first runs apply_q_trans_vec_ls() on y_v
* and then backwardSolve() from y_v into m_v.
*
* @param source          Receives the generated OpenCL code (appended at its end)
* @param numeric_string  Type name substituted for all floating point quantities in the emitted code
*/
template<typename StringT>
void generate_spai_block_least_squares(StringT & source, std::string const & numeric_string)
{
  // custom_dot_prod_ls(): v[ind] plus the sum of A[j + ind*row_n]*v[j] for j > ind
  source.append("void custom_dot_prod_ls(__global ");
  source.append(numeric_string);
  source.append(" * A, unsigned int row_n, __global ");
  source.append(numeric_string);
  source.append(" * v, unsigned int ind, ");
  source.append(numeric_string);
  source.append(" *res){ \n"
                " *res = 0.0; \n"
                " for (unsigned int j = ind; j < row_n; ++j){ \n"
                " if (j == ind){ \n"
                " *res += v[ j]; \n"
                " }else{ \n"
                " *res += A[ j + ind*row_n]*v[ j]; \n"
                " } \n"
                " } \n"
                "} \n");

  // backwardSolve(): back substitution, last unknown first
  source.append("void backwardSolve(__global ");
  source.append(numeric_string);
  source.append(" * R, unsigned int row_n, unsigned int col_n, __global ");
  source.append(numeric_string);
  source.append(" * y, __global ");
  source.append(numeric_string);
  source.append(" * x){ \n"
                " for (int i = col_n-1; i >= 0; i--) { \n"
                " x[ i] = y[ i]; \n"
                " for (int j = i+1; j < col_n; ++j) { \n"
                " x[ i] -= R[ i + j*row_n]*x[ j]; \n"
                " } \n"
                " x[i] /= R[ i + i*row_n]; \n"
                " } \n"
                "} \n");

  // apply_q_trans_vec_ls(): updates y in place, one column of R and one entry of b_v per step
  source.append("void apply_q_trans_vec_ls(__global ");
  source.append(numeric_string);
  source.append(" * R, unsigned int row_n, unsigned int col_n, __global const ");
  source.append(numeric_string);
  source.append(" * b_v, __global ");
  source.append(numeric_string);
  source.append(" * y){ \n"
                " ");
  source.append(numeric_string);
  source.append(" inn_prod = 0; \n"
                " for (unsigned int i = 0; i < col_n; ++i){ \n"
                " custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n"
                " for (unsigned int j = i; j < row_n; ++j){ \n"
                " if (i == j){ \n"
                " y[ j] -= b_v[ i]*inn_prod; \n"
                " } \n"
                " else{ \n"
                " y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n"
                " } \n"
                " } \n"
                " } \n"
                " } \n");

  // ls(): transform y_v, then solve into m_v
  source.append("void ls(__global ");
  source.append(numeric_string);
  source.append(" * R, unsigned int row_n, unsigned int col_n, __global ");
  source.append(numeric_string);
  source.append(" * b_v, __global ");
  source.append(numeric_string);
  source.append(" * m_v, __global ");
  source.append(numeric_string);
  source.append(" * y_v){ \n"
                " apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n"
                " //m_new - is m_v now \n"
                " backwardSolve(R, row_n, col_n, y_v, m_v); \n"
                "} \n");

  // kernel signature
  source.append("__kernel void block_least_squares( \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * global_R, \n"
                " __global unsigned int * block_ind, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * b_v, \n"
                " __global unsigned int * start_bv_inds, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * m_v, \n"
                " __global ");
  source.append(numeric_string);

  // kernel body: one ls() call per block that has nonzero dimensions and a positive update flag
  source.append(" * y_v, \n"
                " __global unsigned int * start_y_inds, \n"
                " __global unsigned int * matrix_dimensions, \n"
                " __global unsigned int * g_is_update, \n"
                " unsigned int block_elems_num) \n"
                "{ \n"
                " for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"
                " if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"
                " ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n"
                " } \n"
                " } \n"
                "} \n");
}
185 
/** @brief Appends the OpenCL source of the kernel 'block_q_mult' and its device helpers to 'source'.
*
* Generated device code:
*  - custom_dot_prod(): v[ind] plus the sum of A[j + ind*row_n]*v[j] for j > ind (v lives in __local memory).
*  - apply_q_trans_vec(): in-place update of a __local vector y, using one column of R and one entry of b_v per step.
*  - q_mult(): applies apply_q_trans_vec() to the columns of R_u, columns being distributed over the work-items
*    of a work-group.
*  - matrix_from_global_to_local() / matrix_from_local_to_global(): column-wise copies between global and local memory.
*  - block_q_mult: each work-group strides over the blocks; per block it loads R_u into local memory, runs
*    q_mult() and writes the result back.
*
* The kernel reuses the same __local buffer for every block a work-group processes. Because the local offset of
* a column depends on the row count of the block, a work-item that already started loading the next block could
* overwrite data another work-item has not yet written back. A barrier is therefore emitted after the write-back.
*
* @param source          String the generated OpenCL code is appended to
* @param numeric_string  Name of the OpenCL scalar type used for the matrix and vector entries
*/
template<typename StringT>
void generate_spai_block_q_mult(StringT & source, std::string const & numeric_string)
{
  source.append("void custom_dot_prod(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
  source.append(" *res = 0.0; \n");
  source.append(" for (unsigned int j = ind; j < row_n; ++j){ \n");
  source.append(" if (j == ind){ \n");
  source.append(" *res += v[j]; \n");
  source.append(" }else{ \n");
  source.append(" *res += A[j + ind*row_n]*v[j]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * y){ \n");
  source.append(" "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
  source.append(" custom_dot_prod(R, row_n, y, i, &inn_prod); \n");
  source.append(" for (unsigned int j = i; j < row_n; ++j){ \n");
  source.append(" if (i == j){ \n");
  source.append(" y[j] -= b_v[ i]*inn_prod; \n");
  source.append(" } \n");
  source.append(" else{ \n");
  source.append(" y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void q_mult(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * R_u, unsigned int col_n_u){ \n");
  source.append(" for (unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n");
  source.append(" apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
  source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
  source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
  source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
  source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(" * global_R, \n");
  source.append(" __global unsigned int * block_ind, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * global_R_u, \n");
  source.append(" __global unsigned int *block_ind_u, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n");
  source.append(" __global unsigned int * start_bv_inds, \n");
  source.append(" __global unsigned int * matrix_dimensions, \n");
  source.append(" __global unsigned int * matrix_dimensions_u, \n");
  source.append(" __global unsigned int * g_is_update, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * local_R_u, \n");
  source.append(" unsigned int block_elems_num){ \n");
  source.append(" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n");
  // only R_u is staged in local memory; R itself is read from global memory
  source.append(" matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n");
  source.append(" matrix_dimensions_u[2*i + 1]); \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n");
  // local_R_u is reused for the next block: wait until every work-item has finished its write-back
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
}
261 
/** @brief Appends the OpenCL source of the kernel 'block_qr' and its device helpers to 'source'.
*
* Generated device code:
*  - dot_prod(), vector_div(), copy_vector(), custom_inner_prod(): small vector helpers working on one column
*    of the __local matrix and/or the __global vector v.
*  - householder_vector(): builds the vector v and the scalar *b for column j of A.
*  - apply_householder_reflection(): updates the columns iter_cnt .. col_n-1 of A, columns being distributed
*    over the work-items of a work-group.
*  - store_householder_vector(): writes v[ind .. n-1] into column ind-1 of A.
*  - single_qr(): per column, work-item 0 builds the Householder vector, all work-items apply the reflection,
*    then work-item 0 stores the vector in A.
*  - matrix_from_global_to_local_qr() / matrix_from_local_to_global_qr(): column-wise copies.
*  - block_qr: each work-group strides over the blocks; per block it stages R in local memory, runs single_qr()
*    and writes the result back.
*
* Synchronisation notes:
*  - v and b_v are in __global memory and are written by work-item 0 only, while all work-items of the group read
*    them afterwards. The barrier between these two steps therefore also requests CLK_GLOBAL_MEM_FENCE.
*  - The __local buffer is reused for every block a work-group processes and the local offset of a column depends
*    on the row count of the block, so a barrier is emitted after the write-back to keep a fast work-item from
*    loading the next block over data that has not been written back yet.
*
* @param source          String the generated OpenCL code is appended to
* @param numeric_string  Name of the OpenCL scalar type used for the matrix and vector entries
*/
template<typename StringT>
void generate_spai_block_qr(StringT & source, std::string const & numeric_string)
{
  source.append("void dot_prod(__local const "); source.append(numeric_string); source.append("* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append("* res){ \n");
  source.append(" *res = 0; \n");
  source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
  source.append(" *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void vector_div(__global "); source.append(numeric_string); source.append("* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(" b, unsigned int n){ \n");
  source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
  source.append(" v[i] /= b; \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void copy_vector(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, const unsigned int beg_ind, const unsigned int n){ \n");
  source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
  source.append(" v[i] = A[(beg_ind-1)*n + i]; \n");
  source.append(" } \n");
  source.append("} \n");


  source.append("void householder_vector(__local const "); source.append(numeric_string); source.append("* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append("* v, __global "); source.append(numeric_string); source.append("* b){ \n");
  source.append(" "); source.append(numeric_string); source.append(" sg; \n");
  source.append(" dot_prod(A, n, j+1, &sg); \n");
  source.append(" copy_vector(A, v, j+1, n); \n");
  source.append(" "); source.append(numeric_string); source.append(" mu; \n");
  source.append(" v[j] = 1.0; \n");
  source.append(" if (sg == 0){ \n");
  source.append(" *b = 0; \n");
  source.append(" } \n");
  source.append(" else{ \n");
  source.append(" mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n");
  source.append(" if (A[ j*n + j] <= 0){ \n");
  source.append(" v[j] = A[ j*n + j] - mu; \n");
  source.append(" }else{ \n");
  source.append(" v[j] = -sg/(A[ j*n + j] + mu); \n");
  source.append(" } \n");
  source.append(" *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n");
  // v[j] is passed by value, so the divisor stays fixed while v[j .. n-1] is scaled
  source.append(" vector_div(v, j, v[j], n); \n");
  source.append(" } \n");
  source.append("} \n");

  // custom_inner_prod() accumulates into *res; the caller resets it beforehand
  source.append("void custom_inner_prod(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append("* res){ \n");
  source.append(" for (unsigned int i = start_ind; i < row_num; ++i){ \n");
  source.append(" *res += A[col_ind*row_num + i]*v[i]; \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void apply_householder_reflection(__local "); source.append(numeric_string); source.append("* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append("* v, "); source.append(numeric_string); source.append(" b){ \n");
  source.append(" "); source.append(numeric_string); source.append(" in_prod_res; \n");
  source.append(" for (unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n");
  source.append(" in_prod_res = 0.0; \n");
  source.append(" custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n");
  source.append(" for (unsigned int j = iter_cnt; j < row_n; ++j){ \n");
  source.append(" A[ i*row_n + j] -= b*in_prod_res* v[j]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void store_householder_vector(__local "); source.append(numeric_string); source.append("* A, unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append("* v){ \n");
  source.append(" for (unsigned int i = ind; i < n; ++i){ \n");
  source.append(" A[ (ind-1)*n + i] = v[i]; \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void single_qr( __local "); source.append(numeric_string); source.append("* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append("* b_v, __global "); source.append(numeric_string); source.append("* v, unsigned int matrix_ind){ \n");
  // matrix_dimensions[2*matrix_ind]     - number of rows
  // matrix_dimensions[2*matrix_ind + 1] - number of columns
  source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
  source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");

  source.append(" if ((col_n == row_n)&&(row_n == 1)){ \n");
  source.append(" b_v[0] = 0.0; \n");
  source.append(" return; \n");
  source.append(" } \n");
  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
  source.append(" if (get_local_id(0) == 0){ \n");
  source.append(" householder_vector(R, i, row_n, v, b_v + i); \n");
  source.append(" } \n");
  // v and b_v are global buffers written by work-item 0: a global fence is needed before the others read them
  source.append(" barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) == 0){ \n");
  source.append(" if (i < matrix_dimensions[2*matrix_ind]){ \n");
  source.append(" store_householder_vector(R, i+1, row_n, v); \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
  source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
  source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
  source.append("void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
  source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
  source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");


  source.append("__kernel void block_qr( \n");
  source.append(" __global "); source.append(numeric_string); source.append("* R, \n");
  source.append(" __global unsigned int* matrix_dimensions, \n");
  source.append(" __global "); source.append(numeric_string); source.append("* b_v, \n");
  source.append(" __global "); source.append(numeric_string); source.append("* v, \n");
  source.append(" __global unsigned int* start_matrix_inds, \n");
  source.append(" __global unsigned int* start_bv_inds, \n");
  source.append(" __global unsigned int* start_v_inds, \n");
  source.append(" __global unsigned int * g_is_update, \n");
  source.append(" __local "); source.append(numeric_string); source.append("* local_buff_R, \n");
  source.append(" unsigned int block_elems_num){ \n");
  source.append(" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
  source.append(" matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
  // local_buff_R is reused for the next block: wait until every work-item has finished its write-back
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
}
395 
/** @brief Emits the OpenCL program text of the kernel 'block_qr_assembly' and the device functions it relies on.
*
* Device functions written to 'source', in this order: assemble_upper_part(), assemble_lower_part(),
* assemble_qr_block() and finally the kernel itself. assemble_qr_block() computes diff = row_n_u - col_n,
* always runs assemble_upper_part() and runs assemble_lower_part() only if diff is positive.
*
* @param source          Receives the generated OpenCL code (appended at its end)
* @param numeric_string  Type name substituted for the matrix entries in the emitted code
*/
template<typename StringT>
void generate_spai_block_qr_assembly(StringT & source, std::string const & numeric_string)
{
  // assemble_upper_part(): first 'diff' rows of R_q are taken from R_u, starting at row col_n
  source.append("void assemble_upper_part(__global ");
  source.append(numeric_string);
  source.append(" * R_q, \n"
                " unsigned int row_n_q, unsigned int col_n_q, __global ");
  source.append(numeric_string);
  source.append(" * R_u, \n"
                " unsigned int row_n_u, unsigned int col_n_u, \n"
                " unsigned int col_n, unsigned int diff){ \n"
                " for (unsigned int i = 0; i < col_n_q; ++i){ \n"
                " for (unsigned int j = 0; j < diff; ++j){ \n"
                " R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n"
                " } \n"
                " } \n"
                " } \n");

  // assemble_lower_part(): R_u_u is copied into R_q below the first 'diff' rows
  source.append("void assemble_lower_part(__global ");
  source.append(numeric_string);
  source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global ");
  source.append(numeric_string);
  source.append(" * R_u_u, \n"
                " unsigned int row_n_u_u, unsigned int col_n_u_u, \n"
                " unsigned int diff){ \n"
                " for (unsigned int i = 0; i < col_n_u_u; ++i){ \n"
                " for (unsigned int j = 0; j < row_n_u_u; ++j){ \n"
                " R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n"
                " } \n"
                " } \n"
                "} \n");

  // assemble_qr_block(): combines both parts for a single block
  source.append("void assemble_qr_block(__global ");
  source.append(numeric_string);
  source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global ");
  source.append(numeric_string);
  source.append(" * R_u, unsigned int row_n_u, \n"
                " unsigned int col_n_u, __global ");
  source.append(numeric_string);
  source.append(" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n"
                " unsigned int diff = row_n_u - col_n; \n"
                " assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n"
                " if (diff > 0){ \n"
                " assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n"
                " } \n"
                "} \n");

  // kernel signature; the '//__local ...' parameter is emitted as an OpenCL line comment
  source.append("__kernel void block_qr_assembly( \n"
                " __global unsigned int * matrix_dimensions, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * R_u, \n"
                " __global unsigned int * block_ind_u, \n"
                " __global unsigned int * matrix_dimensions_u, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * R_u_u, \n"
                " __global unsigned int * block_ind_u_u, \n"
                " __global unsigned int * matrix_dimensions_u_u, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * R_q, \n"
                " __global unsigned int * block_ind_q, \n"
                " __global unsigned int * matrix_dimensions_q, \n"
                " __global unsigned int * g_is_update, \n"
                " //__local ");
  source.append(numeric_string);

  // kernel body: one assemble_qr_block() call per block with nonzero dimensions and a positive update flag
  source.append(" * local_R_q, \n"
                " unsigned int block_elems_num) \n"
                "{ \n"
                " for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"
                " if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"
                " assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
                " matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n"
                " } \n"
                " } \n"
                "} \n");
}
452 
/** @brief Emits the OpenCL program text of the kernel 'block_qr_assembly_1' and the device functions it relies on.
*
* Device functions written to 'source', in this order: assemble_upper_part_1(), assemble_qr_block_1() and
* finally the kernel itself. Unlike 'block_qr_assembly', no R_u_u input is involved: assemble_qr_block_1()
* only computes diff = row_n_u - col_n and calls assemble_upper_part_1().
*
* @param source          Receives the generated OpenCL code (appended at its end)
* @param numeric_string  Type name substituted for the matrix entries in the emitted code
*/
template<typename StringT>
void generate_spai_block_qr_assembly_1(StringT & source, std::string const & numeric_string)
{
  // assemble_upper_part_1(): first 'diff' rows of R_q are taken from R_u, starting at row col_n
  source.append("void assemble_upper_part_1(__global ");
  source.append(numeric_string);
  source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global ");
  source.append(numeric_string);
  source.append(" * R_u, \n"
                " unsigned int row_n_u, unsigned int col_n_u, \n"
                " unsigned int col_n, unsigned int diff){ \n"
                " for (unsigned int i = 0; i < col_n_q; ++i){ \n"
                " for (unsigned int j = 0; j < diff; ++j){ \n"
                " R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n"
                " } \n"
                " } \n"
                " } \n");

  // assemble_qr_block_1(): thin wrapper that derives 'diff' for a single block
  source.append("void assemble_qr_block_1(__global ");
  source.append(numeric_string);
  source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global ");
  source.append(numeric_string);
  source.append(" * R_u, unsigned int row_n_u, \n"
                " unsigned int col_n_u, unsigned int col_n){ \n"
                " unsigned int diff = row_n_u - col_n; \n"
                " assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n"
                "} \n");

  // kernel signature
  source.append("__kernel void block_qr_assembly_1( \n"
                " __global unsigned int * matrix_dimensions, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * R_u, \n"
                " __global unsigned int * block_ind_u, \n"
                " __global unsigned int * matrix_dimensions_u, \n"
                " __global ");
  source.append(numeric_string);

  // kernel body: one assemble_qr_block_1() call per block with nonzero dimensions and a positive update flag
  source.append(" * R_q, \n"
                " __global unsigned int * block_ind_q, \n"
                " __global unsigned int * matrix_dimensions_q, \n"
                " __global unsigned int * g_is_update, \n"
                " unsigned int block_elems_num) \n"
                "{ \n"
                " for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"
                " if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"
                " assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
                " matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n"
                " } \n"
                " } \n"
                "} \n");
}
492 
/** @brief Appends the OpenCL code for assembling a combined block from the blocks R, R_u and R_u_u.
 *
 * The generated code consists of the device functions assemble_r, assemble_r_u,
 * assemble_r_u_u and assemble_r_block, followed by the kernel block_r_assembly.
 *
 * @param source          String object the OpenCL code is appended to
 * @param numeric_string  Text inserted wherever the generated code names its scalar type
 */
template<typename StringT>
void generate_spai_block_r_assembly(StringT & source, std::string const & numeric_string)
{
  // assemble_r: copies R (indexed with stride row_n) into gR (indexed with stride row_n_r).
  source.append("void assemble_r(__global ");
  source.append(numeric_string);
  source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global ");
  source.append(numeric_string);
  source.append(" * R, \n"
                " unsigned int row_n, unsigned int col_n) \n"
                "{ \n"
                " for (unsigned int i = 0; i < col_n; ++i){ \n"
                " for (unsigned int j = 0; j < row_n; ++j){ \n"
                " gR[i*row_n_r + j] = R[i*row_n + j ]; \n"
                " } \n"
                " } \n"
                "} \n");

  // assemble_r_u: writes entries of R_u into gR, with the outer index shifted by col_n.
  source.append("void assemble_r_u(__global ");
  source.append(numeric_string);
  source.append(" * gR, \n"
                " unsigned int row_n_r, unsigned int col_n_r, __global ");
  source.append(numeric_string);
  source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n"
                " unsigned int col_n) \n"
                "{ \n"
                " for (unsigned int i = 0; i < col_n_u; ++i){ \n"
                " for (unsigned int j = 0; j < col_n; ++j){ \n"
                " gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n"
                " } \n"
                " } \n"
                "} \n");

  // assemble_r_u_u: writes R_u_u into gR, with both indices shifted by col_n.
  source.append("void assemble_r_u_u(__global ");
  source.append(numeric_string);
  source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global ");
  source.append(numeric_string);
  source.append(" * R_u_u, unsigned int row_n_u_u, \n"
                " unsigned int col_n_u_u, unsigned int col_n) \n"
                "{ \n"
                " for (unsigned int i = 0; i < col_n_u_u; ++i){ \n"
                " for (unsigned int j = 0; j < row_n_u_u; ++j){ \n"
                " gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n"
                " } \n"
                " } \n"
                "} \n");

  // assemble_r_block: calls the three helpers above on the same target block gR.
  source.append("void assemble_r_block(__global ");
  source.append(numeric_string);
  source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global ");
  source.append(numeric_string);
  source.append(" * R, unsigned int row_n, \n"
                " unsigned int col_n, __global ");
  source.append(numeric_string);
  source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global ");
  source.append(numeric_string);
  source.append(" * R_u_u, \n"
                " unsigned int row_n_u_u, unsigned int col_n_u_u){ \n"
                " assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n"
                " assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n"
                " assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n"
                "} \n");

  // Kernel block_r_assembly: loops over block indices and calls assemble_r_block for
  // every block with nonzero dimensions whose g_is_update entry is positive.
  source.append("__kernel void block_r_assembly( \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * R, \n"
                " __global unsigned int * block_ind, \n"
                " __global unsigned int * matrix_dimensions, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * R_u, \n"
                " __global unsigned int * block_ind_u, \n"
                " __global unsigned int * matrix_dimensions_u, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * R_u_u, \n"
                " __global unsigned int * block_ind_u_u, \n"
                " __global unsigned int * matrix_dimensions_u_u, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" * g_R, \n"
                " __global unsigned int * block_ind_r, \n"
                " __global unsigned int * matrix_dimensions_r, \n"
                " __global unsigned int * g_is_update, \n"
                " unsigned int block_elems_num) \n"
                "{ \n"
                " for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"
                " if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"
                " assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n"
                " matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n"
                " R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n"
                " } \n"
                " } \n"
                "} \n");
}
564 
566 
567 // main kernel class
569 template<typename NumericT>
570 struct spai
571 {
572  static std::string program_name()
573  {
575  }
576 
577  static void init(viennacl::ocl::context & ctx)
578  {
579  static std::map<cl_context, bool> init_done;
580  if (!init_done[ctx.handle().get()])
581  {
583  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
584 
585  std::string source;
586  source.reserve(1024);
587 
588  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
589 
590  generate_spai_assemble_blocks(source, numeric_string);
591  generate_spai_block_bv_assembly(source, numeric_string);
592  generate_spai_block_least_squares(source, numeric_string);
593  generate_spai_block_q_mult(source, numeric_string);
594  generate_spai_block_qr(source, numeric_string);
595  generate_spai_block_qr_assembly(source, numeric_string);
596  generate_spai_block_qr_assembly_1(source, numeric_string);
597  generate_spai_block_r_assembly(source, numeric_string);
598 
599  std::string prog_name = program_name();
600  #ifdef VIENNACL_BUILD_INFO
601  std::cout << "Creating program " << prog_name << std::endl;
602  #endif
603  ctx.add_program(source, prog_name);
604  init_done[ctx.handle().get()] = true;
605  } //if
606  } //init
607 };
608 
609 } // namespace kernels
610 } // namespace opencl
611 } // namespace linalg
612 } // namespace viennacl
613 #endif
614 
Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners...
Definition: spai.hpp:570
Implements an OpenCL platform within ViennaCL.
Various little tools used here and there in ViennaCL.
void generate_spai_block_r_assembly(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:494
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:54
void generate_spai_block_qr_assembly(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:397
Provides OpenCL-related utilities.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:613
void generate_spai_block_q_mult(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:187
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:189
void generate_spai_assemble_blocks(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:23
static std::string program_name()
Definition: spai.hpp:572
void generate_spai_block_qr(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:263
void generate_spai_block_least_squares(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:121
Representation of an OpenCL kernel in ViennaCL.
void generate_spai_block_qr_assembly_1(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:454
static void init(viennacl::ocl::context &ctx)
Definition: spai.hpp:577
void generate_spai_block_bv_assembly(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:84
Helper class for converting a type to its string representation.
Definition: utils.hpp:57