ViennaCL - The Vienna Computing Library  1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
matrix.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
3 
8 #include "viennacl/ocl/utils.hpp"
9 
15 
18 namespace viennacl
19 {
20 namespace linalg
21 {
22 namespace opencl
23 {
24 namespace kernels
25 {
26 
28 
// NOTE(review): the declaration line of this enumeration and all enumerators except the
// first one are missing from this listing (lost in extraction) - consult the original header.
31 {
32  VIENNACL_AMBM_NONE = 0, // matrix does not exist/contribute
35 };
36 
// NOTE(review): the declaration line of this aggregate and all members except 'assign_op'
// are missing from this listing (lost in extraction) - consult the original header.
39 {
41 
// Presumably the textual form of the assignment operator used in generated code - confirm against the original header.
44  std::string assign_op;
47 };
48 
49 
50 
51 
/** @brief Appends the OpenCL source of the FFT kernels for batched complex data to 'source'.
 *
 * The generated code consists of, in this order:
 *  - fft_direct:       naive Fourier transform (quadratic complexity, use for reference only)
 *  - fft_radix2:       a single radix-2 butterfly stage 's', operating directly on global memory
 *  - get_reorder_num:  device helper reversing the lowest 'bit_size' bits of an index
 *  - fft_radix2_local: bit-reversal plus all radix-2 stages of one batch per work group, in local memory
 *  - get_reorder_num_2 and fft_reorder: in-place bit-reversal reordering in global memory
 *
 * Complex numbers are stored as the two-component vector type '<numeric_string>2'.
 *
 * Differences to the previous revision of this generator:
 *  - fft_radix2_local ends each batch iteration with a local barrier. Without it a work item could
 *    start filling lcl_input for the next batch while other work items of the same group were
 *    still copying the current results back to global memory.
 *  - fft_reorder takes 'batch_num' as unsigned int, like the other three kernels (same argument size).
 *
 * @param source          String-like object providing append(); receives the generated code
 * @param numeric_string  Name of the OpenCL scalar type to generate for (e.g. "float" or "double")
 * @param is_row_major    If true, element n of batch b is addressed as [b * stride + n], otherwise as [n * stride + b]
 */
template<typename StringT>
void generate_fft(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  //
  // fft_direct: naive fourier transform (quadratic complexity, use for reference only)
  //
  source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n");
  source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int stride, \n");
  source.append(" unsigned int batch_num, \n");
  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
  source.append(" \n");
  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
  source.append(" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append("2 f = 0.0f; \n");
  source.append(" \n");
  source.append(" for (unsigned int n = 0; n < size; n++) { \n");
  source.append(" "); source.append(numeric_string); source.append("2 in = ");
  if (is_row_major)
  {
    source.append("input[batch_id * stride + n]; \n"); //input index here
  }
  else
  {
    source.append("input[n * stride + batch_id]; \n"); //input index here
  }
  source.append(" \n");
  source.append(" "); source.append(numeric_string); source.append(" sn, cs; \n");
  source.append(" "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n");
  source.append(" sn = sincos(arg, &cs); \n");
  source.append(" \n");
  // complex multiplication in * ex, accumulated in f
  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
  source.append(" f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
  source.append(" } \n");
  source.append(" \n");
  if (is_row_major)
  {
    source.append(" output[batch_id * stride + k] = f; \n"); // output index here
  }
  else
  {
    source.append(" output[k * stride + batch_id] = f; \n"); // output index here
  }
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append(" \n");

  //
  // fft_radix2: one butterfly stage 's' on global memory
  //
  source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n");
  source.append(" unsigned int s, \n");
  source.append(" unsigned int bit_size, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int stride, \n");
  source.append(" unsigned int batch_num, \n");
  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
  source.append(" \n");
  source.append(" unsigned int ss = 1 << s; \n");
  source.append(" unsigned int half_size = size >> 1; \n");
  source.append(" \n");
  source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n");
  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
  source.append(" \n");
  source.append(" unsigned int glb_id = get_global_id(0); \n");
  source.append(" unsigned int glb_sz = get_global_size(0); \n");

  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
  source.append(" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
  source.append(" unsigned int group = (tid & (ss - 1)); \n");
  source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");

  if (is_row_major)
  {
    source.append(" unsigned int offset = batch_id * stride + pos; \n");
    source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
    source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index
  }
  else
  {
    source.append(" unsigned int offset = pos * stride + batch_id; \n");
    source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
    source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index
  }

  source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");

  source.append(" sn = sincos(arg, &cs); \n");

  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");

  source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");

  if (is_row_major)
  {
    source.append(" input[offset + ss] = in1 - tmp; \n");//index
  }
  else
  {
    source.append(" input[offset + ss * stride] = in1 - tmp; \n");//index
  }
  source.append(" input[offset] = in1 + tmp; \n");//index
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append(" \n");

  //
  // get_reorder_num: reverses all 32 bits of v, then keeps the lowest 'bit_size' bits of the result
  //
  source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
  source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
  source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
  source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
  source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
  source.append(" v = (v >> 16) | (v << 16); \n");
  source.append(" \n");
  source.append(" v = v >> (32 - bit_size); \n");
  source.append(" \n");
  source.append(" return v; \n");
  source.append(" } \n");

  //
  // fft_radix2_local: one work group transforms one batch at a time inside local memory
  //
  source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n");
  source.append(" __local "); source.append(numeric_string); source.append("2* lcl_input, \n");
  source.append(" unsigned int bit_size, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int stride, \n");
  source.append(" unsigned int batch_num, \n");
  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");

  source.append(" unsigned int grp_id = get_group_id(0); \n");
  source.append(" unsigned int grp_num = get_num_groups(0); \n");

  source.append(" unsigned int lcl_sz = get_local_size(0); \n");
  source.append(" unsigned int lcl_id = get_local_id(0); \n");
  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");

  source.append(" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
  // copy chunk of global memory to local (in bit-reversed order)
  source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
  source.append(" unsigned int v = get_reorder_num(p, bit_size); \n");
  if (is_row_major)
  {
    source.append(" lcl_input[v] = input[batch_id * stride + p]; \n"); //index
  }
  else
  {
    source.append(" lcl_input[v] = input[p * stride + batch_id]; \n"); //index
  }
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // performs Cooley-Tukey FFT on local array
  // NOTE(review): 'tid' runs up to 'size' here, whereas fft_radix2 stops at size/2. If size == 2^bit_size,
  // pos + ss can reach 2*size - 1 - confirm that the host provides a local buffer of at least 2*size elements.
  source.append(" for (unsigned int s = 0; s < bit_size; s++) { \n");
  source.append(" unsigned int ss = 1 << s; \n");

  source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n");

  source.append(" for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
  source.append(" unsigned int group = (tid & (ss - 1)); \n");
  source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");

  source.append(" "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n");
  source.append(" "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n");

  source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");

  source.append(" sn = sincos(arg, &cs); \n");
  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");

  source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");

  source.append(" lcl_input[pos + ss] = in1 - tmp; \n");
  source.append(" lcl_input[pos] = in1 + tmp; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");

  // copy local array back to global memory
  source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
  if (is_row_major)
  {
    source.append(" input[batch_id * stride + p] = lcl_input[p]; \n");//index
  }
  else
  {
    source.append(" input[p * stride + batch_id] = lcl_input[p]; \n");//index
  }
  source.append(" } \n");
  // lcl_input is reused by the next batch iteration: wait until every work item has written its results back
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" \n");

  //
  // fft_reorder: performs reordering of input data in bit-reversal order.
  // Probably it's better to do this on the host side.
  //
  source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
  source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
  source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
  source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
  source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
  source.append(" v = (v >> 16) | (v << 16); \n");

  source.append(" v = v >> (32 - bit_size); \n");

  source.append(" return v; \n");
  source.append("} \n");

  source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n");
  source.append(" unsigned int bit_size, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int stride, \n");
  source.append(" unsigned int batch_num) { \n");

  source.append(" unsigned int glb_id = get_global_id(0); \n");
  source.append(" unsigned int glb_sz = get_global_size(0); \n");

  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
  source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
  source.append(" unsigned int v = get_reorder_num_2(i, bit_size); \n");

  // swap each pair (i, v) only once
  source.append(" if (i < v) {\n");
  if (is_row_major)
  {
    source.append(" "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index
    source.append(" input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index
    source.append(" input[batch_id * stride + v] = tmp; \n"); //index
  }
  else
  {
    source.append(" "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index
    source.append(" input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index
    source.append(" input[v * stride + batch_id] = tmp; \n"); //index
  }
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
}
273 
/** @brief Appends the OpenCL kernel 'lu_factorize' (in-place LU factorization without pivoting) to 'source'.
 *
 * For each row i and each k < i the generated kernel lets work item 0 divide entry (i,k) by the
 * diagonal entry (k,k), synchronizes with a barrier, and then updates the remaining entries (i,j),
 * j > k, of row i in parallel by subtracting temp * (k,j).
 *
 * The parallel update runs over the columns of the matrix, so the loop is bounded by 'matrix_cols'
 * for both storage layouts. (The row-major variant used to be bounded by 'matrix_rows', which only
 * coincides with the column count for square matrices.)
 *
 * @param source          String-like object providing append(); receives the generated code
 * @param numeric_string  Name of the OpenCL scalar type to generate for (e.g. "float" or "double")
 * @param is_row_major    If true, entry (i,j) is addressed as [i * matrix_internal_cols + j], otherwise as [i + j * matrix_internal_rows]
 */
template<typename StringT>
void generate_lu(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  source.append("__kernel void lu_factorize( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * matrix, \n");
  source.append(" unsigned int matrix_rows, \n");
  source.append(" unsigned int matrix_cols, \n");
  source.append(" unsigned int matrix_internal_rows, \n");
  source.append(" unsigned int matrix_internal_cols) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" temp; \n");

  if (is_row_major)
  {
    source.append(" unsigned rowi; \n");
    source.append(" unsigned rowk; \n");
    source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
    source.append(" { \n");
    source.append(" rowi = i * matrix_internal_cols; \n");
    source.append(" for (unsigned int k=0; k<i; ++k) \n");
    source.append(" { \n");
    source.append(" rowk = k * matrix_internal_cols; \n");
    // a single work item computes the multiplier, all others pick it up after the barrier
    source.append(" if (get_global_id(0) == 0) \n");
    source.append(" matrix[rowi + k] /= matrix[rowk + k]; \n");

    source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
    source.append(" temp = matrix[rowi + k]; \n");

    //parallel subtraction (j is a column index, hence bounded by the number of columns):
    source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
    source.append(" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
  }
  else
  {
    source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
    source.append(" { \n");
    source.append(" for (unsigned int k=0; k<i; ++k) \n");
    source.append(" { \n");

    // a single work item computes the multiplier, all others pick it up after the barrier
    source.append(" if (get_global_id(0) == 0) \n");
    source.append(" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");

    source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
    source.append(" temp = matrix[i + k*matrix_internal_rows]; \n");

    //parallel subtraction:
    source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
    source.append(" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
  }
  // close the k-loop, the i-loop and the kernel body
  source.append(" }");
  source.append(" }");
  source.append("}");
}
327 
328 
/** @brief Appends the OpenCL kernel 'scaled_rank1_update_cpu' or 'scaled_rank1_update_gpu' to 'source'.
 *
 * The generated kernel performs A(row, col) += tmp * vec2(col) with tmp = vec1(row) * alpha, or
 * tmp = vec1(row) / alpha if bit 1 of 'options2' is set. alpha is negated if bit 0 of 'options2' is set.
 *
 * @param source          String-like object providing append(); receives the generated code
 * @param numeric_string  Name of the OpenCL scalar type to generate for (e.g. "float" or "double")
 * @param is_row_major    Selects the row-major or column-major addressing of A
 * @param alpha_on_cpu    If true, the scalar is a by-value kernel argument ('_cpu' kernel); otherwise it is read from a __global buffer ('_gpu' kernel)
 */
template<typename StringT>
void generate_scaled_rank1_update(StringT & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
{
  // kernel name carries the location of the scalar
  source.append("__kernel void scaled_rank1_update_");
  if (alpha_on_cpu)
    source.append("cpu");
  else
    source.append("gpu");
  source.append("( \n");

  // matrix argument and its layout description
  source.append(" __global ");
  source.append(numeric_string);
  source.append(" * A, \n");
  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");

  // scalar argument: plain value or pointer into global memory
  if (alpha_on_cpu)
  {
    source.append(" ");
    source.append(numeric_string);
    source.append(" val, \n");
  }
  else
  {
    source.append(" __global const ");
    source.append(numeric_string);
    source.append(" *val, \n");
  }
  source.append(" unsigned int options2, \n");

  // first vector
  source.append(" __global const ");
  source.append(numeric_string);
  source.append(" * vec1, \n");
  source.append(" unsigned int start1, \n");
  source.append(" unsigned int inc1, \n");
  source.append(" unsigned int size1, \n");

  // second vector
  source.append(" __global const ");
  source.append(numeric_string);
  source.append(" * vec2, \n");
  source.append(" unsigned int start2, \n");
  source.append(" unsigned int inc2, \n");
  source.append(" unsigned int size2) \n");
  source.append("{ \n");

  // fetch alpha and apply the sign flip encoded in bit 0 of options2
  source.append(" ");
  source.append(numeric_string);
  if (alpha_on_cpu)
    source.append(" alpha = val; \n");
  else
    source.append(" alpha = val[0]; \n");
  source.append(" if (options2 & (1 << 0)) \n");
  source.append(" alpha = -alpha; \n");

  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");

  // outer loop over rows, inner loop over columns
  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
  source.append(" { \n");
  source.append(" ");
  source.append(numeric_string);
  source.append(" tmp = vec1[row * inc1 + start1];");
  source.append(" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
  if (is_row_major)
  {
    source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
  }
  else
  {
    source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
  }
  source.append(" } \n");
  source.append("} \n");
}
380 
/** @brief Appends the OpenCL kernel 'triangular_substitute_inplace' to 'source'.
 *
 * The generated kernel processes the rows of A one after another (A is required to be square).
 * Per row, v(row) is divided by the diagonal entry unless bit 0 of 'options' (unit diagonal) is set;
 * afterwards temp * A(...) is subtracted from the remaining entries of v in parallel.
 * Bit 1 of 'options' selects transposed access to A, bit 2 selects a lower solve
 * (rows are visited top-down) instead of visiting them bottom-up.
 *
 * @param source          String-like object providing append(); receives the generated code
 * @param numeric_string  Name of the OpenCL scalar type to generate for (e.g. "float" or "double")
 * @param is_row_major    Selects the row-major or column-major addressing of A
 */
template<typename StringT>
void generate_triangular_substitute_inplace(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  // The three statements that depend on the storage layout are chosen up front.
  char const * divide_by_diagonal;
  char const * eliminate_transposed;
  char const * eliminate_plain;
  if (is_row_major)
  {
    divide_by_diagonal   = " v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n";
    eliminate_transposed = " v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n";
    eliminate_plain      = " : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n";
  }
  else
  {
    divide_by_diagonal   = " v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n";
    eliminate_transposed = " v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n";
    eliminate_plain      = " : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n";
  }

  // kernel signature
  source.append("__kernel void triangular_substitute_inplace( \n");
  source.append(" __global ");
  source.append(numeric_string);
  source.append(" * A, \n");
  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
  source.append(" __global ");
  source.append(numeric_string);
  source.append(" * v, \n");
  source.append(" unsigned int v_start, \n");
  source.append(" unsigned int v_inc, \n");
  source.append(" unsigned int v_size, \n");
  source.append(" unsigned int options) \n");
  source.append("{ \n");

  // decode the option bits
  source.append(" ");
  source.append(numeric_string);
  source.append(" temp; \n");
  source.append(" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
  source.append(" unsigned int transposed_access_A = (options & (1 << 1)); \n");
  source.append(" unsigned int is_lower_solve = (options & (1 << 2)); \n");
  source.append(" unsigned int row; \n");

  // Note: A required to be square
  source.append(" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n");
  source.append(" { \n");
  source.append(" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");

  // division by the diagonal entry, carried out by a single work item
  source.append(" if (!unit_diagonal_flag) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" if (get_global_id(0) == 0) \n");
  source.append(divide_by_diagonal);
  source.append(" } \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

  source.append(" temp = v[row * v_inc + v_start]; \n");

  // parallel elimination of the entry just computed from the remaining right hand side entries
  source.append(" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
  source.append(" elim < (is_lower_solve ? A_size1 : row); \n");
  source.append(" elim += get_global_size(0)) \n");
  source.append(eliminate_transposed);
  source.append(eliminate_plain);

  source.append(" } \n");
  source.append("} \n");
}
434 
/** @brief Appends the OpenCL kernel 'trans_kernel' (out-of-place transposition, B = A^T) to 'source'.
 *
 * The generated kernel enumerates the A_internal_size1 * A_internal_size2 linear indices of A in chunks
 * of get_local_size(0) consecutive indices: work group g handles chunks g, g + get_num_groups(0), ...
 * Each index is split into (row, col); entries inside the A_size1 x A_size2 range are copied to B.
 *
 * The chunk loop runs over ceil(size / get_local_size(0)) chunks. The former bound
 * size / get_num_groups(0) covered all entries only if the number of work groups equalled the
 * work group size and 'size' was a multiple of it.
 *
 * NOTE(review): the target position pairs B_start2/B_stride2 with 'col' and B_start1/B_stride1 with 'row';
 * confirm against the host-side kernel launch that this is the intended order for sub-matrix targets.
 *
 * @param source          String-like object providing append(); receives the generated code
 * @param numeric_string  Name of the OpenCL scalar type to generate for (e.g. "float" or "double")
 * @param is_row_major    Selects the row-major or column-major addressing of A and B
 */
template <typename StringT>
void generate_trans_kernel(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  source.append("__kernel void trans_kernel(\n");
  source.append(" __global const ");source.append(numeric_string);source.append(" * A, \n");
  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
  source.append(" unsigned int A_stride1, unsigned int A_stride2, \n");
  source.append(" __global ");source.append(numeric_string);source.append(" * B, \n");
  source.append(" unsigned int B_start1, unsigned int B_start2, \n");
  source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
  source.append(" unsigned int B_stride1, unsigned int B_stride2) \n");
  source.append("{ \n");
  source.append(" unsigned int size = A_internal_size2*A_internal_size1; \n");
  // one chunk = get_local_size(0) linear indices; the last chunk may be partial and is filtered by the range check below
  source.append(" for(unsigned int i = get_group_id(0); i < (size + get_local_size(0) - 1)/get_local_size(0); i += get_num_groups(0))\n");
  source.append(" { \n");
  source.append(" unsigned int matrix_index = i*get_local_size(0)+get_local_id(0); \n");
  source.append(" unsigned int row = matrix_index / A_internal_size2; \n");
  source.append(" unsigned int col = matrix_index % A_internal_size2; \n");
  // skips padding entries as well as indices beyond 'size'
  source.append(" if (row < A_size1 && col < A_size2) \n");
  source.append(" { \n");

  if (is_row_major)
  {
    source.append(" unsigned int pos = (A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col); \n");
    source.append(" unsigned int new_pos = (B_start2 + B_stride2 * col) * B_internal_size2 + (B_start1 + B_stride1 * row); \n");
    source.append(" B[new_pos] = A[pos]; \n");
  }
  else
  {
    source.append(" unsigned int pos = (A_start1 + A_stride1 * row) + A_internal_size1 * (A_start2 + A_stride2 * col); \n");
    source.append(" unsigned int new_pos = (B_start2 + B_stride2 * col) + B_internal_size1 * (B_start1 + B_stride1 * row); \n");
    source.append(" B[new_pos] = A[pos]; \n");
  }
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
}
474 
475 namespace detail
476 {
477  inline std::string type_to_string(viennacl::row_major) { return "row"; }
478  inline std::string type_to_string(viennacl::column_major) { return "col"; }
479 }
480 
482 
// Provider of the OpenCL kernels for dense-matrix operations on element type NumericT.
// NOTE(review): this listing lost several lines during extraction, among them the signature of the
// public member function below and the declarations of A, B, C, M, x, sx, da and db used in its body.
// Consult the original header before editing this class.
484 template<typename NumericT>
485 class matrix
486 {
487 private:
488 
 // Registers the kernel variants of one "x ASSIGN_OP a*y [+ b*z]" family with 'handler'.
 // Each variant is named prefix + four digits. A digit is '1' exactly when the corresponding boolean
 // argument of scheduler::preset::avbv is true: the first two digits map to the two booleans following 'a',
 // the last two to the two booleans following 'b'.
 // The twelve variants with a set flag for 'b' are registered only when 'b' is a non-null pointer.
 // NOTE(review): the meaning of the four flags is defined by scheduler::preset::avbv, which is not visible here.
489  template<typename ScalarT1, typename ScalarT2>
490  static void generate_ambm_impl2(device_specific::execution_handler & handler, std::string const & prefix, device_specific::matrix_axpy_template::parameters_type const & parameters, scheduler::operation_node_type ASSIGN_OP,
491  viennacl::matrix_base<NumericT> const * x, viennacl::matrix_base<NumericT> const * y, ScalarT1 const * a,
492  viennacl::matrix_base<NumericT> const * z, ScalarT2 const * b)
493  {
494  namespace ds = viennacl::device_specific;
495 
 // variants with both flags of 'b' cleared (always registered)
496  handler.add(prefix + "0000", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, false, false));
497  handler.add(prefix + "1000", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, false, false));
498  handler.add(prefix + "0100", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, false, false));
499  handler.add(prefix + "1100", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, false, false));
 // variants with at least one flag of 'b' set only make sense if a second scalar is present
500  if (b)
501  {
502  handler.add(prefix + "0010", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, true, false));
503  handler.add(prefix + "1010", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, true, false));
504  handler.add(prefix + "0110", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, true, false));
505  handler.add(prefix + "1110", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, true, false));
506 
507  handler.add(prefix + "0001", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, false, true));
508  handler.add(prefix + "1001", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, false, true));
509  handler.add(prefix + "0101", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, false, true));
510  handler.add(prefix + "1101", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, false, true));
511 
512  handler.add(prefix + "0011", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, true, true));
513  handler.add(prefix + "1011", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, true, true));
514  handler.add(prefix + "0111", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, true, true));
515  handler.add(prefix + "1111", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, true, true));
516  }
517  }
518 
 // Registers every host/device scalar combination for one assignment operation by forwarding to
 // generate_ambm_impl2 with an extended prefix:
 //   "hm_"  / "dm_"                         - single scaled operand, scalar taken from ha / da (z and b are null)
 //   "hmhm_", "dmhm_", "hmdm_", "dmdm_"     - two scaled operands, scalars taken from (ha|da) and (hb|db)
 // ha/hb are plain ScalarT pointers, da/db are viennacl::scalar<ScalarT> pointers.
519  template<typename ScalarT>
520  static void generate_ambm_impl(device_specific::execution_handler & handler, std::string const & prefix, device_specific::matrix_axpy_template::parameters_type const & parameters, scheduler::operation_node_type ASSIGN_OP,
521  viennacl::matrix_base<NumericT> const * x, viennacl::matrix_base<NumericT> const * y, ScalarT const * ha, viennacl::scalar<ScalarT> const * da,
522  viennacl::matrix_base<NumericT> const * z, ScalarT const * hb, viennacl::scalar<ScalarT> const * db)
523  {
524  //x ASSIGN_OP a*y
525  generate_ambm_impl2(handler, prefix + "hm_", parameters, ASSIGN_OP, x, y, ha, (viennacl::matrix_base<NumericT>*)NULL, (NumericT*)NULL);
526  generate_ambm_impl2(handler, prefix + "dm_", parameters, ASSIGN_OP, x, y, da, (viennacl::matrix_base<NumericT>*)NULL, (NumericT*)NULL);
527 
528  //x ASSIGN_OP a*y + b*z
529  generate_ambm_impl2(handler, prefix + "hmhm_", parameters, ASSIGN_OP, x, y, ha, z, hb);
530  generate_ambm_impl2(handler, prefix + "dmhm_", parameters, ASSIGN_OP, x, y, da, z, hb);
531  generate_ambm_impl2(handler, prefix + "hmdm_", parameters, ASSIGN_OP, x, y, ha, z, db);
532  generate_ambm_impl2(handler, prefix + "dmdm_", parameters, ASSIGN_OP, x, y, da, z, db);
533  }
534 
535 
536 public:
 // NOTE(review): the signature of this member function was lost in extraction; its body uses 'ctx' and 'is_row_major'.
 // Returns the execution_handler stored in a function-local static map under the key
 // (is_row_major, OpenCL context handle). On the first request for a key the handler is created
 // and populated with the kernels registered below.
538  {
539  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
540  cl_context h = ctx.handle().get();
541  std::pair<bool, cl_context> key(is_row_major, h);
542  if (handlers_map.find(key) == handlers_map.end())
543  {
545 
546  namespace ds = viennacl::device_specific;
547  viennacl::ocl::device const & device = ctx.current_device();
 // program name: <numeric type name> followed by "matrix_row" or "matrix_col"
548  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"matrix_row":"matrix_col");
549  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
550  ds::execution_handler & handler = handlers_map.at(key);
551 
552  ds::matrix_axpy_template::parameters_type matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
553  ds::vector_axpy_template::parameters_type vector_axpy_params = ds::builtin_database::vector_axpy_params<NumericT>(device);
554 
 // NOTE(review): the bodies of both branches below are missing from this listing.
556  if (is_row_major)
557  {
561  }
562  else
563  {
567  }
568 
 // Placeholder operands for building the statements; ha and hb are left uninitialized and only their addresses are passed on.
578  NumericT ha;
579  NumericT hb;
580  int hi = 0;
581  unsigned int hui = 0;
582 
583  // fully parametrized kernels:
584  generate_ambm_impl(handler, "assign_", matrix_axpy_params, scheduler::OPERATION_BINARY_ASSIGN_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
585  generate_ambm_impl(handler, "ip_add_", matrix_axpy_params, scheduler::OPERATION_BINARY_INPLACE_ADD_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
586 
 // individually named kernels
587  handler.add("assign_cpu", ds::matrix_axpy_template(matrix_axpy_params), scheduler::preset::assign_cpu(&A, &M));
588  handler.add("matrix_diag_from_vector", ds::matrix_axpy_template(matrix_axpy_params), scheduler::preset::matrix_diag_from_vector(&x, &A, hi));
589  handler.add("matrix_row", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_row(&x, &A, hui));
590  handler.add("matrix_column", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_column(&x, &A, hui));
591  handler.add("matrix_diag_to_vector", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_diag_to_vector(&x, &A, hi));
592  handler.add("diagonal_assign_cpu", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::diagonal_assign_cpu(&A, &sx));
593  }
594  return handlers_map.at(key);
595  }
596 };
597 
598 // main kernel class
// Provider of the element-wise dense-matrix kernels for element type NumericT.
// NOTE(review): the class declaration line, the signature of the public member function and the
// bodies of several branches are missing from this listing (lost in extraction) - consult the original header.
600 template<typename NumericT>
602 {
603 
604 public:
 // NOTE(review): signature lost in extraction; the body uses 'ctx' and 'is_row_major'.
 // Returns the execution_handler stored in a function-local static map under the key
 // (is_row_major, OpenCL context handle), creating and populating it on the first request.
606  {
607  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
608  cl_context h = ctx.handle().get();
609  std::pair<bool, cl_context> key(is_row_major, h);
610  if (handlers_map.find(key) == handlers_map.end())
611  {
613 
614  namespace ds = viennacl::device_specific;
615  using namespace scheduler;
617 
618  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
619  viennacl::ocl::device const & device = ctx.current_device();
 // program name: <numeric type name> followed by "matrix_element_row" or "matrix_element_col"
620  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"matrix_element_row":"matrix_element_col");
621  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
622  ds::execution_handler & handler = handlers_map.at(key);
623  ds::matrix_axpy_template::parameters_type matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
624 
 // NOTE(review): the bodies of both branches below are missing from this listing.
626  if (is_row_major)
627  {
631  }
632  else
633  {
637  }
638 
642 
643 
644  // unary operations
 // The macro registers one kernel named operator_string(OPTYPE) built from scheduler::preset::unary_element_op(&A, &B, OPTYPE).
645 #define VIENNACL_ADD_UNARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::unary_element_op(&A, &B, OPTYPE))
 // NOTE(review): the macro invocations inside both branches are missing from this listing.
646  if (numeric_string == "float" || numeric_string == "double")
647  {
664  }
665  else
666  {
668  }
669 #undef VIENNACL_ADD_UNARY
670 
671  // binary operations
 // The macro registers one kernel named operator_string(OPTYPE) built from scheduler::preset::binary_element_op(&A, &B, &C, OPTYPE).
672 #define VIENNACL_ADD_BINARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::binary_element_op(&A, &B, &C, OPTYPE))
 // NOTE(review): the macro invocations (inside and presumably before this branch) are missing from this listing.
675  if (numeric_string == "float" || numeric_string == "double")
676  {
678  }
679 #undef VIENNACL_ADD_BINARY
680 
681  }
682  return handlers_map.at(key);
683  }
684 };
685 
686 
// Provider of the matrix-vector product kernels ("mat_vec_T" and "mat_vec_N") for element type NumericT.
// NOTE(review): the class declaration line, the signature of the public member function and the
// declarations of A, x and y are missing from this listing (lost in extraction) - consult the original header.
688 template<typename NumericT>
690 {
691 public:
 // NOTE(review): signature lost in extraction; the body uses 'ctx'.
 // Returns the execution_handler stored in a function-local static map under the OpenCL context handle,
 // creating and populating it on the first request.
693  {
694  static std::map<cl_context, device_specific::execution_handler> handlers_map;
695  cl_context key = ctx.handle().get();
696  if (handlers_map.find(key) == handlers_map.end())
697  {
699 
700  namespace ds = viennacl::device_specific;
701  viennacl::ocl::device const & device = ctx.current_device();
 // program name: <numeric type name> followed by "_matrix_row_wise"
702  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_row_wise";
703  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
704  ds::execution_handler & handler = handlers_map.at(key);
705 
 // One kernel per value ('T' / 'N') of the flag handed to row_wise_reduction_template; mat_vec_prod receives true for 'T' and false for 'N'.
709  handler.add("mat_vec_T", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device, 'T'), 'T'), scheduler::preset::mat_vec_prod(&A, true, &x, &y));
710  handler.add("mat_vec_N", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device, 'N'), 'N'), scheduler::preset::mat_vec_prod(&A, false, &x, &y));
711 
712  }
713  return handlers_map.at(key);
714  }
715 };
716 
// Provider of execution handlers for the "<type>_matrix_prod_row" / "<type>_matrix_prod_col"
// programs, templated on the scalar type NumericT.
// NOTE(review): this listing skips line 719 (the class declaration), line 722 (the member
// signature; the page index below gives it as
// `static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)`),
// lines 744/746 (the bodies of the if/else on is_row_major) and lines 749-751 (presumably the
// declarations of A, B and C) - confirm against the real header.
718 template<typename NumericT>
720 {
721 public:
723  {
     // Handlers are stored in a function-local static map keyed by (is_row_major, cl_context),
     // so each layout/context combination gets its own entry that persists across calls.
     // NOTE(review): no locking is visible around this map - confirm callers serialize access.
724  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
725  cl_context h = ctx.handle().get();
726  std::pair<bool, cl_context> key(is_row_major, h);
     // Build the handler only the first time this key is seen.
727  if (handlers_map.find(key) == handlers_map.end())
728  {
730 
731  namespace ds = viennacl::device_specific;
732  viennacl::ocl::device const & device = ctx.current_device();
     // Program name = string form of NumericT plus a layout-dependent suffix.
733  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"_matrix_prod_row":"_matrix_prod_col");
     // Insert first, then take a reference to the element stored in the map so the add()
     // calls below modify that stored handler.
734  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
735  ds::execution_handler & handler = handlers_map.at(key);
736 
     // Built-in parameter sets for this device, one per combination of the two 'N'/'T' tags.
737  ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'N');
738  ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'N');
739  ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'T');
740  ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'T');
741 
     // Layout-dependent setup; the statements controlled by this if/else are not part of
     // this listing.
743  if (is_row_major)
745  else
747 
748  //Dummy types. The values don't matter for the kernel generation.
752  NumericT alpha = 1;
753  NumericT beta = 0;
754 
     // Register one entry per transposition combination. The suffix letters match the
     // A_trans / B_trans booleans handed to the preset ('N' -> false, 'T' -> true).
755  handler.add("prod_NN", ds::matrix_product_template(matrix_product_params_NN, 'N', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, false, beta, &C));
756  handler.add("prod_TN", ds::matrix_product_template(matrix_product_params_TN, 'T', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, false, beta, &C));
757  handler.add("prod_NT", ds::matrix_product_template(matrix_product_params_NT, 'N', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, true, beta, &C));
758  handler.add("prod_TT", ds::matrix_product_template(matrix_product_params_TT, 'T', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, true, beta, &C));
759 
760  }
     // Return the (possibly just created) handler stored for this layout/context pair.
761  return handlers_map.at(key);
762  }
763 };
764 
765 // main kernel class
// Kernel-generation struct templated on the scalar type NumericT and the storage layout tag
// LayoutT. It exposes program_name() and init(), which assembles the OpenCL source string and
// registers it with a context once per context.
// NOTE(review): this listing skips line 768 (the struct declaration), line 772 (the body of
// program_name()), line 780 and line 782 (presumably where is_row_major is derived from
// LayoutT) - confirm against the real header.
767 template<typename NumericT, typename LayoutT>
769 {
     // Returns the name under which init() registers the program (body not part of this listing).
770  static std::string program_name()
771  {
773  }
774 
     // Generates the kernel sources for NumericT/LayoutT and adds them to ctx as one program.
     // Work is done at most once per cl_context; later calls for the same context do nothing.
775  static void init(viennacl::ocl::context & ctx)
776  {
     // Per-context "already initialized" flags. std::map::operator[] value-initializes a
     // missing entry to false, so an unseen context falls into the branch below.
     // NOTE(review): no locking is visible around this map - confirm callers serialize access.
777  static std::map<cl_context, bool> init_done;
778  if (!init_done[ctx.handle().get()])
779  {
781  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
783 
     // Pre-size the buffer to limit reallocations while the generators append to it.
784  std::string source;
785  source.reserve(8192);
786 
     // Presumably emits the double-precision extension pragma when NumericT requires it -
     // TODO confirm in viennacl/ocl/utils.hpp.
787  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
788 
789  // kernels with mostly predetermined skeleton:
     // Generated twice: once with alpha_on_cpu = true and once with alpha_on_cpu = false.
790  generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
791  generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
792 
     // FFT, LU, in-place triangular substitution and transpose kernels are only generated
     // for the floating-point types "float" and "double".
793  if (numeric_string == "float" || numeric_string == "double")
794  {
795  generate_fft(source, numeric_string, is_row_major);
796  generate_lu(source, numeric_string, is_row_major);
797  generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
798  generate_trans_kernel(source, numeric_string, is_row_major);
799  }
800 
801  std::string prog_name = program_name();
     // Optional diagnostic output, compiled in only when VIENNACL_BUILD_INFO is defined.
802  #ifdef VIENNACL_BUILD_INFO
803  std::cout << "Creating program " << prog_name << std::endl;
804  #endif
     // Hand the assembled source to the context under prog_name, then mark this context as
     // done so the generation above is not repeated.
805  ctx.add_program(source, prog_name);
806  init_done[ctx.handle().get()] = true;
807  } //if
808  } //init
809 };
810 
811 } // namespace kernels
812 } // namespace opencl
813 } // namespace linalg
814 } // namespace viennacl
815 #endif
816 
viennacl::ocl::device const & current_device() const
Returns the current device.
Definition: context.hpp:111
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Definition: matrix.hpp:719
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:226
Implements an OpenCL platform within ViennaCL.
void generate_fft(StringT &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:53
void generate_triangular_substitute_inplace(StringT &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:382
#define VIENNACL_ADD_UNARY(OPTYPE)
statement matrix_diag_from_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
Definition: preset.hpp:346
Helper class for checking whether a matrix has a row-major layout.
Definition: forwards.h:483
matrix_axpy_template::parameters_type const & matrix_axpy_params(ocl::device const &device)
Various little tools used here and there in ViennaCL.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:54
Provides OpenCL-related utilities.
A class representing a compute device (e.g. a GPU)
Definition: device.hpp:49
void add(std::string const &key, template_base const &T, statements_container const &statements)
static device_specific::execution_handler & execution_handler(viennacl::ocl::context &ctx)
Definition: matrix.hpp:692
A dense matrix class.
Definition: forwards.h:374
scheduler::statement avbv(scheduler::operation_node_type ASSIGN_OP, NumericT const *x, NumericT const *y, ScalarT1 const *a, bool flip_a, bool reciprocal_a, NumericT const *z, ScalarT2 const *b, bool flip_b, bool reciprocal_b)
Definition: preset.hpp:16
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:613
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Definition: context.hpp:39
Main kernel class for generating OpenCL kernels for elementwise operations other than addition and su...
Definition: matrix.hpp:601
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
Definition: matrix.hpp:722
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:189
#define VIENNACL_ADD_BINARY(OPTYPE)
statement mat_vec_prod(viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::vector_base< NumericT > const *x, viennacl::vector_base< NumericT > const *y)
Definition: preset.hpp:393
vector_axpy_template::parameters_type const & vector_axpy_params(ocl::device const &device)
static void init(viennacl::ocl::context &ctx)
Definition: matrix.hpp:775
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
Definition: matrix.hpp:768
void generate_trans_kernel(StringT &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:436
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
Definition: matrix_def.hpp:93
A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependen...
Definition: shared_ptr.hpp:83
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Definition: matrix.hpp:689
Configuration struct for generating OpenCL kernels for linear combinations of matrices.
Definition: matrix.hpp:38
operation_node_type
Enumeration for identifying the possible operations.
Definition: forwards.h:68
void generate_lu(StringT &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:275
statement mat_mat_prod(NumericT alpha, viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::matrix_base< NumericT > const *B, bool B_trans, NumericT beta, viennacl::matrix_base< NumericT > const *C)
Definition: preset.hpp:399
statement matrix_diag_to_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
Definition: preset.hpp:340
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
Definition: matrix.hpp:537
void generate_scaled_rank1_update(StringT &source, std::string const &numeric_string, bool is_row_major, bool alpha_on_cpu)
Definition: matrix.hpp:330
Representation of an OpenCL kernel in ViennaCL.
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
Definition: vector_def.hpp:87
scheduler::statement diagonal_assign_cpu(matrix_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
Definition: preset.hpp:130
std::string type_to_string(viennacl::row_major)
Definition: matrix.hpp:477
statement matrix_row(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
Definition: preset.hpp:327
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
Definition: matrix.hpp:605
statement matrix_column(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
Definition: preset.hpp:333
A tag for column-major storage of a dense matrix.
Definition: forwards.h:320
ambm_scalar_type
Enumeration for the scalar type in ambm-like operations.
Definition: matrix.hpp:30
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Definition: matrix.hpp:485
scheduler::statement assign_cpu(vector_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
Definition: preset.hpp:106
const char * operator_string(scheduler::operation_node_type type)
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
A tag for row-major storage of a dense matrix.
Definition: forwards.h:303
Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc.