ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
matrix.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
3 
8 #include "viennacl/ocl/utils.hpp"
9 
15 
18 namespace viennacl
19 {
20 namespace linalg
21 {
22 namespace opencl
23 {
24 namespace kernels
25 {
26 
28 
31 {
32  VIENNACL_AMBM_NONE = 0, // matrix does not exist/contribute
35 };
36 
39 {
41 
44  std::string assign_op;
47 };
48 
49 
50 
51 
/** @brief Appends the OpenCL source of the batched FFT kernels to @p source.
*
* Emitted entities, in order: kernel fft_direct, kernel fft_radix2, helper get_reorder_num,
* kernel fft_radix2_local, helper get_reorder_num_2, kernel fft_reorder.
*
* @param source          String-like object the kernel source is appended to (only append() is used)
* @param numeric_string  Name of the scalar type spliced into the source; the complex type is formed by appending "2"
* @param is_row_major    If true, element i of a batch is addressed as [batch_id * stride + i], otherwise as [i * stride + batch_id]
*/
template<typename StringT>
void generate_fft(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  std::string const real_t = numeric_string;
  std::string const cplx_t = numeric_string + "2";

  // Fragments which occur in more than one kernel:
  std::string const sign_param = " " + real_t + " sign) { \n";
  std::string const decl_pi    = " const " + real_t + " NUM_PI = 3.14159265358979323846; \n";
  std::string const decl_arg   = " " + real_t + " arg = group * sign * NUM_PI / ss; \n";
  std::string const decl_ex    = " " + cplx_t + " ex = (" + cplx_t + ")(cs, sn); \n";
  std::string const decl_tmp   = " " + cplx_t + " tmp = (" + cplx_t + ")(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n";

  // Distance (in elements of 'input') between the two operands of a butterfly in global memory:
  std::string const partner_step = is_row_major ? "ss" : "ss * stride";

  // Bit-swapping sequence shared by both bit-reversal helper functions:
  std::string const bit_swaps =
    std::string(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n")
    + " v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n"
    + " v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n"
    + " v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n"
    + " v = (v >> 16) | (v << 16); \n";

  //
  // fft_direct: naive fourier transform (quadratic complexity, use for reference only)
  //
  source.append("__kernel void fft_direct(__global " + cplx_t + " *input, \n");
  source.append(" __global " + cplx_t + " *output, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int stride, \n");
  source.append(" unsigned int batch_num, \n");
  source.append(sign_param);
  source.append(decl_pi);
  source.append(" \n");
  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
  source.append(" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
  source.append(" " + cplx_t + " f = 0.0f; \n");
  source.append(" \n");
  source.append(" for (unsigned int n = 0; n < size; n++) { \n");
  source.append(" " + cplx_t + " in = ");
  source.append(is_row_major ? "input[batch_id * stride + n]; \n"
                             : "input[n * stride + batch_id]; \n"); // input index
  source.append(" \n");
  source.append(" " + real_t + " sn, cs; \n");
  source.append(" " + real_t + " arg = sign * 2 * NUM_PI * k / size * n; \n");
  source.append(" sn = sincos(arg, &cs); \n");
  source.append(" \n");
  source.append(decl_ex);
  source.append(" f = f + (" + cplx_t + ")(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
  source.append(" } \n");
  source.append(" \n");
  source.append(is_row_major ? " output[batch_id * stride + k] = f; \n"
                             : " output[k * stride + batch_id] = f; \n"); // output index
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append(" \n");

  //
  // fft_radix2: one butterfly stage 's', operating in-place on global memory
  //
  source.append("__kernel void fft_radix2(__global " + cplx_t + "* input, \n");
  source.append(" unsigned int s, \n");
  source.append(" unsigned int bit_size, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int stride, \n");
  source.append(" unsigned int batch_num, \n");
  source.append(sign_param);
  source.append(" \n");
  source.append(" unsigned int ss = 1 << s; \n");
  source.append(" unsigned int half_size = size >> 1; \n");
  source.append(" \n");
  source.append(" " + real_t + " cs, sn; \n");
  source.append(decl_pi);
  source.append(" \n");
  source.append(" unsigned int glb_id = get_global_id(0); \n");
  source.append(" unsigned int glb_sz = get_global_size(0); \n");

  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
  source.append(" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
  source.append(" unsigned int group = (tid & (ss - 1)); \n");
  source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");

  source.append(is_row_major ? " unsigned int offset = batch_id * stride + pos; \n"
                             : " unsigned int offset = pos * stride + batch_id; \n");
  source.append(" " + cplx_t + " in1 = input[offset]; \n");
  source.append(" " + cplx_t + " in2 = input[offset + " + partner_step + "]; \n");

  source.append(decl_arg);
  source.append(" sn = sincos(arg, &cs); \n");
  source.append(decl_ex);
  source.append(decl_tmp);

  source.append(" input[offset + " + partner_step + "] = in1 - tmp; \n");
  source.append(" input[offset] = in1 + tmp; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append(" \n");

  //
  // get_reorder_num: bit reversal of the lowest 'bit_size' bits (used by fft_radix2_local)
  //
  source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
  source.append(bit_swaps);
  source.append(" \n");
  source.append(" v = v >> (32 - bit_size); \n");
  source.append(" \n");
  source.append(" return v; \n");
  source.append(" } \n");

  //
  // fft_radix2_local: all butterfly stages on a copy of the batch held in local memory
  //
  source.append(" __kernel void fft_radix2_local(__global " + cplx_t + "* input, \n");
  source.append(" __local " + cplx_t + "* lcl_input, \n");
  source.append(" unsigned int bit_size, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int stride, \n");
  source.append(" unsigned int batch_num, \n");
  source.append(sign_param);

  source.append(" unsigned int grp_id = get_group_id(0); \n");
  source.append(" unsigned int grp_num = get_num_groups(0); \n");

  source.append(" unsigned int lcl_sz = get_local_size(0); \n");
  source.append(" unsigned int lcl_id = get_local_id(0); \n");
  source.append(decl_pi);

  source.append(" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");

  // copy chunk of global memory to local (in bit-reversed order)
  source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
  source.append(" unsigned int v = get_reorder_num(p, bit_size); \n");
  source.append(is_row_major ? " lcl_input[v] = input[batch_id * stride + p]; \n"
                             : " lcl_input[v] = input[p * stride + batch_id]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // performs Cooley-Tukey FFT on local array
  source.append(" for (unsigned int s = 0; s < bit_size; s++) { \n");
  source.append(" unsigned int ss = 1 << s; \n");

  source.append(" " + real_t + " cs, sn; \n");

  // Each butterfly handles the pair (pos, pos + ss), so only size/2 butterflies exist per stage
  // (same bound as 'half_size' in fft_radix2). Iterating up to 'size' would index lcl_input
  // beyond its first 'size' elements.
  source.append(" for (unsigned int tid = lcl_id; tid < (size >> 1); tid += lcl_sz) { \n");
  source.append(" unsigned int group = (tid & (ss - 1)); \n");
  source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");

  source.append(" " + cplx_t + " in1 = lcl_input[pos]; \n");
  source.append(" " + cplx_t + " in2 = lcl_input[pos + ss]; \n");

  source.append(decl_arg);
  source.append(" sn = sincos(arg, &cs); \n");
  source.append(decl_ex);
  source.append(decl_tmp);

  source.append(" lcl_input[pos + ss] = in1 - tmp; \n");
  source.append(" lcl_input[pos] = in1 + tmp; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");

  // copy local array back to global memory
  source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
  source.append(is_row_major ? " input[batch_id * stride + p] = lcl_input[p]; \n"
                             : " input[p * stride + batch_id] = lcl_input[p]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" \n");

  //
  // Performs reordering of input data in bit-reversal order.
  // Probably better done on the host side.
  //
  source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
  source.append(bit_swaps);
  source.append(" v = v >> (32 - bit_size); \n");
  source.append(" return v; \n");
  source.append("} \n");

  source.append("__kernel void fft_reorder(__global " + cplx_t + "* input, \n");
  source.append(" unsigned int bit_size, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int stride, \n");
  source.append(" int batch_num) { \n");

  source.append(" unsigned int glb_id = get_global_id(0); \n");
  source.append(" unsigned int glb_sz = get_global_size(0); \n");

  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
  source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
  source.append(" unsigned int v = get_reorder_num_2(i, bit_size); \n");

  // swap each pair (i, v) only once
  source.append(" if (i < v) {\n");
  if (is_row_major)
  {
    source.append(" " + cplx_t + " tmp = input[batch_id * stride + i]; \n");
    source.append(" input[batch_id * stride + i] = input[batch_id * stride + v]; \n");
    source.append(" input[batch_id * stride + v] = tmp; \n");
  }
  else
  {
    source.append(" " + cplx_t + " tmp = input[i * stride + batch_id]; \n");
    source.append(" input[i * stride + batch_id] = input[v * stride + batch_id]; \n");
    source.append(" input[v * stride + batch_id] = tmp; \n");
  }
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
}
273 
/** @brief Appends the OpenCL source of the in-place kernel lu_factorize to @p source.
*
* The emitted kernel has no pivoting: for every row i and every k < i, work item 0 divides
* entry (i,k) by the diagonal entry (k,k); after a barrier all work items subtract the scaled
* row k from row i for the columns j > k.
*
* @param source          String-like object the kernel source is appended to (only append() is used)
* @param numeric_string  Name of the scalar type spliced into the source
* @param is_row_major    If true, entry (i,j) is addressed as [i * matrix_internal_cols + j], otherwise as [i + j * matrix_internal_rows]
*/
template<typename StringT>
void generate_lu(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  source.append("__kernel void lu_factorize( \n");
  source.append(" __global " + numeric_string + " * matrix, \n");
  source.append(" unsigned int matrix_rows, \n");
  source.append(" unsigned int matrix_cols, \n");
  source.append(" unsigned int matrix_internal_rows, \n");
  source.append(" unsigned int matrix_internal_cols) \n");
  source.append("{ \n");
  source.append(" " + numeric_string + " temp; \n");

  if (is_row_major)
  {
    source.append(" unsigned rowi; \n");
    source.append(" unsigned rowk; \n");
    source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
    source.append(" { \n");
    source.append(" rowi = i * matrix_internal_cols; \n");
    source.append(" for (unsigned int k=0; k<i; ++k) \n");
    source.append(" { \n");
    source.append(" rowk = k * matrix_internal_cols; \n");
    source.append(" if (get_global_id(0) == 0) \n");
    source.append(" matrix[rowi + k] /= matrix[rowk + k]; \n");

    source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
    source.append(" temp = matrix[rowi + k]; \n");

    // parallel subtraction: j is a column index, hence it is bounded by the number of columns
    // (consistent with the column-major branch below)
    source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
    source.append(" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
  }
  else
  {
    source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
    source.append(" { \n");
    source.append(" for (unsigned int k=0; k<i; ++k) \n");
    source.append(" { \n");

    source.append(" if (get_global_id(0) == 0) \n");
    source.append(" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");

    source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
    source.append(" temp = matrix[i + k*matrix_internal_rows]; \n");

    // parallel subtraction:
    source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
    source.append(" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
  }

  // close the k-loop, the i-loop and the kernel body
  source.append(" }");
  source.append(" }");
  source.append("}");
}
327 
328 
/** @brief Appends the OpenCL source of the kernel scaled_rank1_update_cpu or scaled_rank1_update_gpu to @p source.
*
* The emitted kernel adds tmp * vec2[col] to every entry (row,col) of A, where tmp is vec1[row]
* multiplied by alpha (or divided by alpha if bit 1 of options2 is set); alpha is negated if bit 0 of options2 is set.
*
* @param source          String-like object the kernel source is appended to (only append() is used)
* @param numeric_string  Name of the scalar type spliced into the source
* @param is_row_major    Selects the linear index expression used for A
* @param alpha_on_cpu    If true, the scaling factor 'val' is a by-value kernel argument (kernel suffix "cpu"); otherwise it is read from a __global buffer (kernel suffix "gpu")
*/
template<typename StringT>
void generate_scaled_rank1_update(StringT & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
{
  std::string const scalar_decl  = " " + numeric_string;
  std::string const global_const = " __global const " + numeric_string;

  // kernel name depends on where alpha resides
  source.append("__kernel void scaled_rank1_update_");
  source.append(alpha_on_cpu ? "cpu" : "gpu");
  source.append("( \n");

  // matrix argument
  source.append(" __global " + numeric_string + " * A, \n");
  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");

  // scaling factor
  if (alpha_on_cpu)
    source.append(scalar_decl + " val, \n");
  else
    source.append(global_const + " *val, \n");
  source.append(" unsigned int options2, \n");

  // first vector
  source.append(global_const + " * vec1, \n");
  source.append(" unsigned int start1, \n");
  source.append(" unsigned int inc1, \n");
  source.append(" unsigned int size1, \n");

  // second vector
  source.append(global_const + " * vec2, \n");
  source.append(" unsigned int start2, \n");
  source.append(" unsigned int inc2, \n");
  source.append(" unsigned int size2) \n");
  source.append("{ \n");

  // kernel body
  if (alpha_on_cpu)
    source.append(scalar_decl + " alpha = val; \n");
  else
    source.append(scalar_decl + " alpha = val[0]; \n");
  source.append(" if (options2 & (1 << 0)) \n");
  source.append(" alpha = -alpha; \n");

  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");

  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
  source.append(" { \n");
  source.append(scalar_decl + " tmp = vec1[row * inc1 + start1];");
  source.append(" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
  source.append(is_row_major
                ? " A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n"
                : " A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
  source.append(" } \n");
  source.append("} \n");
}
380 
/** @brief Appends the OpenCL source of the kernel triangular_substitute_inplace to @p source.
*
* The emitted kernel processes the rows of A one after another (top-down if bit 2 of 'options' is set,
* bottom-up otherwise). For each row it optionally divides the entry of v by the diagonal entry of A
* (skipped if bit 0 of 'options' is set) and then eliminates that entry from the remaining entries of v.
* Bit 1 of 'options' selects transposed access to A.
*
* @param source          String-like object the kernel source is appended to (only append() is used)
* @param numeric_string  Name of the scalar type spliced into the source
* @param is_row_major    Selects the linear index expressions used for A
*/
template<typename StringT>
void generate_triangular_substitute_inplace(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  // Linear index expressions into A for the diagonal entry (row,row), the entry (row,elim) and the entry (elim,row):
  std::string idx_diag;
  std::string idx_row_elim;
  std::string idx_elim_row;
  if (is_row_major)
  {
    idx_diag     = "(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)";
    idx_row_elim = "(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)";
    idx_elim_row = "(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)";
  }
  else
  {
    idx_diag     = "(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1";
    idx_row_elim = "(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1";
    idx_elim_row = "(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1";
  }

  // kernel signature
  source.append("__kernel void triangular_substitute_inplace( \n");
  source.append(" __global " + numeric_string + " * A, \n");
  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
  source.append(" __global " + numeric_string + " * v, \n");
  source.append(" unsigned int v_start, \n");
  source.append(" unsigned int v_inc, \n");
  source.append(" unsigned int v_size, \n");
  source.append(" unsigned int options) \n");
  source.append("{ \n");

  // decode option bits
  source.append(" " + numeric_string + " temp; \n");
  source.append(" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
  source.append(" unsigned int transposed_access_A = (options & (1 << 1)); \n");
  source.append(" unsigned int is_lower_solve = (options & (1 << 2)); \n");
  source.append(" unsigned int row; \n");

  // Note: A required to be square
  source.append(" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n");
  source.append(" { \n");
  source.append(" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

  // division by the diagonal entry is carried out by work item 0 only
  source.append(" if (!unit_diagonal_flag) \n");
  source.append(" { \n");
  source.append(" if (get_global_id(0) == 0) \n");
  source.append(" v[row * v_inc + v_start] /= A[" + idx_diag + "]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

  source.append(" temp = v[row * v_inc + v_start]; \n");

  // elimination, distributed over all work items
  source.append(" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
  source.append(" elim < (is_lower_solve ? A_size1 : row); \n");
  source.append(" elim += get_global_size(0)) \n");
  source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? (" + idx_row_elim + ") \n");
  source.append(" : (" + idx_elim_row + ")]; \n");

  source.append(" } \n");
  source.append("} \n");
}
434 
/** @brief Appends the OpenCL source of the out-of-place transposition kernel trans_kernel to @p source.
*
* The emitted kernel copies entry (row,col) of A to entry (col,row) of B; rows are distributed over
* work groups and columns over the work items of a group.
*
* @param source          String-like object the kernel source is appended to (only append() is used)
* @param numeric_string  Name of the scalar type spliced into the source
* @param is_row_major    Selects the linear index expressions used for A and B
*/
template <typename StringT>
void generate_trans_kernel(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  // Linear index of the destination entry (col,row) in B and of the source entry (row,col) in A:
  std::string dst_index;
  std::string src_index;
  if (is_row_major)
  {
    dst_index = "(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)";
    src_index = "(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)";
  }
  else
  {
    dst_index = "(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1";
    src_index = "(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1";
  }

  // kernel signature
  source.append("__kernel void trans_kernel(\n");
  source.append(" __global const " + numeric_string + " * A, \n");
  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
  source.append(" unsigned int A_stride1, unsigned int A_stride2, \n");
  source.append(" __global " + numeric_string + " * B, \n");
  source.append(" unsigned int B_start1, unsigned int B_start2, \n");
  source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
  source.append(" unsigned int B_stride1, unsigned int B_stride2) \n");

  // kernel body
  source.append("{ \n");
  source.append(" for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
  source.append(" { \n");
  source.append(" for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
  source.append(" { \n");
  source.append(" B[" + dst_index + "] = A[" + src_index + "]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
}
461 
462 namespace detail
463 {
464  inline std::string type_to_string(viennacl::row_major) { return "row"; }
465  inline std::string type_to_string(viennacl::column_major) { return "col"; }
466 }
467 
469 
471 template<typename NumericT>
472 class matrix
473 {
474 private:
475 
476  template<typename ScalarT1, typename ScalarT2>
477  static void generate_ambm_impl2(device_specific::execution_handler & handler, std::string const & prefix, device_specific::matrix_axpy_template::parameters_type const & parameters, scheduler::operation_node_type ASSIGN_OP,
478  viennacl::matrix_base<NumericT> const * x, viennacl::matrix_base<NumericT> const * y, ScalarT1 const * a,
479  viennacl::matrix_base<NumericT> const * z, ScalarT2 const * b)
480  {
481  namespace ds = viennacl::device_specific;
482 
483  handler.add(prefix + "0000", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, false, false));
484  handler.add(prefix + "1000", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, false, false));
485  handler.add(prefix + "0100", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, false, false));
486  handler.add(prefix + "1100", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, false, false));
487  if (b)
488  {
489  handler.add(prefix + "0010", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, true, false));
490  handler.add(prefix + "1010", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, true, false));
491  handler.add(prefix + "0110", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, true, false));
492  handler.add(prefix + "1110", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, true, false));
493 
494  handler.add(prefix + "0001", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, false, true));
495  handler.add(prefix + "1001", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, false, true));
496  handler.add(prefix + "0101", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, false, true));
497  handler.add(prefix + "1101", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, false, true));
498 
499  handler.add(prefix + "0011", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, true, true));
500  handler.add(prefix + "1011", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, true, true));
501  handler.add(prefix + "0111", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, true, true));
502  handler.add(prefix + "1111", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, true, true));
503  }
504  }
505 
506  template<typename ScalarT>
507  static void generate_ambm_impl(device_specific::execution_handler & handler, std::string const & prefix, device_specific::matrix_axpy_template::parameters_type const & parameters, scheduler::operation_node_type ASSIGN_OP,
508  viennacl::matrix_base<NumericT> const * x, viennacl::matrix_base<NumericT> const * y, ScalarT const * ha, viennacl::scalar<ScalarT> const * da,
509  viennacl::matrix_base<NumericT> const * z, ScalarT const * hb, viennacl::scalar<ScalarT> const * db)
510  {
511  //x ASSIGN_OP a*y
512  generate_ambm_impl2(handler, prefix + "hm_", parameters, ASSIGN_OP, x, y, ha, (viennacl::matrix_base<NumericT>*)NULL, (NumericT*)NULL);
513  generate_ambm_impl2(handler, prefix + "dm_", parameters, ASSIGN_OP, x, y, da, (viennacl::matrix_base<NumericT>*)NULL, (NumericT*)NULL);
514 
515  //x ASSIGN_OP a*y + b*z
516  generate_ambm_impl2(handler, prefix + "hmhm_", parameters, ASSIGN_OP, x, y, ha, z, hb);
517  generate_ambm_impl2(handler, prefix + "dmhm_", parameters, ASSIGN_OP, x, y, da, z, hb);
518  generate_ambm_impl2(handler, prefix + "hmdm_", parameters, ASSIGN_OP, x, y, ha, z, db);
519  generate_ambm_impl2(handler, prefix + "dmdm_", parameters, ASSIGN_OP, x, y, da, z, db);
520  }
521 
522 
523 public:
525  {
526  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
527  cl_context h = ctx.handle().get();
528  std::pair<bool, cl_context> key(is_row_major, h);
529  if (handlers_map.find(key) == handlers_map.end())
530  {
532 
533  namespace ds = viennacl::device_specific;
534  viennacl::ocl::device const & device = ctx.current_device();
535  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"matrix_row":"matrix_col");
536  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
537  ds::execution_handler & handler = handlers_map.at(key);
538 
539  ds::matrix_axpy_template::parameters_type matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
540  ds::vector_axpy_template::parameters_type vector_axpy_params = ds::builtin_database::vector_axpy_params<NumericT>(device);
541 
543  if (is_row_major)
544  {
548  }
549  else
550  {
554  }
555 
565  NumericT ha;
566  NumericT hb;
567  int hi = 0;
568  unsigned int hui = 0;
569 
570  // fully parametrized kernels:
571  generate_ambm_impl(handler, "assign_", matrix_axpy_params, scheduler::OPERATION_BINARY_ASSIGN_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
572  generate_ambm_impl(handler, "ip_add_", matrix_axpy_params, scheduler::OPERATION_BINARY_INPLACE_ADD_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
573 
574  handler.add("assign_cpu", ds::matrix_axpy_template(matrix_axpy_params), scheduler::preset::assign_cpu(&A, &M));
575  handler.add("matrix_diag_from_vector", ds::matrix_axpy_template(matrix_axpy_params), scheduler::preset::matrix_diag_from_vector(&x, &A, hi));
576  handler.add("matrix_row", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_row(&x, &A, hui));
577  handler.add("matrix_column", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_column(&x, &A, hui));
578  handler.add("matrix_diag_to_vector", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_diag_to_vector(&x, &A, hi));
579  handler.add("diagonal_assign_cpu", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::diagonal_assign_cpu(&A, &sx));
580  }
581  return handlers_map.at(key);
582  }
583 };
584 
585 // main kernel class
587 template<typename NumericT>
589 {
590 
591 public:
593  {
594  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
595  cl_context h = ctx.handle().get();
596  std::pair<bool, cl_context> key(is_row_major, h);
597  if (handlers_map.find(key) == handlers_map.end())
598  {
600 
601  namespace ds = viennacl::device_specific;
602  using namespace scheduler;
604 
605  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
606  viennacl::ocl::device const & device = ctx.current_device();
607  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"matrix_element_row":"matrix_element_col");
608  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
609  ds::execution_handler & handler = handlers_map.at(key);
610  ds::matrix_axpy_template::parameters_type matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
611 
613  if (is_row_major)
614  {
618  }
619  else
620  {
624  }
625 
629 
630 
631  // unary operations
632 #define VIENNACL_ADD_UNARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::unary_element_op(&A, &B, OPTYPE))
633  if (numeric_string == "float" || numeric_string == "double")
634  {
651  }
652  else
653  {
655  }
656 #undef VIENNACL_ADD_UNARY
657 
658  // binary operations
659 #define VIENNACL_ADD_BINARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::binary_element_op(&A, &B, &C, OPTYPE))
662  if (numeric_string == "float" || numeric_string == "double")
663  {
665  }
666 #undef VIENNACL_ADD_BINARY
667 
668  }
669  return handlers_map.at(key);
670  }
671 };
672 
673 
675 template<typename NumericT>
677 {
678 public:
680  {
681  static std::map<cl_context, device_specific::execution_handler> handlers_map;
682  cl_context key = ctx.handle().get();
683  if (handlers_map.find(key) == handlers_map.end())
684  {
686 
687  namespace ds = viennacl::device_specific;
688  viennacl::ocl::device const & device = ctx.current_device();
689  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_row_wise";
690  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
691  ds::execution_handler & handler = handlers_map.at(key);
692 
696  handler.add("mat_vec_T", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device, 'T'), 'T'), scheduler::preset::mat_vec_prod(&A, true, &x, &y));
697  handler.add("mat_vec_N", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device, 'N'), 'N'), scheduler::preset::mat_vec_prod(&A, false, &x, &y));
698 
699  }
700  return handlers_map.at(key);
701  }
702 };
703 
// Cache of execution handlers for the matrix-matrix product kernels of one numeric type,
// keyed by storage layout and OpenCL context.
// NOTE(review): the listing numbers jump 705 -> 707 and 708 -> 710, so the class declaration and the
// accessor's signature are not present in this listing; confirm both against the real header.
705 template<typename NumericT>
707 {
708 public:
     // Returns the handler cached for (is_row_major, context of `ctx`), building it on first use.
710  {
     // The layout flag is part of the key, so each context can hold one row-major and one column-major handler.
711  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
712  cl_context h = ctx.handle().get();
713  std::pair<bool, cl_context> key(is_row_major, h);
714  if (handlers_map.find(key) == handlers_map.end())
715  {
717 
718  namespace ds = viennacl::device_specific;
719  viennacl::ocl::device const & device = ctx.current_device();
     // Program name is the numeric type's string followed by a layout-specific suffix.
720  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"_matrix_prod_row":"_matrix_prod_col");
     // Insert first, then fetch a reference so the kernels are added to the stored handler, not a copy.
721  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
722  ds::execution_handler & handler = handlers_map.at(key);
723 
     // Device-specific parameters, looked up separately for each of the four 'N'/'T' combinations.
724  ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'N');
725  ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'N');
726  ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'T');
727  ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'T');
728 
     // NOTE(review): the statements controlled by this if/else (lines 731 and 733) are missing from
     // this listing, as are the declarations of A, B and C used below.
730  if (is_row_major)
732  else
734 
735  //Dummy types. The values don't matter for the kernel generation.
739  NumericT alpha = 1;
740  NumericT beta = 0;
741 
     // One kernel per combination; the two bool arguments are the A and B transpose flags,
     // matching the 'N'/'T' letters in the kernel name.
742  handler.add("prod_NN", ds::matrix_product_template(matrix_product_params_NN, 'N', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, false, beta, &C));
743  handler.add("prod_TN", ds::matrix_product_template(matrix_product_params_TN, 'T', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, false, beta, &C));
744  handler.add("prod_NT", ds::matrix_product_template(matrix_product_params_NT, 'N', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, true, beta, &C));
745  handler.add("prod_TT", ds::matrix_product_template(matrix_product_params_TT, 'T', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, true, beta, &C));
746 
747  }
     // Reached on both paths: the entry exists either from an earlier call or from the insert above.
748  return handlers_map.at(key);
749  }
750 };
751 
752 // main kernel class
// Kernel holder that assembles OpenCL source for dense-matrix operations and registers it with a context.
// NOTE(review): the listing numbers jump 754 -> 756, so the struct declaration itself is not present
// in this listing; confirm its name against the real header.
754 template<typename NumericT, typename LayoutT>
756 {
     // Returns the name under which init() registers the program.
     // NOTE(review): the body (line 759) is missing from this listing.
757  static std::string program_name()
758  {
760  }
761 
     // Builds the kernel source and adds it to `ctx` as a program, at most once per OpenCL context.
762  static void init(viennacl::ocl::context & ctx)
763  {
     // std::map::operator[] value-initializes a missing entry to false, so an unseen context enters the branch.
764  static std::map<cl_context, bool> init_done;
765  if (!init_done[ctx.handle().get()])
766  {
768  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
     // NOTE(review): is_row_major is used below but its definition (line 769) is missing from this
     // listing; presumably derived from LayoutT - confirm.
770 
771  std::string source;
772  source.reserve(8192);
773 
774  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
775 
776  // kernels with mostly predetermined skeleton:
     // Generated twice: once with the last argument (alpha_on_cpu) true and once with it false.
777  generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
778  generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
779 
     // These kernels are emitted only when the numeric type string is "float" or "double".
780  if (numeric_string == "float" || numeric_string == "double")
781  {
782  generate_fft(source, numeric_string, is_row_major);
783  generate_lu(source, numeric_string, is_row_major);
784  generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
785  generate_trans_kernel(source, numeric_string, is_row_major);
786  }
787 
788  std::string prog_name = program_name();
     // Optional diagnostic output, compiled in only when VIENNACL_BUILD_INFO is defined.
789  #ifdef VIENNACL_BUILD_INFO
790  std::cout << "Creating program " << prog_name << std::endl;
791  #endif
792  ctx.add_program(source, prog_name);
     // The flag is set only after add_program returns, so later calls for this context skip the branch.
793  init_done[ctx.handle().get()] = true;
794  } //if
795  } //init
796 };
797 
798 } // namespace kernels
799 } // namespace opencl
800 } // namespace linalg
801 } // namespace viennacl
802 #endif
803 
viennacl::ocl::device const & current_device() const
Returns the current device.
Definition: context.hpp:111
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Definition: matrix.hpp:706
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:226
Implements an OpenCL platform within ViennaCL.
void generate_fft(StringT &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:53
void generate_triangular_substitute_inplace(StringT &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:382
#define VIENNACL_ADD_UNARY(OPTYPE)
statement matrix_diag_from_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
Definition: preset.hpp:346
Helper class for checking whether a matrix has a row-major layout.
Definition: forwards.h:483
matrix_axpy_template::parameters_type const & matrix_axpy_params(ocl::device const &device)
Various little tools used here and there in ViennaCL.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:54
Provides OpenCL-related utilities.
A class representing a compute device (e.g. a GPU)
Definition: device.hpp:49
void add(std::string const &key, template_base const &T, statements_container const &statements)
static device_specific::execution_handler & execution_handler(viennacl::ocl::context &ctx)
Definition: matrix.hpp:679
A dense matrix class.
Definition: forwards.h:374
scheduler::statement avbv(scheduler::operation_node_type ASSIGN_OP, NumericT const *x, NumericT const *y, ScalarT1 const *a, bool flip_a, bool reciprocal_a, NumericT const *z, ScalarT2 const *b, bool flip_b, bool reciprocal_b)
Definition: preset.hpp:16
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:613
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Definition: context.hpp:39
Main kernel class for generating OpenCL kernels for elementwise operations other than addition and su...
Definition: matrix.hpp:588
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
Definition: matrix.hpp:709
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:189
#define VIENNACL_ADD_BINARY(OPTYPE)
Definition: blas3.hpp:36
statement mat_vec_prod(viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::vector_base< NumericT > const *x, viennacl::vector_base< NumericT > const *y)
Definition: preset.hpp:393
vector_axpy_template::parameters_type const & vector_axpy_params(ocl::device const &device)
static void init(viennacl::ocl::context &ctx)
Definition: matrix.hpp:762
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
Definition: matrix.hpp:755
void generate_trans_kernel(StringT &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:436
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
Definition: matrix_def.hpp:93
A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependen...
Definition: shared_ptr.hpp:83
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Definition: matrix.hpp:676
Configuration struct for generating OpenCL kernels for linear combinations of matrices.
Definition: matrix.hpp:38
operation_node_type
Enumeration for identifying the possible operations.
Definition: forwards.h:68
void generate_lu(StringT &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:275
statement mat_mat_prod(NumericT alpha, viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::matrix_base< NumericT > const *B, bool B_trans, NumericT beta, viennacl::matrix_base< NumericT > const *C)
Definition: preset.hpp:399
statement matrix_diag_to_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
Definition: preset.hpp:340
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
Definition: matrix.hpp:524
void generate_scaled_rank1_update(StringT &source, std::string const &numeric_string, bool is_row_major, bool alpha_on_cpu)
Definition: matrix.hpp:330
Representation of an OpenCL kernel in ViennaCL.
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
Definition: vector_def.hpp:87
scheduler::statement diagonal_assign_cpu(matrix_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
Definition: preset.hpp:130
std::string type_to_string(viennacl::row_major)
Definition: matrix.hpp:464
Provides an OpenCL kernel generator.
Definition: common.hpp:34
statement matrix_row(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
Definition: preset.hpp:327
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
Definition: matrix.hpp:592
statement matrix_column(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
Definition: preset.hpp:333
A tag for column-major storage of a dense matrix.
Definition: forwards.h:320
ambm_scalar_type
Enumeration for the scalar type in ambm-like operations.
Definition: matrix.hpp:30
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Definition: matrix.hpp:472
scheduler::statement assign_cpu(vector_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
Definition: preset.hpp:106
const char * operator_string(scheduler::operation_node_type type)
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
A tag for row-major storage of a dense matrix.
Definition: forwards.h:303
Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc.