1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
// Kernel-source generator (first part): appends the OpenCL text of `fft_direct` to `source`.
// `numeric_string` is spliced in as the scalar type; the "2" suffix turns it into the
// two-component vector type whose .x/.y are used as real/imaginary part by the complex
// product emitted at listing line 80.
// NOTE(review): the function signature, its braces and the host-side conditions that pick
// between alternative lines are not visible in this listing.
52 template<
typename StringT>
// Kernel signature: input/output buffers, transform length, stride, batch count and the sign
// that is multiplied into the exponent argument.
56 source.append(
"__kernel void fft_direct(__global "); source.append(numeric_string); source.append(
"2 *input, \n");
57 source.append(
" __global "); source.append(numeric_string); source.append(
"2 *output, \n");
58 source.append(
" unsigned int size, \n");
59 source.append(
" unsigned int stride, \n");
60 source.append(
" unsigned int batch_num, \n");
61 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
62 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
// Every work-item walks all batches; output indices k are distributed over the work-items
// with a step of get_global_size(0).
64 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
65 source.append(
" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
66 source.append(
" "); source.append(numeric_string); source.append(
"2 f = 0.0f; \n");
// Direct evaluation of the Fourier sum: each output k visits all `size` inputs.
68 source.append(
" for (unsigned int n = 0; n < size; n++) { \n");
69 source.append(
" "); source.append(numeric_string); source.append(
"2 in = ");
// The next two literals are alternative completions of the `in = ` statement started above
// (batch-contiguous vs. batch-strided addressing); only one can be emitted per kernel.
// NOTE(review): the selecting condition is not visible here - presumably is_row_major; confirm.
71 source.append(
"input[batch_id * stride + n]; \n");
73 source.append(
"input[n * stride + batch_id]; \n");
75 source.append(
" "); source.append(numeric_string); source.append(
" sn, cs; \n");
76 source.append(
" "); source.append(numeric_string); source.append(
" arg = sign * 2 * NUM_PI * k / size * n; \n");
// OpenCL sincos() returns the sine and stores the cosine through the pointer argument.
77 source.append(
" sn = sincos(arg, &cs); \n");
79 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
// Accumulate the complex product in * ex.
80 source.append(
" f = f + ("); source.append(numeric_string); source.append(
"2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
81 source.append(
" } \n");
// Alternative stores matching the two load variants above (same caveat about the selector).
84 source.append(
" output[batch_id * stride + k] = f; \n");
86 source.append(
" output[k * stride + batch_id] = f; \n");
87 source.append(
" } \n");
88 source.append(
" } \n");
89 source.append(
"} \n");
// Appends the OpenCL text of `fft_radix2`: one in-place radix-2 butterfly stage `s` over
// `input`. Each butterfly combines the elements at distance ss = 1 << s.
// The `bit_size` kernel argument is declared but not referenced in the kernel body shown here.
93 source.append(
"__kernel void fft_radix2(__global "); source.append(numeric_string); source.append(
"2* input, \n");
94 source.append(
" unsigned int s, \n");
95 source.append(
" unsigned int bit_size, \n");
96 source.append(
" unsigned int size, \n");
97 source.append(
" unsigned int stride, \n");
98 source.append(
" unsigned int batch_num, \n");
99 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
100 source.append(
" \n");
101 source.append(
" unsigned int ss = 1 << s; \n");
// size/2 butterflies per stage.
102 source.append(
" unsigned int half_size = size >> 1; \n");
103 source.append(
" \n");
104 source.append(
" "); source.append(numeric_string); source.append(
" cs, sn; \n");
105 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
106 source.append(
" \n");
107 source.append(
" unsigned int glb_id = get_global_id(0); \n");
108 source.append(
" unsigned int glb_sz = get_global_size(0); \n");
110 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
111 source.append(
" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
// `group` is the butterfly's index inside its block of ss butterflies (low s bits of tid);
// `pos` is the index of the first of the two elements it combines.
112 source.append(
" unsigned int group = (tid & (ss - 1)); \n");
113 source.append(
" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
// Two alternative load sequences follow (listing lines 117-119 and 123-125). Both declare
// `offset`, `in1` and `in2`, so only one of them can be emitted into a given kernel.
// NOTE(review): the selecting condition is not visible - presumably is_row_major; confirm.
117 source.append(
" unsigned int offset = batch_id * stride + pos; \n");
118 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n");
119 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss]; \n");
123 source.append(
" unsigned int offset = pos * stride + batch_id; \n");
124 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n");
125 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss * stride]; \n");
// Twiddle factor ex = (cos(arg), sin(arg)) with arg = group * sign * pi / ss.
128 source.append(
" "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
130 source.append(
" sn = sincos(arg, &cs); \n");
132 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
// tmp = in2 * ex (complex product).
134 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
// Alternative stores of the second butterfly output, matching the two load variants above.
137 source.append(
" input[offset + ss] = in1 - tmp; \n");
139 source.append(
" input[offset + ss * stride] = in1 - tmp; \n");
140 source.append(
" input[offset] = in1 + tmp; \n");
141 source.append(
" } \n");
142 source.append(
" } \n");
143 source.append(
"} \n");
145 source.append(
" \n");
// Device helper `get_reorder_num`: reverses the 32 bits of v (swapping adjacent bits, pairs,
// nibbles, bytes and half-words) and shifts the result right by (32 - bit_size), i.e. it
// returns the low `bit_size` bits of v in reversed order.
147 source.append(
" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
148 source.append(
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
149 source.append(
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
150 source.append(
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
151 source.append(
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
152 source.append(
" v = (v >> 16) | (v << 16); \n");
153 source.append(
" \n");
154 source.append(
" v = v >> (32 - bit_size); \n");
155 source.append(
" \n");
156 source.append(
" return v; \n");
157 source.append(
" } \n");
// Kernel `fft_radix2_local`: performs all `bit_size` radix-2 stages of one transform inside
// the __local buffer `lcl_input`, then copies the result back to `input`.
159 source.append(
" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append(
"2* input, \n");
160 source.append(
" __local "); source.append(numeric_string); source.append(
"2* lcl_input, \n");
161 source.append(
" unsigned int bit_size, \n");
162 source.append(
" unsigned int size, \n");
163 source.append(
" unsigned int stride, \n");
164 source.append(
" unsigned int batch_num, \n");
165 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
167 source.append(
" unsigned int grp_id = get_group_id(0); \n");
168 source.append(
" unsigned int grp_num = get_num_groups(0); \n");
170 source.append(
" unsigned int lcl_sz = get_local_size(0); \n");
171 source.append(
" unsigned int lcl_id = get_local_id(0); \n");
172 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
// Batches are distributed over work-groups: group g handles batches g, g + grp_num, ...
174 source.append(
" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
// Load phase: element p is stored at its bit-reversed position v in local memory.
177 source.append(
" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
178 source.append(
" unsigned int v = get_reorder_num(p, bit_size); \n");
// Alternative global-memory loads (two addressing schemes listed back to back).
// NOTE(review): the selecting condition is not visible - presumably is_row_major; confirm.
180 source.append(
" lcl_input[v] = input[batch_id * stride + p]; \n");
182 source.append(
" lcl_input[v] = input[p * stride + batch_id]; \n");
183 source.append(
" } \n");
// All loads must be complete before the first butterfly stage reads lcl_input.
185 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
188 source.append(
" for (unsigned int s = 0; s < bit_size; s++) { \n");
189 source.append(
" unsigned int ss = 1 << s; \n");
191 source.append(
" "); source.append(numeric_string); source.append(
" cs, sn; \n");
// NOTE(review): tid runs up to `size` here, whereas fft_radix2 stops at size/2. For
// tid >= size/2 the index `pos` below is >= size, so this relies on lcl_input being
// allocated larger than `size` elements by the host - confirm against the caller.
193 source.append(
" for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
194 source.append(
" unsigned int group = (tid & (ss - 1)); \n");
195 source.append(
" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
197 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = lcl_input[pos]; \n");
198 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = lcl_input[pos + ss]; \n");
// Twiddle factor and butterfly, same arithmetic as in fft_radix2.
200 source.append(
" "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
202 source.append(
" sn = sincos(arg, &cs); \n");
203 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
205 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
207 source.append(
" lcl_input[pos + ss] = in1 - tmp; \n");
208 source.append(
" lcl_input[pos] = in1 + tmp; \n");
209 source.append(
" } \n");
// Stage s must be finished by the whole work-group before stage s + 1 starts.
211 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
212 source.append(
" } \n");
// Write-back phase: copy the transformed data from local to global memory.
215 source.append(
" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
// Alternative stores matching the two load variants above.
217 source.append(
" input[batch_id * stride + p] = lcl_input[p]; \n");
219 source.append(
" input[p * stride + batch_id] = lcl_input[p]; \n");
220 source.append(
" } \n");
221 source.append(
" } \n");
222 source.append(
" } \n");
224 source.append(
" \n");
// Device helper `get_reorder_num_2`: same arithmetic as get_reorder_num (32-bit reversal
// followed by a right shift of 32 - bit_size), emitted under a second name.
230 source.append(
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
231 source.append(
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
232 source.append(
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
233 source.append(
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
234 source.append(
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
235 source.append(
" v = (v >> 16) | (v << 16); \n");
237 source.append(
" v = v >> (32 - bit_size); \n");
239 source.append(
" return v; \n");
240 source.append(
"} \n");
// Kernel `fft_reorder`: in-place bit-reversal permutation of every batch in `input`.
242 source.append(
"__kernel void fft_reorder(__global "); source.append(numeric_string); source.append(
"2* input, \n");
243 source.append(
" unsigned int bit_size, \n");
244 source.append(
" unsigned int size, \n");
245 source.append(
" unsigned int stride, \n");
// NOTE(review): batch_num is a signed `int` here but `unsigned int` in the other FFT kernels,
// and it is compared against the unsigned loop counter batch_id below.
246 source.append(
" int batch_num) { \n");
248 source.append(
" unsigned int glb_id = get_global_id(0); \n");
249 source.append(
" unsigned int glb_sz = get_global_size(0); \n");
251 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
252 source.append(
" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
253 source.append(
" unsigned int v = get_reorder_num_2(i, bit_size); \n");
// Swap only when i < v, so each (i, v) pair is exchanged exactly once.
255 source.append(
" if (i < v) {\n");
// Two alternative swap sequences follow (listing lines 258-260 and 264-266). Both declare
// `tmp`, so only one of them can be emitted into a given kernel.
// NOTE(review): the selecting condition is not visible - presumably is_row_major; confirm.
258 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = input[batch_id * stride + i]; \n");
259 source.append(
" input[batch_id * stride + i] = input[batch_id * stride + v]; \n");
260 source.append(
" input[batch_id * stride + v] = tmp; \n");
264 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = input[i * stride + batch_id]; \n");
265 source.append(
" input[i * stride + batch_id] = input[v * stride + batch_id]; \n");
266 source.append(
" input[v * stride + batch_id] = tmp; \n");
268 source.append(
" } \n");
269 source.append(
" } \n");
270 source.append(
" } \n");
271 source.append(
"} \n");
// Kernel-source generator for `lu_factorize`: appends the OpenCL text of an in-place LU
// factorization (no pivoting is performed in the emitted code) to `source`.
// For every row i and every k < i, work-item 0 divides entry (i, k) by the diagonal entry
// (k, k); after a barrier all work-items subtract temp * row k from row i for the columns
// j > k, with j distributed over the work-items.
// NOTE(review): barrier() only synchronizes the work-items of one work-group, so the kernel
// presumably has to be launched with a single work-group - confirm against the caller.
// NOTE(review): the function signature, the closing-brace emission and the condition that
// selects between the two loop nests below are not visible in this listing (presumably
// is_row_major).
274 template<
typename StringT>
277 source.append(
"__kernel void lu_factorize( \n");
278 source.append(
" __global "); source.append(numeric_string); source.append(
" * matrix, \n");
279 source.append(
" unsigned int matrix_rows, \n");
280 source.append(
" unsigned int matrix_cols, \n");
281 source.append(
" unsigned int matrix_internal_rows, \n");
282 source.append(
" unsigned int matrix_internal_cols) \n");
283 source.append(
"{ \n");
284 source.append(
" "); source.append(numeric_string); source.append(
" temp; \n");
// Variant 1: entry (r, c) is addressed as r * matrix_internal_cols + c.
288 source.append(
" unsigned rowi; \n");
289 source.append(
" unsigned rowk; \n");
290 source.append(
" for (unsigned int i=1; i<matrix_rows; ++i) \n");
291 source.append(
" { \n");
292 source.append(
" rowi = i * matrix_internal_cols; \n");
293 source.append(
" for (unsigned int k=0; k<i; ++k) \n");
294 source.append(
" { \n");
295 source.append(
" rowk = k * matrix_internal_cols; \n");
// Only one work-item scales the multiplier; everybody else waits at the barrier.
296 source.append(
" if (get_global_id(0) == 0) \n");
297 source.append(
" matrix[rowi + k] /= matrix[rowk + k]; \n");
299 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
300 source.append(
" temp = matrix[rowi + k]; \n");
// Fix: j indexes columns (matrix[rowi + j]), so it has to be bounded by matrix_cols, exactly
// as in the second variant below. The previous bound matrix_rows was only correct for square
// matrices: with more rows than columns it ran past the end of the row, with fewer rows than
// columns it left the trailing columns un-eliminated.
303 source.append(
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
304 source.append(
" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
// Variant 2: entry (r, c) is addressed as r + c * matrix_internal_rows.
308 source.append(
" for (unsigned int i=1; i<matrix_rows; ++i) \n");
309 source.append(
" { \n");
310 source.append(
" for (unsigned int k=0; k<i; ++k) \n");
311 source.append(
" { \n");
313 source.append(
" if (get_global_id(0) == 0) \n");
314 source.append(
" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
316 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
317 source.append(
" temp = matrix[i + k*matrix_internal_rows]; \n");
320 source.append(
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
321 source.append(
" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
// Kernel-source generator for `scaled_rank1_update_cpu` / `scaled_rank1_update_gpu`:
// appends the OpenCL text of A += (alpha-scaled vec1) * vec2^T to `source`.
// NOTE(review): the function signature and the host-side conditions selecting between the
// alternative lines below are not visible in this listing.
329 template<
typename StringT>
// The kernel name suffix is chosen by alpha_on_cpu: "cpu" when true, "gpu" otherwise.
332 source.append(
"__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append(
"cpu") : source.append(
"gpu"); source.append(
"( \n");
333 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
334 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
335 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
336 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
337 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
// Two alternative declarations of the scalar argument `val`: by value, or as a pointer to
// __global memory. Only one can be emitted per kernel.
// NOTE(review): presumably selected by alpha_on_cpu, matching the name suffix - confirm.
340 source.append(
" "); source.append(numeric_string); source.append(
" val, \n");
342 source.append(
" __global const "); source.append(numeric_string); source.append(
" *val, \n");
344 source.append(
" unsigned int options2, \n");
346 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec1, \n");
347 source.append(
" unsigned int start1, \n");
348 source.append(
" unsigned int inc1, \n");
349 source.append(
" unsigned int size1, \n");
351 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec2, \n");
352 source.append(
" unsigned int start2, \n");
353 source.append(
" unsigned int inc2, \n");
354 source.append(
" unsigned int size2) \n");
355 source.append(
"{ \n");
// Alternative initialisations of alpha matching the two `val` declarations above.
358 source.append(
" "); source.append(numeric_string); source.append(
" alpha = val; \n");
360 source.append(
" "); source.append(numeric_string); source.append(
" alpha = val[0]; \n");
// Bit 0 of options2 negates alpha.
362 source.append(
" if (options2 & (1 << 0)) \n");
363 source.append(
" alpha = -alpha; \n");
// The global id is split into a row part (id / local size) and a column part
// (id % local size); rows advance by the number of work-groups, columns by the local size.
365 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
366 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
368 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
369 source.append(
" { \n");
// The next two literals carry no trailing "\n"; the emitted statements are still terminated
// by ';', so they simply share one line of kernel text with the following `for`.
370 source.append(
" "); source.append(numeric_string); source.append(
" tmp = vec1[row * inc1 + start1];");
// Bit 1 of options2 selects division by alpha instead of multiplication.
371 source.append(
" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
372 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
// Two alternative bodies for the column loop: the first scales the row index by
// A_internal_size2, the second scales the column index by A_internal_size1.
// NOTE(review): presumably selected by is_row_major - confirm.
374 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
376 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
377 source.append(
" } \n");
378 source.append(
"} \n");
// Kernel-source generator for `triangular_substitute_inplace`: appends the OpenCL text of an
// in-place triangular solve that overwrites the vector v using the matrix A.
// NOTE(review): the function signature and the host-side conditions selecting between the
// alternative lines below are not visible in this listing.
381 template<
typename StringT>
384 source.append(
"__kernel void triangular_substitute_inplace( \n");
385 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
386 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
387 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
388 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
389 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
390 source.append(
" __global "); source.append(numeric_string); source.append(
" * v, \n");
391 source.append(
" unsigned int v_start, \n");
392 source.append(
" unsigned int v_inc, \n");
393 source.append(
" unsigned int v_size, \n");
394 source.append(
" unsigned int options) \n");
395 source.append(
"{ \n");
396 source.append(
" "); source.append(numeric_string); source.append(
" temp; \n");
// `options` is a bit field: bit 0 = unit diagonal, bit 1 = transposed access to A,
// bit 2 = lower-triangular solve.
397 source.append(
" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
398 source.append(
" unsigned int transposed_access_A = (options & (1 << 1)); \n");
399 source.append(
" unsigned int is_lower_solve = (options & (1 << 2)); \n");
400 source.append(
" unsigned int row; \n");
401 source.append(
" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n");
402 source.append(
" { \n");
// Lower solves walk the rows top-down, all other solves bottom-up.
403 source.append(
" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
// Without a unit diagonal, work-item 0 first divides v[row] by the diagonal entry of A.
404 source.append(
" if (!unit_diagonal_flag) \n");
405 source.append(
" { \n");
406 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
407 source.append(
" if (get_global_id(0) == 0) \n");
// Two alternative divisions (diagonal addressed with A_internal_size2 vs. A_internal_size1);
// only one can be the body of the `if` above.
// NOTE(review): presumably selected by is_row_major - confirm.
409 source.append(
" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
411 source.append(
" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
412 source.append(
" } \n");
// Make the finished v[row] visible to the work-group before it is used for elimination.
// NOTE(review): barrier() does not synchronize across work-groups - confirm the launch
// configuration used by the caller.
414 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
416 source.append(
" temp = v[row * v_inc + v_start]; \n");
// Elimination: lower solves update the entries below `row`, other solves the entries above
// it; the entries are distributed over the work-items.
// NOTE(review): `elim` is a signed int compared against unsigned values.
418 source.append(
" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
419 source.append(
" elim < (is_lower_solve ? A_size1 : row); \n");
420 source.append(
" elim += get_global_size(0)) \n");
// Two alternative loop bodies (listing lines 423-424 and 428-429), differing only in how the
// entry of A is addressed; transposed_access_A swaps the roles of `row` and `elim`.
423 source.append(
" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
424 source.append(
" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n");
428 source.append(
" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
429 source.append(
" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n");
431 source.append(
" } \n");
432 source.append(
"} \n");
/** @brief Appends the OpenCL source of the out-of-place transposition kernel `trans_kernel` (B = A^T) to `source`.
 *
 * The kernel enumerates the padded index space of A (A_internal_size1 * A_internal_size2 entries)
 * in blocks of get_local_size(0) consecutive linear indices. Blocks are distributed over the
 * work-groups; entries outside the logical A_size1 x A_size2 range are skipped by the guard.
 *
 * @param source          String-like buffer the kernel text is appended to (only `append` is used).
 * @param numeric_string  Name of the scalar type spliced into the kernel signature.
 * @param is_row_major    Selects the addressing of A and B: true scales the row index by the
 *                        internal column count, false scales the column index by the internal row count.
 */
template <typename StringT>
void generate_trans_kernel(StringT & source, std::string const & numeric_string, bool is_row_major)
{
  source.append("__kernel void trans_kernel(\n");
  source.append(" __global const ");source.append(numeric_string);source.append(" * A, \n");
  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
  source.append(" unsigned int A_stride1, unsigned int A_stride2, \n");
  source.append(" __global ");source.append(numeric_string);source.append(" * B, \n");
  source.append(" unsigned int B_start1, unsigned int B_start2, \n");
  source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
  source.append(" unsigned int B_stride1, unsigned int B_stride2) \n");
  source.append("{ \n");
  source.append(" unsigned int size = A_internal_size2*A_internal_size1; \n");
  // Fix: the number of blocks is ceil(size / local size). The former bound
  // size/get_num_groups(0) only matched that when the number of work-groups equalled the
  // local size and divided `size`; with more work-groups than work-items per group, trailing
  // entries were never transposed. Over-shooting indices are rejected by the guard below.
  source.append(" for(unsigned int i = get_group_id(0); i < (size + get_local_size(0) - 1)/get_local_size(0); i += get_num_groups(0))\n");
  source.append(" { \n");
  source.append(" unsigned int matrix_index = i*get_local_size(0)+get_local_id(0); \n");
  source.append(" unsigned int row = matrix_index / A_internal_size2; \n");
  source.append(" unsigned int col = matrix_index % A_internal_size2; \n");
  source.append(" if (row < A_size1 && col < A_size2) \n");
  source.append(" { \n");
  if (is_row_major)
  {
    // Row-major addressing: the row index is scaled by the internal column count.
    source.append(" unsigned int pos = (A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col); \n");
    source.append(" unsigned int new_pos = (B_start2 + B_stride2 * col) * B_internal_size2 + (B_start1 + B_stride1 * row); \n");
    source.append(" B[new_pos] = A[pos]; \n");
  }
  else
  {
    // Column-major addressing: the column index is scaled by the internal row count.
    source.append(" unsigned int pos = (A_start1 + A_stride1 * row) + A_internal_size1 * (A_start2 + A_stride2 * col); \n");
    source.append(" unsigned int new_pos = (B_start2 + B_stride2 * col) + B_internal_size1 * (B_start1 + B_stride1 * row); \n");
    source.append(" B[new_pos] = A[pos]; \n");
  }
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
}
// Registers sixteen variants of the matrix_axpy template with `handler`.
// The four-digit key suffix mirrors the four boolean arguments of scheduler::preset::avbv
// in call order (flip_a, reciprocal_a, flip_b, reciprocal_b): '1' means true, '0' false.
// NOTE(review): the enclosing declarations (the entity introduced by the first template header
// and the signature of this helper) are not visible in this listing.
484 template<
typename NumericT>
489 template<
typename ScalarT1,
typename ScalarT2>
494 namespace ds = viennacl::device_specific;
// flip_b = false, reciprocal_b = false
496 handler.
add(prefix +
"0000", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
false,
false));
497 handler.
add(prefix +
"1000", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
false,
false));
498 handler.
add(prefix +
"0100", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
false,
false));
499 handler.
add(prefix +
"1100", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
false,
false));
// flip_b = true, reciprocal_b = false
502 handler.
add(prefix +
"0010", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
true,
false));
503 handler.
add(prefix +
"1010", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
true,
false));
504 handler.
add(prefix +
"0110", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
true,
false));
505 handler.
add(prefix +
"1110", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
true,
false));
// flip_b = false, reciprocal_b = true
507 handler.
add(prefix +
"0001", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
false,
true));
508 handler.
add(prefix +
"1001", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
false,
true));
509 handler.
add(prefix +
"0101", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
false,
true));
510 handler.
add(prefix +
"1101", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
false,
true));
// flip_b = true, reciprocal_b = true
512 handler.
add(prefix +
"0011", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
true,
true));
513 handler.
add(prefix +
"1011", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
true,
true));
514 handler.
add(prefix +
"0111", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
true,
true));
515 handler.
add(prefix +
"1111", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
true,
true));
// Expands one operation into the four combinations of scalar arguments (ha or da as first
// scalar, hb or db as second) and forwards each to generate_ambm_impl2 with a matching key
// prefix ("hmhm_", "dmhm_", "hmdm_", "dmdm_").
// NOTE(review): presumably 'h' denotes a host scalar and 'd' a device scalar; the
// declarations of ha/da/hb/db are not visible in this listing - confirm.
519 template<
typename ScalarT>
529 generate_ambm_impl2(handler, prefix +
"hmhm_", parameters, ASSIGN_OP, x, y, ha, z, hb);
530 generate_ambm_impl2(handler, prefix +
"dmhm_", parameters, ASSIGN_OP, x, y, da, z, hb);
531 generate_ambm_impl2(handler, prefix +
"hmdm_", parameters, ASSIGN_OP, x, y, ha, z, db);
532 generate_ambm_impl2(handler, prefix +
"dmdm_", parameters, ASSIGN_OP, x, y, da, z, db);
// Returns the execution handler cached for the pair (is_row_major, context handle h).
// On a cache miss a new handler is inserted into handlers_map and populated before it is
// returned.
// NOTE(review): the function signature and the declarations of handlers_map, h, ctx, device,
// program_name and the dummy operands (A, B, C, ha, da, hb, db) are not visible in this listing.
541 std::pair<bool, cl_context> key(is_row_major, h);
542 if (handlers_map.find(key) == handlers_map.end())
546 namespace ds = viennacl::device_specific;
549 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
550 ds::execution_handler & handler = handlers_map.at(key);
// Template parameters are looked up per device and numeric type.
552 ds::matrix_axpy_template::parameters_type
matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
553 ds::vector_axpy_template::parameters_type
vector_axpy_params = ds::builtin_database::vector_axpy_params<NumericT>(device);
// NOTE(review): the use of `hui` is not visible in this listing.
581 unsigned int hui = 0;
// Register all "assign_" and "ip_add_" variants generated by generate_ambm_impl.
584 generate_ambm_impl(handler,
"assign_", matrix_axpy_params,
scheduler::OPERATION_BINARY_ASSIGN_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
585 generate_ambm_impl(handler,
"ip_add_", matrix_axpy_params,
scheduler::OPERATION_BINARY_INPLACE_ADD_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
594 return handlers_map.at(key);
// Execution-handler cache for element-wise matrix operations, keyed by
// (is_row_major, context handle h). Handlers are created and populated on first use.
// NOTE(review): the enclosing declaration, the function signature and the bodies of the two
// `if (numeric_string == ...)` blocks are not visible in this listing.
600 template<
typename NumericT>
609 std::pair<bool, cl_context> key(is_row_major, h);
610 if (handlers_map.find(key) == handlers_map.end())
614 namespace ds = viennacl::device_specific;
615 using namespace scheduler;
621 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
622 ds::execution_handler & handler = handlers_map.at(key);
623 ds::matrix_axpy_template::parameters_type
matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
// Helper macro: registers the unary element-wise operation OPTYPE under the key
// operator_string(OPTYPE). It is undefined again once the registrations are done.
645 #define VIENNACL_ADD_UNARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::unary_element_op(&A, &B, OPTYPE))
// Some registrations are restricted to the floating-point types "float" and "double".
646 if (numeric_string ==
"float" || numeric_string ==
"double")
669 #undef VIENNACL_ADD_UNARY
// Helper macro: same as above for binary element-wise operations (operands A, B and C).
672 #define VIENNACL_ADD_BINARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::binary_element_op(&A, &B, &C, OPTYPE))
675 if (numeric_string ==
"float" || numeric_string ==
"double")
679 #undef VIENNACL_ADD_BINARY
682 return handlers_map.at(key);
// Execution-handler cache for matrix-vector products. Unlike the matrix handlers in this file,
// the map is keyed by the OpenCL context alone (no layout flag in the key).
// NOTE(review): the enclosing declaration, the function signature and the definitions of key,
// ctx, device, program_name, A, x and y are not visible in this listing.
688 template<
typename NumericT>
// Function-local static: one cache shared by all calls of this instantiation.
694 static std::map<cl_context, device_specific::execution_handler> handlers_map;
696 if (handlers_map.find(key) == handlers_map.end())
700 namespace ds = viennacl::device_specific;
703 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
704 ds::execution_handler & handler = handlers_map.at(key);
// "mat_vec_T": product with A_trans = true, using the 'T' reduction parameters.
709 handler.add(
"mat_vec_T", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device,
'T'),
'T'),
scheduler::preset::mat_vec_prod(&A,
true, &x, &y));
// "mat_vec_N": product with A_trans = false, using the 'N' reduction parameters.
710 handler.add(
"mat_vec_N", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device,
'N'),
'N'),
scheduler::preset::mat_vec_prod(&A,
false, &x, &y));
713 return handlers_map.at(key);
// Execution-handler cache for matrix-matrix products, keyed by (is_row_major, context handle h).
// Four kernels are registered, one per combination of transposition flags for A and B.
// NOTE(review): the enclosing declaration, the function signature and the definitions of
// handlers_map, h, ctx, device, program_name, alpha, beta, A, B and C are not visible here.
718 template<
typename NumericT>
726 std::pair<bool, cl_context> key(is_row_major, h);
727 if (handlers_map.find(key) == handlers_map.end())
731 namespace ds = viennacl::device_specific;
734 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
735 ds::execution_handler & handler = handlers_map.at(key);
// Device-specific tuning parameters for each transposition combination.
737 ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device,
'N',
'N');
738 ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device,
'T',
'N');
739 ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device,
'N',
'T');
740 ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device,
'T',
'T');
// In each key the first letter refers to A and the second to B; 'T' corresponds to passing
// true for the respective *_trans argument of mat_mat_prod.
755 handler.add(
"prod_NN", ds::matrix_product_template(matrix_product_params_NN,
'N',
'N'),
scheduler::preset::mat_mat_prod(alpha, &A,
false, &B,
false, beta, &C));
756 handler.add(
"prod_TN", ds::matrix_product_template(matrix_product_params_TN,
'T',
'N'),
scheduler::preset::mat_mat_prod(alpha, &A,
true, &B,
false, beta, &C));
757 handler.add(
"prod_NT", ds::matrix_product_template(matrix_product_params_NT,
'N',
'T'),
scheduler::preset::mat_mat_prod(alpha, &A,
false, &B,
true, beta, &C));
758 handler.add(
"prod_TT", ds::matrix_product_template(matrix_product_params_TT,
'T',
'T'),
scheduler::preset::mat_mat_prod(alpha, &A,
true, &B,
true, beta, &C));
761 return handlers_map.at(key);
// Builds the kernel source string for this numeric type / layout and adds it as a program to
// the context. `init_done` records, per context handle, that the program has been added.
// NOTE(review): the enclosing declaration, the function signature, the check of init_done and
// the calls that fill `source` are not visible in this listing.
767 template<
typename NumericT,
typename LayoutT>
// Function-local static: one flag per OpenCL context.
777 static std::map<cl_context, bool> init_done;
// Pre-allocate the source buffer to avoid repeated growth while appending kernel text.
785 source.reserve(8192);
787 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
// Some kernels are only generated for the floating-point types "float" and "double".
793 if (numeric_string ==
"float" || numeric_string ==
"double")
801 std::string prog_name = program_name();
// Optional build-time diagnostics.
// NOTE(review): the matching #endif is not visible in this listing.
802 #ifdef VIENNACL_BUILD_INFO
803 std::cout <<
"Creating program " << prog_name << std::endl;
805 ctx.add_program(source, prog_name);
806 init_done[ctx.handle().get()] =
true;
viennacl::ocl::device const & current_device() const
Returns the current device.
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
void generate_fft(StringT &source, std::string const &numeric_string, bool is_row_major)
void generate_triangular_substitute_inplace(StringT &source, std::string const &numeric_string, bool is_row_major)
#define VIENNACL_ADD_UNARY(OPTYPE)
statement matrix_diag_from_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
Helper class for checking whether a matrix has a row-major layout.
matrix_axpy_template::parameters_type const & matrix_axpy_params(ocl::device const &device)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
A class representing a compute device (e.g. a GPU)
void add(std::string const &key, template_base const &T, statements_container const &statements)
static device_specific::execution_handler & execution_handler(viennacl::ocl::context &ctx)
scheduler::statement avbv(scheduler::operation_node_type ASSIGN_OP, NumericT const *x, NumericT const *y, ScalarT1 const *a, bool flip_a, bool reciprocal_a, NumericT const *z, ScalarT2 const *b, bool flip_b, bool reciprocal_b)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Main kernel class for generating OpenCL kernels for elementwise operations other than addition and su...
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
static void apply(viennacl::ocl::context const &)
bool with_stride_and_range
const OCL_TYPE & get() const
#define VIENNACL_ADD_BINARY(OPTYPE)
statement mat_vec_prod(viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::vector_base< NumericT > const *x, viennacl::vector_base< NumericT > const *y)
vector_axpy_template::parameters_type const & vector_axpy_params(ocl::device const &device)
static void init(viennacl::ocl::context &ctx)
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
void generate_trans_kernel(StringT &source, std::string const &numeric_string, bool is_row_major)
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
static std::string program_name()
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Configuration struct for generating OpenCL kernels for linear combinations of matrices.
operation_node_type
Enumeration for identifying the possible operations.
void generate_lu(StringT &source, std::string const &numeric_string, bool is_row_major)
statement mat_mat_prod(NumericT alpha, viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::matrix_base< NumericT > const *B, bool B_trans, NumericT beta, viennacl::matrix_base< NumericT > const *C)
statement matrix_diag_to_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
void generate_scaled_rank1_update(StringT &source, std::string const &numeric_string, bool is_row_major, bool alpha_on_cpu)
Representation of an OpenCL kernel in ViennaCL.
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
scheduler::statement diagonal_assign_cpu(matrix_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
std::string type_to_string(viennacl::row_major)
statement matrix_row(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
statement matrix_column(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
A tag for column-major storage of a dense matrix.
ambm_scalar_type
Enumeration for the scalar type in ambm-like operations.
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
scheduler::statement assign_cpu(vector_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
const char * operator_string(scheduler::operation_node_type type)
Helper class for converting a type to its string representation.
A tag for row-major storage of a dense matrix.
Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc.