1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
// Start of a generator templated on StringT. NOTE(review): the function header
// is missing from this listing; presumably this is
// generate_fft(source, numeric_string, is_row_major) - confirm.
52 template<
typename StringT>
// Appends the OpenCL source of kernel fft_direct to source. numeric_string is
// spliced in as the scalar type; numeric_string followed by 2 is used as the
// two-component (complex) type.
// Kernel parameters: input, output, size, stride, batch_num, sign.
56 source.append(
"__kernel void fft_direct(__global "); source.append(numeric_string); source.append(
"2 *input, \n");
57 source.append(
" __global "); source.append(numeric_string); source.append(
"2 *output, \n");
58 source.append(
" unsigned int size, \n");
59 source.append(
" unsigned int stride, \n");
60 source.append(
" unsigned int batch_num, \n");
61 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
62 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
// Emitted loops: every batch is visited in turn; within a batch each work-item
// handles output indices k starting at its global id and stepping by the
// global size.
64 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
65 source.append(
" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
// f accumulates the sum over all n for output index k.
66 source.append(
" "); source.append(numeric_string); source.append(
"2 f = 0.0f; \n");
68 source.append(
" for (unsigned int n = 0; n < size; n++) { \n");
69 source.append(
" "); source.append(numeric_string); source.append(
"2 in = ");
// Two alternative index expressions for the input read follow back to back.
// NOTE(review): the statements selecting between them are missing from this
// listing; presumably an if/else on is_row_major - confirm.
71 source.append(
"input[batch_id * stride + n]; \n");
73 source.append(
"input[n * stride + batch_id]; \n");
// Twiddle factor: arg = sign * 2 * NUM_PI * k / size * n; sincos returns the
// sine and stores the cosine in cs, giving ex = (cs, sn).
75 source.append(
" "); source.append(numeric_string); source.append(
" sn, cs; \n");
76 source.append(
" "); source.append(numeric_string); source.append(
" arg = sign * 2 * NUM_PI * k / size * n; \n");
77 source.append(
" sn = sincos(arg, &cs); \n");
79 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
// Complex multiply in * ex, added to the accumulator.
80 source.append(
" f = f + ("); source.append(numeric_string); source.append(
"2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
81 source.append(
" } \n");
// Two alternative index expressions for the output write, mirroring the two
// input variants above. NOTE(review): selector lines are missing here too.
84 source.append(
" output[batch_id * stride + k] = f; \n");
86 source.append(
" output[k * stride + batch_id] = f; \n");
87 source.append(
" } \n");
88 source.append(
" } \n");
89 source.append(
"} \n");
// Appends the OpenCL source of kernel fft_radix2 to source. The kernel works
// in place on input and performs the butterflies of one stage s.
// Kernel parameters: input, s, bit_size, size, stride, batch_num, sign.
// bit_size is declared but not referenced in the emitted kernel body shown here.
93 source.append(
"__kernel void fft_radix2(__global "); source.append(numeric_string); source.append(
"2* input, \n");
94 source.append(
" unsigned int s, \n");
95 source.append(
" unsigned int bit_size, \n");
96 source.append(
" unsigned int size, \n");
97 source.append(
" unsigned int stride, \n");
98 source.append(
" unsigned int batch_num, \n");
99 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
100 source.append(
" \n");
// ss = 2^s is the distance between the two butterfly inputs; only size / 2
// butterflies are needed per batch.
101 source.append(
" unsigned int ss = 1 << s; \n");
102 source.append(
" unsigned int half_size = size >> 1; \n");
103 source.append(
" \n");
104 source.append(
" "); source.append(numeric_string); source.append(
" cs, sn; \n");
105 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
106 source.append(
" \n");
107 source.append(
" unsigned int glb_id = get_global_id(0); \n");
108 source.append(
" unsigned int glb_sz = get_global_size(0); \n");
// Each work-item handles butterfly indices tid = glb_id, glb_id + glb_sz, ...
// below half_size, for every batch in turn.
110 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
111 source.append(
" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
// group is tid modulo ss; pos is the index of the first butterfly input.
112 source.append(
" unsigned int group = (tid & (ss - 1)); \n");
113 source.append(
" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
// First variant of the loads: elements of one batch are contiguous and the
// partner element is ss entries further on.
// NOTE(review): both variants declare offset, in1 and in2, so they must sit in
// separate branches; the selector lines are missing from this listing
// (presumably if/else on is_row_major) - confirm.
117 source.append(
" unsigned int offset = batch_id * stride + pos; \n");
118 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n");
119 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss]; \n");
// Second variant of the loads: consecutive elements are stride apart and the
// partner element is ss * stride entries further on.
123 source.append(
" unsigned int offset = pos * stride + batch_id; \n");
124 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n");
125 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss * stride]; \n");
// Twiddle factor ex = (cos(arg), sin(arg)) with arg = group * sign * NUM_PI / ss.
128 source.append(
" "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
130 source.append(
" sn = sincos(arg, &cs); \n");
132 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
// tmp = in2 * ex (complex multiplication).
134 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
// Butterfly write-back: the partner slot receives in1 - tmp (two alternative
// index expressions matching the two load variants), the first slot in1 + tmp.
137 source.append(
" input[offset + ss] = in1 - tmp; \n");
139 source.append(
" input[offset + ss * stride] = in1 - tmp; \n");
140 source.append(
" input[offset] = in1 + tmp; \n");
141 source.append(
" } \n");
142 source.append(
" } \n");
143 source.append(
"} \n");
145 source.append(
" \n");
// Appends the OpenCL helper function get_reorder_num(v, bit_size).
// The emitted function reverses the bit order of the 32-bit value v by swapping
// progressively larger groups (single bits, pairs, nibbles, bytes, 16-bit
// halves) and then shifts the result right by 32 - bit_size, which leaves the
// lowest bit_size bits of the original v in reversed order.
// NOTE(review): for bit_size equal to 0 the emitted shift amount is 32, the
// full width of v - confirm callers never pass 0.
147 source.append(
" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
148 source.append(
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
149 source.append(
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
150 source.append(
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
151 source.append(
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
152 source.append(
" v = (v >> 16) | (v << 16); \n");
153 source.append(
" \n");
154 source.append(
" v = v >> (32 - bit_size); \n");
155 source.append(
" \n");
156 source.append(
" return v; \n");
157 source.append(
" } \n");
// Appends the OpenCL source of kernel fft_radix2_local to source. The kernel
// copies one batch into the __local buffer lcl_input in bit-reversed order,
// runs all bit_size butterfly stages there, and copies the result back.
// Kernel parameters: input, lcl_input, bit_size, size, stride, batch_num, sign.
159 source.append(
" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append(
"2* input, \n");
160 source.append(
" __local "); source.append(numeric_string); source.append(
"2* lcl_input, \n");
161 source.append(
" unsigned int bit_size, \n");
162 source.append(
" unsigned int size, \n");
163 source.append(
" unsigned int stride, \n");
164 source.append(
" unsigned int batch_num, \n");
165 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
167 source.append(
" unsigned int grp_id = get_group_id(0); \n");
168 source.append(
" unsigned int grp_num = get_num_groups(0); \n");
170 source.append(
" unsigned int lcl_sz = get_local_size(0); \n");
171 source.append(
" unsigned int lcl_id = get_local_id(0); \n");
172 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
// Batches are distributed over work-groups: group grp_id handles batches
// grp_id, grp_id + grp_num, ...
174 source.append(
" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
// Load phase: element p is stored at its bit-reversed position v in lcl_input.
177 source.append(
" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
178 source.append(
" unsigned int v = get_reorder_num(p, bit_size); \n");
// Two alternative index expressions for the global read.
// NOTE(review): the selector lines are missing from this listing (presumably
// if/else on is_row_major) - confirm.
180 source.append(
" lcl_input[v] = input[batch_id * stride + p]; \n");
182 source.append(
" lcl_input[v] = input[p * stride + batch_id]; \n");
183 source.append(
" } \n");
// All loads must be visible to the work-group before the first stage starts.
185 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// One iteration per stage s; ss = 2^s is the butterfly distance.
188 source.append(
" for (unsigned int s = 0; s < bit_size; s++) { \n");
189 source.append(
" unsigned int ss = 1 << s; \n");
191 source.append(
" "); source.append(numeric_string); source.append(
" cs, sn; \n");
// NOTE(review): the bound here is tid < size, whereas fft_radix2 iterates only
// up to half_size. With tid close to size, pos + ss reaches indices up to
// 2 * size - 1 in lcl_input. Whether the host sizes lcl_input and the
// work-group so that this stays in bounds is not visible here - confirm.
193 source.append(
" for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
194 source.append(
" unsigned int group = (tid & (ss - 1)); \n");
195 source.append(
" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
197 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = lcl_input[pos]; \n");
198 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = lcl_input[pos + ss]; \n");
// Twiddle factor ex = (cos(arg), sin(arg)) with arg = group * sign * NUM_PI / ss.
200 source.append(
" "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
202 source.append(
" sn = sincos(arg, &cs); \n");
203 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
// tmp = in2 * ex (complex multiplication), followed by the butterfly write-back.
205 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
207 source.append(
" lcl_input[pos + ss] = in1 - tmp; \n");
208 source.append(
" lcl_input[pos] = in1 + tmp; \n");
209 source.append(
" } \n");
// Stage results must be complete before the next stage reads them.
211 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
212 source.append(
" } \n");
// Store phase: copy the transformed batch back to global memory, again with
// two alternative index expressions (selector lines missing from this listing).
215 source.append(
" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
217 source.append(
" input[batch_id * stride + p] = lcl_input[p]; \n");
219 source.append(
" input[p * stride + batch_id] = lcl_input[p]; \n");
220 source.append(
" } \n");
221 source.append(
" } \n");
222 source.append(
" } \n");
224 source.append(
" \n");
// Appends the OpenCL helper function get_reorder_num_2(v, bit_size).
// The emitted statements are the same bit-reversal sequence as in
// get_reorder_num: reverse all 32 bits of v by swapping bits, pairs, nibbles,
// bytes and halves, then shift right by 32 - bit_size so that the lowest
// bit_size bits of the original v remain, in reversed order.
230 source.append(
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
231 source.append(
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
232 source.append(
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
233 source.append(
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
234 source.append(
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
235 source.append(
" v = (v >> 16) | (v << 16); \n");
237 source.append(
" v = v >> (32 - bit_size); \n");
239 source.append(
" return v; \n");
240 source.append(
"} \n");
// Appends the OpenCL source of kernel fft_reorder to source. The kernel
// permutes each batch of input in place into bit-reversed order.
// Kernel parameters: input, bit_size, size, stride, batch_num.
// NOTE(review): batch_num is declared as int here, while the other FFT kernels
// in this listing declare it as unsigned int, and the loop below compares it
// with an unsigned batch_id - confirm this is intended.
242 source.append(
"__kernel void fft_reorder(__global "); source.append(numeric_string); source.append(
"2* input, \n");
243 source.append(
" unsigned int bit_size, \n");
244 source.append(
" unsigned int size, \n");
245 source.append(
" unsigned int stride, \n");
246 source.append(
" int batch_num) { \n");
248 source.append(
" unsigned int glb_id = get_global_id(0); \n");
249 source.append(
" unsigned int glb_sz = get_global_size(0); \n");
251 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
252 source.append(
" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
253 source.append(
" unsigned int v = get_reorder_num_2(i, bit_size); \n");
// Swap only when i < v so that every pair (i, v) is exchanged exactly once.
255 source.append(
" if (i < v) {\n");
// First variant of the swap: elements of one batch are contiguous.
// NOTE(review): both variants declare tmp, so they must sit in separate
// branches; the selector lines are missing from this listing (presumably
// if/else on is_row_major) - confirm.
258 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = input[batch_id * stride + i]; \n");
259 source.append(
" input[batch_id * stride + i] = input[batch_id * stride + v]; \n");
260 source.append(
" input[batch_id * stride + v] = tmp; \n");
// Second variant of the swap: consecutive elements are stride apart.
264 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = input[i * stride + batch_id]; \n");
265 source.append(
" input[i * stride + batch_id] = input[v * stride + batch_id]; \n");
266 source.append(
" input[v * stride + batch_id] = tmp; \n");
268 source.append(
" } \n");
269 source.append(
" } \n");
270 source.append(
" } \n");
271 source.append(
"} \n");
// Start of a generator templated on StringT. NOTE(review): the function header
// is missing from this listing; presumably this is
// generate_lu(source, numeric_string, is_row_major) - confirm.
274 template<
typename StringT>
// Appends the OpenCL source of kernel lu_factorize to source. The kernel
// factorizes matrix in place.
// Kernel parameters: matrix, matrix_rows, matrix_cols, matrix_internal_rows,
// matrix_internal_cols.
277 source.append(
"__kernel void lu_factorize( \n");
278 source.append(
" __global "); source.append(numeric_string); source.append(
" * matrix, \n");
279 source.append(
" unsigned int matrix_rows, \n");
280 source.append(
" unsigned int matrix_cols, \n");
281 source.append(
" unsigned int matrix_internal_rows, \n");
282 source.append(
" unsigned int matrix_internal_cols) \n");
283 source.append(
"{ \n");
284 source.append(
" "); source.append(numeric_string); source.append(
" temp; \n");
// First variant of the loop nest: element (i, j) is addressed as
// i * matrix_internal_cols + j.
// NOTE(review): the lines selecting between the two variants, and the closing
// braces of the emitted loops and kernel, are missing from this listing.
288 source.append(
" unsigned rowi; \n");
289 source.append(
" unsigned rowk; \n");
290 source.append(
" for (unsigned int i=1; i<matrix_rows; ++i) \n");
291 source.append(
" { \n");
292 source.append(
" rowi = i * matrix_internal_cols; \n");
293 source.append(
" for (unsigned int k=0; k<i; ++k) \n");
294 source.append(
" { \n");
295 source.append(
" rowk = k * matrix_internal_cols; \n");
// Only the work-item with global id 0 divides entry (i, k) by the pivot (k, k).
296 source.append(
" if (get_global_id(0) == 0) \n");
297 source.append(
" matrix[rowi + k] /= matrix[rowk + k]; \n");
// NOTE(review): per the OpenCL specification, barrier synchronizes the
// work-items of one work-group only. Whether the host launches this kernel
// with a single work-group is not visible here - confirm.
299 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
300 source.append(
" temp = matrix[rowi + k]; \n");
// The remaining entries of row i are updated by all work-items.
// NOTE(review): this loop over j is bounded by matrix_rows, whereas the
// corresponding loop of the second variant is bounded by matrix_cols. The two
// agree only for square matrices - confirm which bound is intended.
303 source.append(
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
304 source.append(
" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
// Second variant of the loop nest: element (i, j) is addressed as
// i + j * matrix_internal_rows.
308 source.append(
" for (unsigned int i=1; i<matrix_rows; ++i) \n");
309 source.append(
" { \n");
310 source.append(
" for (unsigned int k=0; k<i; ++k) \n");
311 source.append(
" { \n");
313 source.append(
" if (get_global_id(0) == 0) \n");
314 source.append(
" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
316 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
317 source.append(
" temp = matrix[i + k*matrix_internal_rows]; \n");
320 source.append(
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
321 source.append(
" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
// Start of a generator templated on StringT. NOTE(review): the function header
// is missing from this listing; presumably this is
// generate_scaled_rank1_update(source, numeric_string, is_row_major,
// alpha_on_cpu) - confirm.
329 template<
typename StringT>
// Appends the OpenCL source of a rank-1 update kernel to source. The kernel
// name is scaled_rank1_update_cpu when alpha_on_cpu is true and
// scaled_rank1_update_gpu otherwise.
332 source.append(
"__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append(
"cpu") : source.append(
"gpu"); source.append(
"( \n");
333 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
334 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
335 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
336 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
337 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
// Two alternative declarations of the scaling parameter val: passed by value,
// or as a pointer into __global memory.
// NOTE(review): the selector lines are missing from this listing; presumably
// chosen by alpha_on_cpu - confirm.
340 source.append(
" "); source.append(numeric_string); source.append(
" val, \n");
342 source.append(
" __global const "); source.append(numeric_string); source.append(
" *val, \n");
344 source.append(
" unsigned int options2, \n");
346 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec1, \n");
347 source.append(
" unsigned int start1, \n");
348 source.append(
" unsigned int inc1, \n");
349 source.append(
" unsigned int size1, \n");
351 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec2, \n");
352 source.append(
" unsigned int start2, \n");
353 source.append(
" unsigned int inc2, \n");
354 source.append(
" unsigned int size2) \n");
355 source.append(
"{ \n");
// Two alternative initializations of alpha, matching the two declarations of
// val above (value, or first element behind the pointer).
358 source.append(
" "); source.append(numeric_string); source.append(
" alpha = val; \n");
360 source.append(
" "); source.append(numeric_string); source.append(
" alpha = val[0]; \n");
// Bit 0 of options2 negates alpha.
362 source.append(
" if (options2 & (1 << 0)) \n");
363 source.append(
" alpha = -alpha; \n");
// The global id is split into a row part (quotient by the local size) and a
// column part (remainder).
365 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
366 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
368 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
369 source.append(
" { \n");
// tmp is the vec1 entry of this row scaled by alpha; bit 1 of options2 selects
// division by alpha instead of multiplication.
// The next two literals do not end in a newline escape, unlike their
// neighbours, so the emitted statements share one line of kernel source.
370 source.append(
" "); source.append(numeric_string); source.append(
" tmp = vec1[row * inc1 + start1];");
371 source.append(
" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
372 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
// Two alternative index expressions for the update of A: the row index scaled
// by A_internal_size2, or the column index scaled by A_internal_size1.
// NOTE(review): the selector lines are missing from this listing (presumably
// if/else on is_row_major) - confirm.
374 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
376 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
377 source.append(
" } \n");
378 source.append(
"} \n");
// Start of a generator templated on StringT. NOTE(review): the function header
// is missing from this listing; presumably this is
// generate_triangular_substitute_inplace(source, numeric_string, is_row_major)
// - confirm.
381 template<
typename StringT>
// Appends the OpenCL source of kernel triangular_substitute_inplace to source.
// The kernel overwrites v with the result of a triangular substitution using
// the matrix A.
384 source.append(
"__kernel void triangular_substitute_inplace( \n");
385 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
386 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
387 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
388 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
389 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
390 source.append(
" __global "); source.append(numeric_string); source.append(
" * v, \n");
391 source.append(
" unsigned int v_start, \n");
392 source.append(
" unsigned int v_inc, \n");
393 source.append(
" unsigned int v_size, \n");
394 source.append(
" unsigned int options) \n");
395 source.append(
"{ \n");
396 source.append(
" "); source.append(numeric_string); source.append(
" temp; \n");
// options is a bit field: bit 0 = unit diagonal, bit 1 = transposed access to
// A, bit 2 = lower solve.
397 source.append(
" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
398 source.append(
" unsigned int transposed_access_A = (options & (1 << 1)); \n");
399 source.append(
" unsigned int is_lower_solve = (options & (1 << 2)); \n");
400 source.append(
" unsigned int row; \n");
// Rows are processed from first to last for a lower solve and from last to
// first otherwise.
401 source.append(
" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n");
402 source.append(
" { \n");
403 source.append(
" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
// NOTE(review): per the OpenCL specification, barrier synchronizes the
// work-items of one work-group only. Whether the host launches this kernel
// with a single work-group is not visible here - confirm.
404 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
// Unless the diagonal is unit, the work-item with global id 0 divides v[row]
// by the diagonal entry of A.
405 source.append(
" if (!unit_diagonal_flag) \n");
406 source.append(
" { \n");
407 source.append(
" if (get_global_id(0) == 0) \n");
// Two alternative index expressions for the diagonal entry.
// NOTE(review): the selector lines are missing from this listing (presumably
// if/else on is_row_major) - confirm.
409 source.append(
" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
411 source.append(
" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
412 source.append(
" } \n");
414 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
416 source.append(
" temp = v[row * v_inc + v_start]; \n");
// Elimination: for a lower solve the entries after row are updated, otherwise
// the entries before row.
// NOTE(review): elim is declared as int but initialized from and compared with
// unsigned expressions - confirm this is intended.
418 source.append(
" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
419 source.append(
" elim < (is_lower_solve ? A_size1 : row); \n");
420 source.append(
" elim += get_global_size(0)) \n");
// Two alternative forms of the update statement, differing in how A is
// indexed; within each form transposed_access_A swaps the roles of row and
// elim. The selector lines are missing from this listing.
423 source.append(
" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
424 source.append(
" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n");
428 source.append(
" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
429 source.append(
" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n");
431 source.append(
" } \n");
432 source.append(
"} \n");
// Start of a generator templated on StringT. NOTE(review): the function header
// is missing from this listing; presumably this is
// generate_trans_kernel(source, numeric_string, is_row_major) - confirm.
435 template <
typename StringT>
// Appends the OpenCL source of kernel trans_kernel to source. The kernel
// writes entry (row, col) of A to entry (col, row) of B.
// Each matrix is described by start offsets, internal sizes and strides; the
// logical sizes A_size1 and A_size2 are passed for A only.
438 source.append(
"__kernel void trans_kernel(\n");
439 source.append(
" __global const ");source.append(numeric_string);source.append(
" * A, \n");
440 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
441 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
442 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
443 source.append(
" unsigned int A_stride1, unsigned int A_stride2, \n");
444 source.append(
" __global ");source.append(numeric_string);source.append(
" * B, \n");
445 source.append(
" unsigned int B_start1, unsigned int B_start2, \n");
446 source.append(
" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
447 source.append(
" unsigned int B_stride1, unsigned int B_stride2) \n");
448 source.append(
"{ \n");
// Rows are distributed over work-groups and columns over the work-items of a
// group.
449 source.append(
" for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
450 source.append(
" { \n");
451 source.append(
" for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
452 source.append(
" { \n");
// Two alternative forms of the copy statement: the first index scaled by
// internal_size2, or the second index scaled by internal_size1.
// NOTE(review): the selector lines are missing from this listing (presumably
// if/else on is_row_major) - confirm.
454 source.append(
" B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)]; \n");
456 source.append(
" B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1]; \n");
457 source.append(
" } \n");
458 source.append(
" } \n");
459 source.append(
"} \n");
471 template<
typename NumericT>
// Start of a helper templated on two scalar types. NOTE(review): the function
// header is missing from this listing; the call sites further down suggest it
// is generate_ambm_impl2(handler, prefix, parameters, ASSIGN_OP, x, y, a, z, b)
// - confirm.
476 template<
typename ScalarT1,
typename ScalarT2>
// Registers 16 statements with handler, one per combination of the four
// boolean arguments of scheduler::preset::avbv. Every entry uses
// ds::matrix_axpy_template(parameters). The key is prefix followed by four
// characters, each 1 when the corresponding boolean is true: first and second
// character for the two flags following a, third and fourth for the two flags
// following b. The avbv signature in the cross-reference text of this listing
// names them flip_a, reciprocal_a, flip_b and reciprocal_b.
// Group 1: both flags of b are false.
483 handler.
add(prefix +
"0000", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
false,
false));
484 handler.
add(prefix +
"1000", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
false,
false));
485 handler.
add(prefix +
"0100", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
false,
false));
486 handler.
add(prefix +
"1100", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
false,
false));
// Group 2: only the first flag of b is true.
489 handler.
add(prefix +
"0010", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
true,
false));
490 handler.
add(prefix +
"1010", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
true,
false));
491 handler.
add(prefix +
"0110", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
true,
false));
492 handler.
add(prefix +
"1110", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
true,
false));
// Group 3: only the second flag of b is true.
494 handler.
add(prefix +
"0001", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
false,
true));
495 handler.
add(prefix +
"1001", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
false,
true));
496 handler.
add(prefix +
"0101", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
false,
true));
497 handler.
add(prefix +
"1101", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
false,
true));
// Group 4: both flags of b are true.
499 handler.
add(prefix +
"0011", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
true,
true));
500 handler.
add(prefix +
"1011", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
true,
true));
501 handler.
add(prefix +
"0111", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
true,
true));
502 handler.
add(prefix +
"1111", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
true,
true));
// Start of a helper templated on one scalar type. NOTE(review): the function
// header is missing from this listing; the call sites further down suggest it
// is generate_ambm_impl(handler, prefix, parameters, ASSIGN_OP, x, y, ha, da,
// z, hb, db) - confirm.
506 template<
typename ScalarT>
// Calls generate_ambm_impl2 once for each pairing of the first scalar (ha or
// da) with the second scalar (hb or db). The key prefix is extended by hm for
// an h-argument and dm for a d-argument, first scalar first.
// NOTE(review): presumably h means host-side and d device-side scalar - the
// declarations of ha, da, hb and db are missing from this listing; confirm.
516 generate_ambm_impl2(handler, prefix +
"hmhm_", parameters, ASSIGN_OP, x, y, ha, z, hb);
517 generate_ambm_impl2(handler, prefix +
"dmhm_", parameters, ASSIGN_OP, x, y, da, z, hb);
518 generate_ambm_impl2(handler, prefix +
"hmdm_", parameters, ASSIGN_OP, x, y, ha, z, db);
519 generate_ambm_impl2(handler, prefix +
"dmdm_", parameters, ASSIGN_OP, x, y, da, z, db);
// Fragment of an execution_handler accessor; its header, braces and several
// local declarations are missing from this listing.
// Handlers are cached in handlers_map under the key (is_row_major, h).
// NOTE(review): h is presumably the raw cl_context handle of ctx - its
// declaration is missing from this listing; confirm.
528 std::pair<bool, cl_context> key(is_row_major, h);
// The statements below populate the cache on the first request for a key.
// NOTE(review): the braces delimiting the body of this if are not shown.
529 if (handlers_map.find(key) == handlers_map.end())
536 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
537 ds::execution_handler & handler = handlers_map.at(key);
// Device-specific tuning parameters for NumericT are taken from the built-in
// database.
539 ds::matrix_axpy_template::parameters_type
matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
540 ds::vector_axpy_template::parameters_type
vector_axpy_params = ds::builtin_database::vector_axpy_params<NumericT>(device);
568 unsigned int hui = 0;
// Registers the ambm statement families under the prefixes assign_ and ip_add_
// for plain assignment and in-place addition respectively.
571 generate_ambm_impl(handler,
"assign_", matrix_axpy_params,
scheduler::OPERATION_BINARY_ASSIGN_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
572 generate_ambm_impl(handler,
"ip_add_", matrix_axpy_params,
scheduler::OPERATION_BINARY_INPLACE_ADD_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
// Returns the cached handler for this key.
581 return handlers_map.at(key);
// Fragment of a second class template with an execution_handler accessor; the
// class header, braces and several local declarations are missing from this
// listing. NOTE(review): presumably the class for element-wise kernels -
// confirm.
587 template<
typename NumericT>
// Handlers are cached in handlers_map under the key (is_row_major, h).
596 std::pair<bool, cl_context> key(is_row_major, h);
// The statements below populate the cache on the first request for a key.
// NOTE(review): the braces delimiting the body of this if are not shown.
597 if (handlers_map.find(key) == handlers_map.end())
602 using namespace scheduler;
608 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
609 ds::execution_handler & handler = handlers_map.at(key);
610 ds::matrix_axpy_template::parameters_type
matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
// Local helper macro: registers the unary element-wise statement for OPTYPE on
// (&A, &B) under the key operator_string(OPTYPE).
632 #define VIENNACL_ADD_UNARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::unary_element_op(&A, &B, OPTYPE))
// Some registrations are restricted to the floating point types float and
// double. NOTE(review): the guarded statements are missing from this listing.
633 if (numeric_string ==
"float" || numeric_string ==
"double")
656 #undef VIENNACL_ADD_UNARY
// Local helper macro: registers the binary element-wise statement for OPTYPE
// on (&A, &B, &C) under the key operator_string(OPTYPE).
659 #define VIENNACL_ADD_BINARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::binary_element_op(&A, &B, &C, OPTYPE))
662 if (numeric_string ==
"float" || numeric_string ==
"double")
666 #undef VIENNACL_ADD_BINARY
// Returns the cached handler for this key.
669 return handlers_map.at(key);
// Fragment of a class template whose execution_handler caches one handler per
// OpenCL context; the class header, braces and the declaration of key are
// missing from this listing.
675 template<
typename NumericT>
// Unlike the caches keyed by (is_row_major, context), this map is keyed by the
// context handle alone.
681 static std::map<cl_context, device_specific::execution_handler> handlers_map;
// The statements below populate the cache on the first request for a key.
// NOTE(review): the braces delimiting the body of this if are not shown.
683 if (handlers_map.find(key) == handlers_map.end())
690 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
691 ds::execution_handler & handler = handlers_map.at(key);
// Registers the matrix-vector product with transposed A (key mat_vec_T) and
// with non-transposed A (key mat_vec_N). Each uses the row-wise reduction
// template with the device-specific parameters for the same T or N tag.
696 handler.add(
"mat_vec_T", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device,
'T'),
'T'),
scheduler::preset::mat_vec_prod(&A,
true, &x, &y));
697 handler.add(
"mat_vec_N", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device,
'N'),
'N'),
scheduler::preset::mat_vec_prod(&A,
false, &x, &y));
// Returns the cached handler for this key.
700 return handlers_map.at(key);
// Fragment of a class template whose execution_handler registers the
// matrix-matrix product statements; the class header, braces and several local
// declarations are missing from this listing.
705 template<
typename NumericT>
// Handlers are cached in handlers_map under the key (is_row_major, h).
713 std::pair<bool, cl_context> key(is_row_major, h);
// The statements below populate the cache on the first request for a key.
// NOTE(review): the braces delimiting the body of this if are not shown.
714 if (handlers_map.find(key) == handlers_map.end())
721 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
722 ds::execution_handler & handler = handlers_map.at(key);
// Device-specific tuning parameters, one set per combination of the
// transposition tags N and T for the two factors.
724 ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device,
'N',
'N');
725 ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device,
'T',
'N');
726 ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device,
'N',
'T');
727 ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device,
'T',
'T');
// One statement per combination, under the keys prod_NN, prod_TN, prod_NT and
// prod_TT. The boolean after &A is true exactly when the first tag is T, and
// the boolean after &B is true exactly when the second tag is T.
742 handler.add(
"prod_NN", ds::matrix_product_template(matrix_product_params_NN,
'N',
'N'),
scheduler::preset::mat_mat_prod(alpha, &A,
false, &B,
false, beta, &C));
743 handler.add(
"prod_TN", ds::matrix_product_template(matrix_product_params_TN,
'T',
'N'),
scheduler::preset::mat_mat_prod(alpha, &A,
true, &B,
false, beta, &C));
744 handler.add(
"prod_NT", ds::matrix_product_template(matrix_product_params_NT,
'N',
'T'),
scheduler::preset::mat_mat_prod(alpha, &A,
false, &B,
true, beta, &C));
745 handler.add(
"prod_TT", ds::matrix_product_template(matrix_product_params_TT,
'T',
'T'),
scheduler::preset::mat_mat_prod(alpha, &A,
true, &B,
true, beta, &C));
// Returns the cached handler for this key.
748 return handlers_map.at(key);
// Fragment of a class template parameterized on the numeric type and the
// matrix layout; the class header, the header of its init routine, the braces
// and several statements are missing from this listing.
754 template<
typename NumericT,
typename LayoutT>
// Records, per OpenCL context handle, whether the program has already been
// added to that context.
764 static std::map<cl_context, bool> init_done;
// Pre-allocates the buffer that receives the generated kernel source.
772 source.reserve(8192);
774 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
// Some kernels are generated for float and double only.
// NOTE(review): the guarded statements are missing from this listing.
780 if (numeric_string ==
"float" || numeric_string ==
"double")
788 std::string prog_name = program_name();
// Optional diagnostic output. NOTE(review): the matching endif line is missing
// from this listing.
789 #ifdef VIENNACL_BUILD_INFO
790 std::cout <<
"Creating program " << prog_name << std::endl;
// Hands the generated source to the context under prog_name and marks this
// context as initialized.
792 ctx.add_program(source, prog_name);
793 init_done[ctx.handle().get()] =
true;
viennacl::ocl::device const & current_device() const
Returns the current device.
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
void generate_fft(StringT &source, std::string const &numeric_string, bool is_row_major)
void generate_triangular_substitute_inplace(StringT &source, std::string const &numeric_string, bool is_row_major)
#define VIENNACL_ADD_UNARY(OPTYPE)
statement matrix_diag_from_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
Helper class for checking whether a matrix has a row-major layout.
matrix_axpy_template::parameters_type const & matrix_axpy_params(ocl::device const &device)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
A class representing a compute device (e.g. a GPU)
void add(std::string const &key, template_base const &T, statements_container const &statements)
static device_specific::execution_handler & execution_handler(viennacl::ocl::context &ctx)
scheduler::statement avbv(scheduler::operation_node_type ASSIGN_OP, NumericT const *x, NumericT const *y, ScalarT1 const *a, bool flip_a, bool reciprocal_a, NumericT const *z, ScalarT2 const *b, bool flip_b, bool reciprocal_b)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Main kernel class for generating OpenCL kernels for elementwise operations other than addition and su...
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
static void apply(viennacl::ocl::context const &)
bool with_stride_and_range
const OCL_TYPE & get() const
#define VIENNACL_ADD_BINARY(OPTYPE)
statement mat_vec_prod(viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::vector_base< NumericT > const *x, viennacl::vector_base< NumericT > const *y)
vector_axpy_template::parameters_type const & vector_axpy_params(ocl::device const &device)
static void init(viennacl::ocl::context &ctx)
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
void generate_trans_kernel(StringT &source, std::string const &numeric_string, bool is_row_major)
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
static std::string program_name()
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Configuration struct for generating OpenCL kernels for linear combinations of matrices.
operation_node_type
Enumeration for identifying the possible operations.
void generate_lu(StringT &source, std::string const &numeric_string, bool is_row_major)
statement mat_mat_prod(NumericT alpha, viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::matrix_base< NumericT > const *B, bool B_trans, NumericT beta, viennacl::matrix_base< NumericT > const *C)
statement matrix_diag_to_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
void generate_scaled_rank1_update(StringT &source, std::string const &numeric_string, bool is_row_major, bool alpha_on_cpu)
Representation of an OpenCL kernel in ViennaCL.
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
scheduler::statement diagonal_assign_cpu(matrix_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
std::string type_to_string(viennacl::row_major)
Provides an OpenCL kernel generator.
statement matrix_row(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
statement matrix_column(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
A tag for column-major storage of a dense matrix.
ambm_scalar_type
Enumeration for the scalar type in ambm-like operations.
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
scheduler::statement assign_cpu(vector_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
const char * operator_string(scheduler::operation_node_type type)
Helper class for converting a type to its string representation.
A tag for row-major storage of a dense matrix.
Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc.