1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
// Fragment of a templated transpose-style copy: both visible stores write B at
// swapped (col, row) coordinates from the element of A at (row, col).
// NOTE(review): the declaration line, the A/B pointer parameters, the braces and
// the condition that chooses between the two stores are missing from this excerpt.
34 template<
typename NumericT>
37 unsigned int A_start1,
unsigned int A_start2,
38 unsigned int A_internal_size1,
unsigned int A_internal_size2,
39 unsigned int A_size1,
unsigned int A_size2,
40 unsigned int A_stride1,
unsigned int A_stride2,
43 unsigned int B_start1,
unsigned int B_start2,
44 unsigned int B_internal_size1,
unsigned int B_internal_size2,
45 unsigned int B_stride1,
unsigned int B_stride2,
// Rows start at blockIdx.x and advance by gridDim.x.
48 for(
unsigned int row = blockIdx.x;
row<A_size1;
row+=gridDim.x)
// Columns start at threadIdx.x and advance by blockDim.x.
50 for(
unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x)
// Store variant 1: linear offset = index1 * internal_size2 + index2 for both A and B.
53 B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 *
row)] = A[(A_start1 + A_stride1 *
row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
// Store variant 2: linear offset = index1 + index2 * internal_size1 for both A and B.
55 B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 *
row) + (A_start2 + A_stride2 * col) * A_internal_size1];
// Fragment of a templated element-wise scaling routine: the visible stores write
// A = B / alpha or A = B * alpha over an A_size1 x A_size2 index range.
// In this variant alpha is copied directly from fac2 (no dereference).
// NOTE(review): the declaration line, the A/B/fac2 parameters, the braces and any
// 'else' lines are missing from this excerpt; comments cover visible statements only.
65 template<
typename NumericT>
68 unsigned int A_start1,
unsigned int A_start2,
69 unsigned int A_inc1,
unsigned int A_inc2,
70 unsigned int A_size1,
unsigned int A_size2,
71 unsigned int A_internal_size1,
unsigned int A_internal_size2,
74 unsigned int options2,
76 unsigned int B_start1,
unsigned int B_start2,
77 unsigned int B_inc1,
unsigned int B_inc2,
78 unsigned int B_internal_size1,
unsigned int B_internal_size2)
80 NumericT alpha = fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
81 if (options2 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
84 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
85 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: A = B / alpha. Rows advance by gridDim.x, columns by blockDim.x.
87 if (options2 & (1 << 1))
89 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
90 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
91 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
// Second loop nest: A = B * alpha (NOTE(review): presumably the else-branch; no 'else' is visible).
95 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
96 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
97 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
// Fragment of a templated element-wise scaling routine: the visible stores write
// A = B / alpha or A = B * alpha over an A_size1 x A_size2 index range.
// In this variant fac2 is a pointer and alpha is loaded from it once.
// NOTE(review): the declaration line, the A/B pointer parameters, the braces and any
// 'else' lines are missing from this excerpt; comments cover visible statements only.
102 template<
typename NumericT>
105 unsigned int A_start1,
unsigned int A_start2,
106 unsigned int A_inc1,
unsigned int A_inc2,
107 unsigned int A_size1,
unsigned int A_size2,
108 unsigned int A_internal_size1,
unsigned int A_internal_size2,
110 const NumericT * fac2,
111 unsigned int options2,
113 unsigned int B_start1,
unsigned int B_start2,
114 unsigned int B_inc1,
unsigned int B_inc2,
115 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Read the scale factor through the pointer a single time.
117 NumericT alpha = *fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
118 if (options2 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
121 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
122 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: A = B / alpha. Rows advance by gridDim.x, columns by blockDim.x.
124 if (options2 & (1 << 1))
126 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
127 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
128 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
// Second loop nest: A = B * alpha (NOTE(review): presumably the else-branch; no 'else' is visible).
132 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
133 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
134 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
// Fragment of a templated element-wise linear combination: each visible store writes
// A = B (/ or *) alpha + C (/ or *) beta over an A_size1 x A_size2 index range.
// In this variant alpha and beta are copied directly from fac2 and fac3.
// NOTE(review): the declaration line, the A/B/C/fac2/fac3 parameters, the braces and
// any 'else' lines are missing from this excerpt; comments cover visible statements only.
144 template<
typename NumericT>
147 unsigned int A_start1,
unsigned int A_start2,
148 unsigned int A_inc1,
unsigned int A_inc2,
149 unsigned int A_size1,
unsigned int A_size2,
150 unsigned int A_internal_size1,
unsigned int A_internal_size2,
153 unsigned int options2,
155 unsigned int B_start1,
unsigned int B_start2,
156 unsigned int B_inc1,
unsigned int B_inc2,
157 unsigned int B_internal_size1,
unsigned int B_internal_size2,
160 unsigned int options3,
162 unsigned int C_start1,
unsigned int C_start2,
163 unsigned int C_inc1,
unsigned int C_inc2,
164 unsigned int C_internal_size1,
unsigned int C_internal_size2)
166 NumericT alpha = fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
167 if (options2 & (1 << 0))
170 NumericT beta = fac3;
// The statement guarded by bit 0 of options3 is not visible in this excerpt.
171 if (options3 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
174 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
175 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 / options3 is tested before the loop nests below; each nest applies a
// different divide/multiply pairing. Rows advance by gridDim.x, columns by blockDim.x.
177 if (options2 & (1 << 1))
179 if (options3 & (1 << 1))
// A = B / alpha + C / beta
181 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
182 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
183 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
184 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
185 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A = B / alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
189 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
190 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
191 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
192 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
193 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
198 if (options3 & (1 << 1))
// A = B * alpha + C / beta
200 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
201 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
202 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
203 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
204 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A = B * alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
208 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
209 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
210 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
211 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
212 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a templated element-wise linear combination: each visible store writes
// A = B (/ or *) alpha + C (/ or *) beta over an A_size1 x A_size2 index range.
// In this variant alpha is copied directly from fac2 and beta is loaded through the
// fac3 pointer.
// NOTE(review): the declaration line, the A/B/C/fac2 parameters, the braces and any
// 'else' lines are missing from this excerpt; comments cover visible statements only.
219 template<
typename NumericT>
222 unsigned int A_start1,
unsigned int A_start2,
223 unsigned int A_inc1,
unsigned int A_inc2,
224 unsigned int A_size1,
unsigned int A_size2,
225 unsigned int A_internal_size1,
unsigned int A_internal_size2,
228 unsigned int options2,
230 unsigned int B_start1,
unsigned int B_start2,
231 unsigned int B_inc1,
unsigned int B_inc2,
232 unsigned int B_internal_size1,
unsigned int B_internal_size2,
234 const NumericT * fac3,
235 unsigned int options3,
237 unsigned int C_start1,
unsigned int C_start2,
238 unsigned int C_inc1,
unsigned int C_inc2,
239 unsigned int C_internal_size1,
unsigned int C_internal_size2)
241 NumericT alpha = fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
242 if (options2 & (1 << 0))
// Read the second scale factor through the pointer a single time.
245 NumericT beta = *fac3;
// The statement guarded by bit 0 of options3 is not visible in this excerpt.
246 if (options3 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
249 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
250 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 / options3 is tested before the loop nests below; each nest applies a
// different divide/multiply pairing. Rows advance by gridDim.x, columns by blockDim.x.
252 if (options2 & (1 << 1))
254 if (options3 & (1 << 1))
// A = B / alpha + C / beta
256 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
257 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
258 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
259 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
260 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A = B / alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
264 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
265 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
266 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
267 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
268 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
273 if (options3 & (1 << 1))
// A = B * alpha + C / beta
275 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
276 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
277 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
278 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
279 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A = B * alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
283 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
284 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
285 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
286 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
287 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a templated element-wise linear combination: each visible store writes
// A = B (/ or *) alpha + C (/ or *) beta over an A_size1 x A_size2 index range.
// In this variant alpha is loaded through the fac2 pointer and beta is copied directly
// from fac3.
// NOTE(review): the declaration line, the A/B/C/fac3 parameters, the braces and any
// 'else' lines are missing from this excerpt; comments cover visible statements only.
293 template<
typename NumericT>
296 unsigned int A_start1,
unsigned int A_start2,
297 unsigned int A_inc1,
unsigned int A_inc2,
298 unsigned int A_size1,
unsigned int A_size2,
299 unsigned int A_internal_size1,
unsigned int A_internal_size2,
301 const NumericT * fac2,
302 unsigned int options2,
304 unsigned int B_start1,
unsigned int B_start2,
305 unsigned int B_inc1,
unsigned int B_inc2,
306 unsigned int B_internal_size1,
unsigned int B_internal_size2,
309 unsigned int options3,
311 unsigned int C_start1,
unsigned int C_start2,
312 unsigned int C_inc1,
unsigned int C_inc2,
313 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Read the first scale factor through the pointer a single time.
315 NumericT alpha = *fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
316 if (options2 & (1 << 0))
319 NumericT beta = fac3;
// The statement guarded by bit 0 of options3 is not visible in this excerpt.
320 if (options3 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
323 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
324 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 / options3 is tested before the loop nests below; each nest applies a
// different divide/multiply pairing. Rows advance by gridDim.x, columns by blockDim.x.
326 if (options2 & (1 << 1))
328 if (options3 & (1 << 1))
// A = B / alpha + C / beta
330 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
331 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
332 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
333 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
334 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A = B / alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
338 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
339 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
340 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
341 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
342 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
347 if (options3 & (1 << 1))
// A = B * alpha + C / beta
349 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
350 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
351 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
352 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
353 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A = B * alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
357 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
358 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
359 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
360 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
361 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a templated element-wise linear combination: each visible store writes
// A = B (/ or *) alpha + C (/ or *) beta over an A_size1 x A_size2 index range.
// In this variant both alpha and beta are loaded through pointers (fac2, fac3).
// NOTE(review): the declaration line, the A/B/C pointer parameters, the braces and any
// 'else' lines are missing from this excerpt; comments cover visible statements only.
368 template<
typename NumericT>
371 unsigned int A_start1,
unsigned int A_start2,
372 unsigned int A_inc1,
unsigned int A_inc2,
373 unsigned int A_size1,
unsigned int A_size2,
374 unsigned int A_internal_size1,
unsigned int A_internal_size2,
376 const NumericT * fac2,
377 unsigned int options2,
379 unsigned int B_start1,
unsigned int B_start2,
380 unsigned int B_inc1,
unsigned int B_inc2,
381 unsigned int B_internal_size1,
unsigned int B_internal_size2,
383 const NumericT * fac3,
384 unsigned int options3,
386 unsigned int C_start1,
unsigned int C_start2,
387 unsigned int C_inc1,
unsigned int C_inc2,
388 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Read the first scale factor through the pointer a single time.
390 NumericT alpha = *fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
391 if (options2 & (1 << 0))
// Read the second scale factor through the pointer a single time.
394 NumericT beta = *fac3;
// The statement guarded by bit 0 of options3 is not visible in this excerpt.
395 if (options3 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
398 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
399 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 / options3 is tested before the loop nests below; each nest applies a
// different divide/multiply pairing. Rows advance by gridDim.x, columns by blockDim.x.
401 if (options2 & (1 << 1))
403 if (options3 & (1 << 1))
// A = B / alpha + C / beta
405 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
406 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
407 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
408 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
409 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A = B / alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
413 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
414 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
415 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
416 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
417 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
422 if (options3 & (1 << 1))
// A = B * alpha + C / beta
424 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
425 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
426 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
427 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
428 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A = B * alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
432 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
433 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
434 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
435 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
436 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a templated accumulating linear combination: each visible store performs
// A += B (/ or *) alpha + C (/ or *) beta over an A_size1 x A_size2 index range.
// In this variant alpha and beta are copied directly from fac2 and fac3.
// NOTE(review): the declaration line, the A/B/C/fac2/fac3 parameters, the braces and
// any 'else' lines are missing from this excerpt; comments cover visible statements only.
447 template<
typename NumericT>
450 unsigned int A_start1,
unsigned int A_start2,
451 unsigned int A_inc1,
unsigned int A_inc2,
452 unsigned int A_size1,
unsigned int A_size2,
453 unsigned int A_internal_size1,
unsigned int A_internal_size2,
456 unsigned int options2,
458 unsigned int B_start1,
unsigned int B_start2,
459 unsigned int B_inc1,
unsigned int B_inc2,
460 unsigned int B_internal_size1,
unsigned int B_internal_size2,
463 unsigned int options3,
465 unsigned int C_start1,
unsigned int C_start2,
466 unsigned int C_inc1,
unsigned int C_inc2,
467 unsigned int C_internal_size1,
unsigned int C_internal_size2)
469 NumericT alpha = fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
470 if (options2 & (1 << 0))
473 NumericT beta = fac3;
// The statement guarded by bit 0 of options3 is not visible in this excerpt.
474 if (options3 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
477 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
478 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 / options3 is tested before the loop nests below; each nest applies a
// different divide/multiply pairing. Rows advance by gridDim.x, columns by blockDim.x.
480 if (options2 & (1 << 1))
482 if (options3 & (1 << 1))
// A += B / alpha + C / beta
484 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
485 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
486 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
487 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
488 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A += B / alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
492 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
493 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
494 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
495 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
496 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
501 if (options3 & (1 << 1))
// A += B * alpha + C / beta
503 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
504 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
505 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
506 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
507 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A += B * alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
511 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
512 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
513 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
514 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
515 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a templated accumulating linear combination: each visible store performs
// A += B (/ or *) alpha + C (/ or *) beta over an A_size1 x A_size2 index range.
// In this variant alpha is copied directly from fac2 and beta is loaded through the
// fac3 pointer.
// NOTE(review): the declaration line, the A/B/C/fac2 parameters, the braces and any
// 'else' lines are missing from this excerpt; comments cover visible statements only.
522 template<
typename NumericT>
525 unsigned int A_start1,
unsigned int A_start2,
526 unsigned int A_inc1,
unsigned int A_inc2,
527 unsigned int A_size1,
unsigned int A_size2,
528 unsigned int A_internal_size1,
unsigned int A_internal_size2,
531 unsigned int options2,
533 unsigned int B_start1,
unsigned int B_start2,
534 unsigned int B_inc1,
unsigned int B_inc2,
535 unsigned int B_internal_size1,
unsigned int B_internal_size2,
537 const NumericT * fac3,
538 unsigned int options3,
540 unsigned int C_start1,
unsigned int C_start2,
541 unsigned int C_inc1,
unsigned int C_inc2,
542 unsigned int C_internal_size1,
unsigned int C_internal_size2)
544 NumericT alpha = fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
545 if (options2 & (1 << 0))
// Read the second scale factor through the pointer a single time.
548 NumericT beta = *fac3;
// The statement guarded by bit 0 of options3 is not visible in this excerpt.
549 if (options3 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
552 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
553 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 / options3 is tested before the loop nests below; each nest applies a
// different divide/multiply pairing. Rows advance by gridDim.x, columns by blockDim.x.
555 if (options2 & (1 << 1))
557 if (options3 & (1 << 1))
// A += B / alpha + C / beta
559 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
560 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
561 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
562 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
563 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A += B / alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
567 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
568 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
569 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
570 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
571 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
576 if (options3 & (1 << 1))
// A += B * alpha + C / beta
578 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
579 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
580 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
581 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
582 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A += B * alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
586 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
587 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
588 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
589 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
590 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a templated accumulating linear combination: each visible store performs
// A += B (/ or *) alpha + C (/ or *) beta over an A_size1 x A_size2 index range.
// In this variant alpha is loaded through the fac2 pointer and beta is copied directly
// from fac3.
// NOTE(review): the declaration line, the A/B/C/fac3 parameters, the braces and any
// 'else' lines are missing from this excerpt; comments cover visible statements only.
596 template<
typename NumericT>
599 unsigned int A_start1,
unsigned int A_start2,
600 unsigned int A_inc1,
unsigned int A_inc2,
601 unsigned int A_size1,
unsigned int A_size2,
602 unsigned int A_internal_size1,
unsigned int A_internal_size2,
604 const NumericT * fac2,
605 unsigned int options2,
607 unsigned int B_start1,
unsigned int B_start2,
608 unsigned int B_inc1,
unsigned int B_inc2,
609 unsigned int B_internal_size1,
unsigned int B_internal_size2,
612 unsigned int options3,
614 unsigned int C_start1,
unsigned int C_start2,
615 unsigned int C_inc1,
unsigned int C_inc2,
616 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Read the first scale factor through the pointer a single time.
618 NumericT alpha = *fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
619 if (options2 & (1 << 0))
622 NumericT beta = fac3;
// The statement guarded by bit 0 of options3 is not visible in this excerpt.
623 if (options3 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
626 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
627 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 / options3 is tested before the loop nests below; each nest applies a
// different divide/multiply pairing. Rows advance by gridDim.x, columns by blockDim.x.
629 if (options2 & (1 << 1))
631 if (options3 & (1 << 1))
// A += B / alpha + C / beta
633 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
634 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
635 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
636 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
637 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A += B / alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
641 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
642 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
643 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
644 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
645 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
650 if (options3 & (1 << 1))
// A += B * alpha + C / beta
652 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
653 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
654 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
655 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
656 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A += B * alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
660 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
661 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
662 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
663 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
664 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a templated accumulating linear combination: each visible store performs
// A += B (/ or *) alpha + C (/ or *) beta over an A_size1 x A_size2 index range.
// In this variant both alpha and beta are loaded through pointers (fac2, fac3).
// NOTE(review): the declaration line, the A/B/C pointer parameters, the braces and any
// 'else' lines are missing from this excerpt; comments cover visible statements only.
671 template<
typename NumericT>
674 unsigned int A_start1,
unsigned int A_start2,
675 unsigned int A_inc1,
unsigned int A_inc2,
676 unsigned int A_size1,
unsigned int A_size2,
677 unsigned int A_internal_size1,
unsigned int A_internal_size2,
679 const NumericT * fac2,
680 unsigned int options2,
682 unsigned int B_start1,
unsigned int B_start2,
683 unsigned int B_inc1,
unsigned int B_inc2,
684 unsigned int B_internal_size1,
unsigned int B_internal_size2,
686 const NumericT * fac3,
687 unsigned int options3,
689 unsigned int C_start1,
unsigned int C_start2,
690 unsigned int C_inc1,
unsigned int C_inc2,
691 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Read the first scale factor through the pointer a single time.
693 NumericT alpha = *fac2;
// The statement guarded by bit 0 of options2 is not visible in this excerpt.
694 if (options2 & (1 << 0))
// Read the second scale factor through the pointer a single time.
697 NumericT beta = *fac3;
// The statement guarded by bit 0 of options3 is not visible in this excerpt.
698 if (options3 & (1 << 0))
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
701 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
702 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 / options3 is tested before the loop nests below; each nest applies a
// different divide/multiply pairing. Rows advance by gridDim.x, columns by blockDim.x.
704 if (options2 & (1 << 1))
706 if (options3 & (1 << 1))
// A += B / alpha + C / beta
708 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
709 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
710 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
711 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
712 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A += B / alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
716 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
717 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
718 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
719 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
720 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
725 if (options3 & (1 << 1))
// A += B * alpha + C / beta
727 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
728 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
729 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
730 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
731 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// A += B * alpha + C * beta (NOTE(review): presumably an else-branch; no 'else' is visible)
735 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
736 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
737 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
738 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
739 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a templated fill routine: the visible store writes the value alpha to
// every addressed element of A over an A_size1 x A_size2 index range.
// NOTE(review): the declaration line, the A pointer and alpha parameters and the braces
// are missing from this excerpt; comments cover visible statements only.
748 template<
typename NumericT>
751 unsigned int A_start1,
unsigned int A_start2,
752 unsigned int A_inc1,
unsigned int A_inc2,
753 unsigned int A_size1,
unsigned int A_size2,
754 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
757 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
758 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows advance by gridDim.x, columns by blockDim.x.
760 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
761 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
// Element offset = (row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2.
762 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
// Fragment of a templated diagonal fill routine: the visible store writes alpha to the
// elements whose first and second index are both `row`, for row < A_size1.
// A_size2 is not referenced by any visible statement.
// NOTE(review): the declaration line, the A pointer and alpha parameters and the braces
// are missing from this excerpt; comments cover visible statements only.
766 template<
typename NumericT>
769 unsigned int A_start1,
unsigned int A_start2,
770 unsigned int A_inc1,
unsigned int A_inc2,
771 unsigned int A_size1,
unsigned int A_size2,
772 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Flat thread id across the whole 1D launch.
775 unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
// Each thread starts at its flat id and advances by the total thread count
// (blockDim.x * gridDim.x).
777 for (
unsigned int row = gid;
row < A_size1;
row += blockDim.x * gridDim.x)
// The same loop variable is used for both index positions, so only (row, row) is written.
778 A[(
row * A_inc1 + A_start1) * A_internal_size2 +
row * A_inc2 + A_start2] = alpha;
// Fragment of a templated element-wise binary operation: the three visible loop nests
// write A = pow(B, C), A = B / C and A = B * C over an A_size1 x A_size2 index range.
// The visible conditions select division for op_type == 1 and multiplication for
// op_type == 0; the condition guarding the pow nest is not visible.
// NOTE(review): the declaration line, the A/B/C pointer parameters and the braces are
// missing from this excerpt; comments cover visible statements only.
785 template<
typename NumericT>
788 unsigned int A_start1,
unsigned int A_start2,
789 unsigned int A_inc1,
unsigned int A_inc2,
790 unsigned int A_size1,
unsigned int A_size2,
791 unsigned int A_internal_size1,
unsigned int A_internal_size2,
794 unsigned int B_start1,
unsigned int B_start2,
795 unsigned int B_inc1,
unsigned int B_inc2,
796 unsigned int B_internal_size1,
unsigned int B_internal_size2,
799 unsigned int C_start1,
unsigned int C_start2,
800 unsigned int C_inc1,
unsigned int C_inc2,
801 unsigned int C_internal_size1,
unsigned int C_internal_size2,
803 unsigned int op_type)
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
805 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
806 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// A = pow(B, C). NOTE(review): the guarding condition is missing here — confirm which
// op_type value selects this nest.
810 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
811 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
812 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
813 = pow(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
814 C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
// op_type == 1: A = B / C.
816 else if (op_type == 1)
818 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
819 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
820 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
821 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
822 / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
// op_type == 0: A = B * C.
824 else if (op_type == 0)
826 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
827 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
828 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
829 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
830 * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
// Fragment of a templated element-wise binary operation: the two visible loop nests
// write A = B / C and A = B * C over an A_size1 x A_size2 index range. No pow nest is
// visible in this variant.
// The visible condition selects multiplication for op_type == 0; the condition guarding
// the division nest is not visible.
// NOTE(review): the declaration line, the A/B/C pointer parameters and the braces are
// missing from this excerpt; comments cover visible statements only.
834 template<
typename NumericT>
837 unsigned int A_start1,
unsigned int A_start2,
838 unsigned int A_inc1,
unsigned int A_inc2,
839 unsigned int A_size1,
unsigned int A_size2,
840 unsigned int A_internal_size1,
unsigned int A_internal_size2,
843 unsigned int B_start1,
unsigned int B_start2,
844 unsigned int B_inc1,
unsigned int B_inc2,
845 unsigned int B_internal_size1,
unsigned int B_internal_size2,
848 unsigned int C_start1,
unsigned int C_start2,
849 unsigned int C_inc1,
unsigned int C_inc2,
850 unsigned int C_internal_size1,
unsigned int C_internal_size2,
852 unsigned int op_type)
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
854 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
855 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// A = B / C. NOTE(review): the guarding condition is missing here — confirm which
// op_type value selects this nest.
859 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
860 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
861 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
862 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
863 / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
// op_type == 0: A = B * C.
865 else if (op_type == 0)
867 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
868 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
869 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
870 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
871 * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
// Fragment of a templated element-wise unary operation: the visible store writes
// A = abs(B) over an A_size1 x A_size2 index range.
// NOTE(review): the declaration line, the A/B pointer parameters and the braces are
// missing from this excerpt; comments cover visible statements only.
880 template<
typename NumericT>
883 unsigned int A_start1,
unsigned int A_start2,
884 unsigned int A_inc1,
unsigned int A_inc2,
885 unsigned int A_size1,
unsigned int A_size2,
886 unsigned int A_internal_size1,
unsigned int A_internal_size2,
889 unsigned int B_start1,
unsigned int B_start2,
890 unsigned int B_inc1,
unsigned int B_inc2,
891 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
893 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
894 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows advance by gridDim.x, columns by blockDim.x; A and B use the same
// (row * inc1 + start1) * internal_size2 + col * inc2 + start2 offset formula.
896 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
897 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
898 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
// Fragment of a templated element-wise unary operation: the visible store writes
// A = acos(B) over an A_size1 x A_size2 index range.
// NOTE(review): the declaration line, the A/B pointer parameters and the braces are
// missing from this excerpt; comments cover visible statements only.
903 template<
typename NumericT>
906 unsigned int A_start1,
unsigned int A_start2,
907 unsigned int A_inc1,
unsigned int A_inc2,
908 unsigned int A_size1,
unsigned int A_size2,
909 unsigned int A_internal_size1,
unsigned int A_internal_size2,
912 unsigned int B_start1,
unsigned int B_start2,
913 unsigned int B_inc1,
unsigned int B_inc2,
914 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
916 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
917 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows advance by gridDim.x, columns by blockDim.x; A and B use the same
// (row * inc1 + start1) * internal_size2 + col * inc2 + start2 offset formula.
919 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
920 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
921 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
// Fragment of a templated element-wise unary operation: the visible store writes
// A = asin(B) over an A_size1 x A_size2 index range.
// NOTE(review): the declaration line, the A/B pointer parameters and the braces are
// missing from this excerpt; comments cover visible statements only.
926 template<
typename NumericT>
929 unsigned int A_start1,
unsigned int A_start2,
930 unsigned int A_inc1,
unsigned int A_inc2,
931 unsigned int A_size1,
unsigned int A_size2,
932 unsigned int A_internal_size1,
unsigned int A_internal_size2,
935 unsigned int B_start1,
unsigned int B_start2,
936 unsigned int B_inc1,
unsigned int B_inc2,
937 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
939 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
940 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows advance by gridDim.x, columns by blockDim.x; A and B use the same
// (row * inc1 + start1) * internal_size2 + col * inc2 + start2 offset formula.
942 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
943 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
944 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
// Fragment of a templated element-wise unary operation: the visible store writes
// A = atan(B) over an A_size1 x A_size2 index range.
// NOTE(review): the declaration line, the A/B pointer parameters and the braces are
// missing from this excerpt; comments cover visible statements only.
949 template<
typename NumericT>
952 unsigned int A_start1,
unsigned int A_start2,
953 unsigned int A_inc1,
unsigned int A_inc2,
954 unsigned int A_size1,
unsigned int A_size2,
955 unsigned int A_internal_size1,
unsigned int A_internal_size2,
958 unsigned int B_start1,
unsigned int B_start2,
959 unsigned int B_inc1,
unsigned int B_inc2,
960 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Flat thread id divided by / taken modulo blockDim.x; because threadIdx.x < blockDim.x
// these evaluate to blockIdx.x and threadIdx.x respectively.
962 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
963 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows advance by gridDim.x, columns by blockDim.x; A and B use the same
// (row * inc1 + start1) * internal_size2 + col * inc2 + start2 offset formula.
965 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
966 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
967 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
/** @brief Element-wise ceiling on row-major storage: A(i,j) = ceil(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_ceil_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise cosine on row-major storage: A(i,j) = cos(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_cos_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise hyperbolic cosine on row-major storage: A(i,j) = cosh(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_cosh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise exponential on row-major storage: A(i,j) = exp(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_exp_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise absolute value on row-major storage: A(i,j) = fabs(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_fabs_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise floor on row-major storage: A(i,j) = floor(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_floor_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise natural logarithm on row-major storage: A(i,j) = log(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_log_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise base-10 logarithm on row-major storage: A(i,j) = log10(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_log10_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise sine on row-major storage: A(i,j) = sin(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_sin_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise hyperbolic sine on row-major storage: A(i,j) = sinh(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_sinh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise square root on row-major storage: A(i,j) = sqrt(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_sqrt_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise tangent on row-major storage: A(i,j) = tan(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_tan_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise hyperbolic tangent on row-major storage: A(i,j) = tanh(B(i,j)).
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Destination buffer
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to process
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param B                  Source buffer; B_start*, B_inc* and B_internal_size* play the same roles as for A
*/
template<typename NumericT>
__global__ void matrix_row_element_tanh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Matrix-vector product for row-major A: result(i) = sum_j A(i,j) * v(j).
*
* One thread block cooperates on one row at a time: block b handles rows b, b + gridDim.x, ...
* Each thread accumulates a partial dot product over the columns threadIdx.x, threadIdx.x + blockDim.x, ...,
* the partials are combined by a halving reduction in shared memory, and thread 0 writes the row result.
*
* Launch requirements imposed by the code below: 1-D blocks with blockDim.x <= 128 (size of work[]).
* The halving reduction only sums all partials when blockDim.x is a power of two.
*
* @param A                        Matrix buffer (read only)
* @param A_row_start, A_col_start Index of the first row / column addressed in A
* @param A_row_inc, A_col_inc     Step between consecutive addressed rows / columns of A
* @param A_row_size, A_col_size   Number of rows / columns of the operand
* @param A_internal_rows          Not used by this kernel
* @param A_internal_cols          Number of elements between the starts of two consecutive rows in the A buffer
* @param v                        Vector buffer (read only), entry j is read at v[v_start + v_inc * j]
* @param v_size                   Not used by this kernel
* @param result                   Output buffer, entry i is written at result[result_start + result_inc * i]
* @param result_size              Not used by this kernel
*/
template<typename NumericT>
__global__ void vec_mul_row_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  __shared__ NumericT work[128];

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int lid = threadIdx.x;

  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
  {
    // per-thread partial sum over this thread's share of the columns
    NumericT dot_prod = 0;
    for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
    work[lid] = dot_prod;

    // halving reduction; the barrier comes first so every partial is visible before it is read
    for (unsigned int stride = blockDim.x/2; stride > 0; stride >>= 1)
    {
      __syncthreads();
      if (lid < stride)
        work[lid] += work[lid+stride];
    }

    if (lid == 0)
      result[row * result_inc + result_start] = work[0];

    // All threads of a block share the same row sequence, so this barrier is reached uniformly.
    // It keeps a fast thread from overwriting work[] for the next row while the last
    // reduction step of this row is still reading it.
    __syncthreads();
  }
}
/** @brief Transposed matrix-vector product for row-major A: result(j) = sum_i A(i,j) * v(i).
*
* Each thread computes complete output entries on its own: thread g (global 1-D index) handles the
* matrix columns g, g + gridDim.x * blockDim.x, ... and loops sequentially over all A_row_size rows.
* Note that the loop variable named `row` therefore indexes columns of A, and `col` indexes rows of A.
*
* @param A                        Matrix buffer (read only)
* @param A_row_start, A_col_start Index of the first row / column addressed in A
* @param A_row_inc, A_col_inc     Step between consecutive addressed rows / columns of A
* @param A_row_size, A_col_size   Number of rows / columns of the operand
* @param A_internal_rows          Not used by this kernel
* @param A_internal_cols          Number of elements between the starts of two consecutive rows in the A buffer
* @param v                        Vector buffer (read only), entry i is read at v[v_start + v_inc * i]
* @param v_size                   Not used by this kernel
* @param result                   Output buffer, entry j is written at result[result_start + result_inc * j]
* @param result_size              Not used by this kernel
*/
template<typename NumericT>
__global__ void trans_vec_mul_row_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = 0;
    for (unsigned int col = 0; col < A_row_size; ++col)
      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
    result[row * result_inc + result_start] = dot_prod;
  }
}
/** @brief Scaled rank-1 update on row-major storage: A(i,j) += alpha * vec1(i) * vec2(j), scalar passed by value.
*
* alpha is derived from val according to options2: if bit 0 is set the sign is flipped,
* then if bit 1 is set the reciprocal is taken.
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Matrix buffer, updated in place
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to update
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param val                Scaling factor before the options2 modifiers are applied
* @param options2           Bit field selecting the modifiers applied to val (see above)
* @param vec1               Buffer of the row-indexed vector, entry i is read at vec1[i * inc1 + start1]
* @param size1              Not used by this kernel
* @param vec2               Buffer of the column-indexed vector, entry j is read at vec2[j * inc2 + start2]
* @param size2              Not used by this kernel
*/
template<typename NumericT>
__global__ void scaled_rank1_update_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  NumericT alpha = val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    // the row factor is the same for every column, so it is computed once per row
    NumericT tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}
/** @brief Scaled rank-1 update on row-major storage: A(i,j) += alpha * vec1(i) * vec2(j), scalar read through a pointer.
*
* alpha is derived from *val according to options2: if bit 0 is set the sign is flipped,
* then if bit 1 is set the reciprocal is taken. val is dereferenced inside the kernel, so it
* must point to memory that device code can read.
*
* Work distribution: row_gid/col_gid reduce to blockIdx.x/threadIdx.x for a 1-D launch, so block b
* walks rows b, b + gridDim.x, ... and thread t walks columns t, t + blockDim.x, ... of each row.
*
* @param A                  Matrix buffer, updated in place
* @param A_start1, A_start2 Index of the first row / column addressed in A
* @param A_inc1, A_inc2     Step between consecutive addressed rows / columns of A
* @param A_size1, A_size2   Number of rows / columns to update
* @param A_internal_size1   Not used by this kernel
* @param A_internal_size2   Number of elements between the starts of two consecutive rows in the A buffer
* @param val                Pointer to the scaling factor before the options2 modifiers are applied
* @param options2           Bit field selecting the modifiers applied to *val (see above)
* @param vec1               Buffer of the row-indexed vector, entry i is read at vec1[i * inc1 + start1]
* @param size1              Not used by this kernel
* @param vec2               Buffer of the column-indexed vector, entry j is read at vec2[j * inc2 + start2]
* @param size2              Not used by this kernel
*/
template<typename NumericT>
__global__ void scaled_rank1_update_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  NumericT alpha = *val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    // the row factor is the same for every column, so it is computed once per row
    NumericT tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}
__global__ void element_op_int_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
__global__ void matrix_row_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
result_of::size_type< T >::type start1(T const &obj)
__global__ void ambm_m_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
__global__ void matrix_row_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
__global__ void trans_vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
result_of::size_type< T >::type start2(T const &obj)
__global__ void element_op_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__global__ void scaled_rank1_update_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void trans_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_internal_size1, unsigned int A_internal_size2, unsigned int A_size1, unsigned int A_size2, unsigned int A_stride1, unsigned int A_stride2, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_internal_size1, unsigned int B_internal_size2, unsigned int B_stride1, unsigned int B_stride2, bool data_major)
__global__ void matrix_row_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void matrix_row_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)