1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
33 inline __device__
unsigned int row_major_index(
unsigned int row,
unsigned int col,
unsigned int ,
unsigned int num_cols)
35 return row*num_cols+col;
37 inline __device__
unsigned int col_major_index(
unsigned int row,
unsigned int col,
unsigned int num_rows,
unsigned int )
39 return row+col*num_rows;
42 template<
typename NumericT>
45 unsigned int A_start1,
unsigned int A_start2,
46 unsigned int A_internal_size1,
unsigned int A_internal_size2,
47 unsigned int A_size1,
unsigned int A_size2,
48 unsigned int A_stride1,
unsigned int A_stride2,
51 unsigned int B_start1,
unsigned int B_start2,
52 unsigned int B_internal_size1,
unsigned int B_internal_size2,
53 unsigned int B_stride1,
unsigned int B_stride2,
56 unsigned int size = A_internal_size2*A_internal_size1;
57 for(
unsigned int i = blockIdx.x; i<size/gridDim.x; i+=gridDim.x)
59 unsigned int matrix_index = i * blockDim.x + threadIdx.x;
61 unsigned int row = matrix_index / A_internal_size2;
62 unsigned int col = matrix_index % A_internal_size2;
63 if (row < A_size1 && col < A_size2)
68 A_start2 + A_stride2 * col,
69 A_internal_size1, A_internal_size2);
71 B_start1 + B_stride1 * row,
72 B_internal_size1, B_internal_size2);
78 A_start2 + A_stride2 * col,
79 A_internal_size1, A_internal_size2);
81 B_start1 + B_stride1 * row,
82 B_internal_size1, B_internal_size2);
95 template<
typename NumericT>
98 unsigned int A_start1,
unsigned int A_start2,
99 unsigned int A_inc1,
unsigned int A_inc2,
100 unsigned int A_size1,
unsigned int A_size2,
101 unsigned int A_internal_size1,
unsigned int A_internal_size2,
104 unsigned int options2,
106 unsigned int B_start1,
unsigned int B_start2,
107 unsigned int B_inc1,
unsigned int B_inc2,
108 unsigned int B_internal_size1,
unsigned int B_internal_size2)
110 NumericT alpha = fac2;
111 if (options2 & (1 << 0))
114 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
115 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
117 if (options2 & (1 << 1))
119 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
120 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
121 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
125 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
126 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
127 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
132 template<
typename NumericT>
135 unsigned int A_start1,
unsigned int A_start2,
136 unsigned int A_inc1,
unsigned int A_inc2,
137 unsigned int A_size1,
unsigned int A_size2,
138 unsigned int A_internal_size1,
unsigned int A_internal_size2,
140 const NumericT * fac2,
141 unsigned int options2,
143 unsigned int B_start1,
unsigned int B_start2,
144 unsigned int B_inc1,
unsigned int B_inc2,
145 unsigned int B_internal_size1,
unsigned int B_internal_size2)
147 NumericT alpha = *fac2;
148 if (options2 & (1 << 0))
151 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
152 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
154 if (options2 & (1 << 1))
156 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
157 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
158 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
162 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
163 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
164 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
174 template<
typename NumericT>
177 unsigned int A_start1,
unsigned int A_start2,
178 unsigned int A_inc1,
unsigned int A_inc2,
179 unsigned int A_size1,
unsigned int A_size2,
180 unsigned int A_internal_size1,
unsigned int A_internal_size2,
183 unsigned int options2,
185 unsigned int B_start1,
unsigned int B_start2,
186 unsigned int B_inc1,
unsigned int B_inc2,
187 unsigned int B_internal_size1,
unsigned int B_internal_size2,
190 unsigned int options3,
192 unsigned int C_start1,
unsigned int C_start2,
193 unsigned int C_inc1,
unsigned int C_inc2,
194 unsigned int C_internal_size1,
unsigned int C_internal_size2)
196 NumericT alpha = fac2;
197 if (options2 & (1 << 0))
200 NumericT beta = fac3;
201 if (options3 & (1 << 0))
204 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
205 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
207 if (options2 & (1 << 1))
209 if (options3 & (1 << 1))
211 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
212 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
213 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
214 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
215 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
219 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
220 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
221 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
222 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
223 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
228 if (options3 & (1 << 1))
230 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
231 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
232 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
233 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
234 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
238 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
239 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
240 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
241 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
242 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
249 template<
typename NumericT>
252 unsigned int A_start1,
unsigned int A_start2,
253 unsigned int A_inc1,
unsigned int A_inc2,
254 unsigned int A_size1,
unsigned int A_size2,
255 unsigned int A_internal_size1,
unsigned int A_internal_size2,
258 unsigned int options2,
260 unsigned int B_start1,
unsigned int B_start2,
261 unsigned int B_inc1,
unsigned int B_inc2,
262 unsigned int B_internal_size1,
unsigned int B_internal_size2,
264 const NumericT * fac3,
265 unsigned int options3,
267 unsigned int C_start1,
unsigned int C_start2,
268 unsigned int C_inc1,
unsigned int C_inc2,
269 unsigned int C_internal_size1,
unsigned int C_internal_size2)
271 NumericT alpha = fac2;
272 if (options2 & (1 << 0))
275 NumericT beta = *fac3;
276 if (options3 & (1 << 0))
279 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
280 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
282 if (options2 & (1 << 1))
284 if (options3 & (1 << 1))
286 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
287 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
288 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
289 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
290 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
294 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
295 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
296 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
297 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
298 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
303 if (options3 & (1 << 1))
305 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
306 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
307 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
308 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
309 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
313 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
314 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
315 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
316 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
317 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
323 template<
typename NumericT>
326 unsigned int A_start1,
unsigned int A_start2,
327 unsigned int A_inc1,
unsigned int A_inc2,
328 unsigned int A_size1,
unsigned int A_size2,
329 unsigned int A_internal_size1,
unsigned int A_internal_size2,
331 const NumericT * fac2,
332 unsigned int options2,
334 unsigned int B_start1,
unsigned int B_start2,
335 unsigned int B_inc1,
unsigned int B_inc2,
336 unsigned int B_internal_size1,
unsigned int B_internal_size2,
339 unsigned int options3,
341 unsigned int C_start1,
unsigned int C_start2,
342 unsigned int C_inc1,
unsigned int C_inc2,
343 unsigned int C_internal_size1,
unsigned int C_internal_size2)
345 NumericT alpha = *fac2;
346 if (options2 & (1 << 0))
349 NumericT beta = fac3;
350 if (options3 & (1 << 0))
353 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
354 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
356 if (options2 & (1 << 1))
358 if (options3 & (1 << 1))
360 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
361 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
362 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
363 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
364 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
368 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
369 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
370 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
371 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
372 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
377 if (options3 & (1 << 1))
379 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
380 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
381 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
382 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
383 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
387 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
388 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
389 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
390 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
391 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
398 template<
typename NumericT>
401 unsigned int A_start1,
unsigned int A_start2,
402 unsigned int A_inc1,
unsigned int A_inc2,
403 unsigned int A_size1,
unsigned int A_size2,
404 unsigned int A_internal_size1,
unsigned int A_internal_size2,
406 const NumericT * fac2,
407 unsigned int options2,
409 unsigned int B_start1,
unsigned int B_start2,
410 unsigned int B_inc1,
unsigned int B_inc2,
411 unsigned int B_internal_size1,
unsigned int B_internal_size2,
413 const NumericT * fac3,
414 unsigned int options3,
416 unsigned int C_start1,
unsigned int C_start2,
417 unsigned int C_inc1,
unsigned int C_inc2,
418 unsigned int C_internal_size1,
unsigned int C_internal_size2)
420 NumericT alpha = *fac2;
421 if (options2 & (1 << 0))
424 NumericT beta = *fac3;
425 if (options3 & (1 << 0))
428 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
429 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
431 if (options2 & (1 << 1))
433 if (options3 & (1 << 1))
435 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
436 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
437 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
438 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
439 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
443 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
444 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
445 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
446 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
447 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
452 if (options3 & (1 << 1))
454 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
455 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
456 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
457 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
458 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
462 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
463 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
464 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
465 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
466 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
477 template<
typename NumericT>
480 unsigned int A_start1,
unsigned int A_start2,
481 unsigned int A_inc1,
unsigned int A_inc2,
482 unsigned int A_size1,
unsigned int A_size2,
483 unsigned int A_internal_size1,
unsigned int A_internal_size2,
486 unsigned int options2,
488 unsigned int B_start1,
unsigned int B_start2,
489 unsigned int B_inc1,
unsigned int B_inc2,
490 unsigned int B_internal_size1,
unsigned int B_internal_size2,
493 unsigned int options3,
495 unsigned int C_start1,
unsigned int C_start2,
496 unsigned int C_inc1,
unsigned int C_inc2,
497 unsigned int C_internal_size1,
unsigned int C_internal_size2)
499 NumericT alpha = fac2;
500 if (options2 & (1 << 0))
503 NumericT beta = fac3;
504 if (options3 & (1 << 0))
507 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
508 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
510 if (options2 & (1 << 1))
512 if (options3 & (1 << 1))
514 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
515 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
516 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
517 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
518 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
522 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
523 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
524 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
525 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
526 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
531 if (options3 & (1 << 1))
533 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
534 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
535 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
536 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
537 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
541 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
542 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
543 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
544 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
545 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
552 template<
typename NumericT>
555 unsigned int A_start1,
unsigned int A_start2,
556 unsigned int A_inc1,
unsigned int A_inc2,
557 unsigned int A_size1,
unsigned int A_size2,
558 unsigned int A_internal_size1,
unsigned int A_internal_size2,
561 unsigned int options2,
563 unsigned int B_start1,
unsigned int B_start2,
564 unsigned int B_inc1,
unsigned int B_inc2,
565 unsigned int B_internal_size1,
unsigned int B_internal_size2,
567 const NumericT * fac3,
568 unsigned int options3,
570 unsigned int C_start1,
unsigned int C_start2,
571 unsigned int C_inc1,
unsigned int C_inc2,
572 unsigned int C_internal_size1,
unsigned int C_internal_size2)
574 NumericT alpha = fac2;
575 if (options2 & (1 << 0))
578 NumericT beta = *fac3;
579 if (options3 & (1 << 0))
582 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
583 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
585 if (options2 & (1 << 1))
587 if (options3 & (1 << 1))
589 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
590 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
591 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
592 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
593 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
597 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
598 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
599 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
600 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
601 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
606 if (options3 & (1 << 1))
608 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
609 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
610 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
611 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
612 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
616 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
617 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
618 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
619 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
620 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
626 template<
typename NumericT>
629 unsigned int A_start1,
unsigned int A_start2,
630 unsigned int A_inc1,
unsigned int A_inc2,
631 unsigned int A_size1,
unsigned int A_size2,
632 unsigned int A_internal_size1,
unsigned int A_internal_size2,
634 const NumericT * fac2,
635 unsigned int options2,
637 unsigned int B_start1,
unsigned int B_start2,
638 unsigned int B_inc1,
unsigned int B_inc2,
639 unsigned int B_internal_size1,
unsigned int B_internal_size2,
642 unsigned int options3,
644 unsigned int C_start1,
unsigned int C_start2,
645 unsigned int C_inc1,
unsigned int C_inc2,
646 unsigned int C_internal_size1,
unsigned int C_internal_size2)
648 NumericT alpha = *fac2;
649 if (options2 & (1 << 0))
652 NumericT beta = fac3;
653 if (options3 & (1 << 0))
656 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
657 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
659 if (options2 & (1 << 1))
661 if (options3 & (1 << 1))
663 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
664 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
665 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
666 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
667 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
671 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
672 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
673 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
674 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
675 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
680 if (options3 & (1 << 1))
682 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
683 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
684 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
685 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
686 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
690 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
691 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
692 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
693 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
694 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
701 template<
typename NumericT>
704 unsigned int A_start1,
unsigned int A_start2,
705 unsigned int A_inc1,
unsigned int A_inc2,
706 unsigned int A_size1,
unsigned int A_size2,
707 unsigned int A_internal_size1,
unsigned int A_internal_size2,
709 const NumericT * fac2,
710 unsigned int options2,
712 unsigned int B_start1,
unsigned int B_start2,
713 unsigned int B_inc1,
unsigned int B_inc2,
714 unsigned int B_internal_size1,
unsigned int B_internal_size2,
716 const NumericT * fac3,
717 unsigned int options3,
719 unsigned int C_start1,
unsigned int C_start2,
720 unsigned int C_inc1,
unsigned int C_inc2,
721 unsigned int C_internal_size1,
unsigned int C_internal_size2)
723 NumericT alpha = *fac2;
724 if (options2 & (1 << 0))
727 NumericT beta = *fac3;
728 if (options3 & (1 << 0))
731 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
732 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
734 if (options2 & (1 << 1))
736 if (options3 & (1 << 1))
738 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
739 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
740 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
741 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
742 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
746 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
747 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
748 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
749 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
750 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
755 if (options3 & (1 << 1))
757 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
758 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
759 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
760 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
761 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
765 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
766 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
767 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
768 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
769 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
778 template<
typename NumericT>
781 unsigned int A_start1,
unsigned int A_start2,
782 unsigned int A_inc1,
unsigned int A_inc2,
783 unsigned int A_size1,
unsigned int A_size2,
784 unsigned int A_internal_size1,
unsigned int A_internal_size2,
787 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
788 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
790 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
791 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
792 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
796 template<
typename NumericT>
799 unsigned int A_start1,
unsigned int A_start2,
800 unsigned int A_inc1,
unsigned int A_inc2,
801 unsigned int A_size1,
unsigned int A_size2,
802 unsigned int A_internal_size1,
unsigned int A_internal_size2,
805 unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
807 for (
unsigned int row = gid;
row < A_size1;
row += blockDim.x * gridDim.x)
808 A[(
row * A_inc1 + A_start1) * A_internal_size2 +
row * A_inc2 + A_start2] = alpha;
815 template<
typename NumericT>
818 unsigned int A_start1,
unsigned int A_start2,
819 unsigned int A_inc1,
unsigned int A_inc2,
820 unsigned int A_size1,
unsigned int A_size2,
821 unsigned int A_internal_size1,
unsigned int A_internal_size2,
824 unsigned int B_start1,
unsigned int B_start2,
825 unsigned int B_inc1,
unsigned int B_inc2,
826 unsigned int B_internal_size1,
unsigned int B_internal_size2,
829 unsigned int C_start1,
unsigned int C_start2,
830 unsigned int C_inc1,
unsigned int C_inc2,
831 unsigned int C_internal_size1,
unsigned int C_internal_size2,
833 unsigned int op_type)
835 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
836 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
840 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
841 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
842 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
843 = pow(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
844 C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
846 else if (op_type == 1)
848 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
849 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
850 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
851 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
852 / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
854 else if (op_type == 0)
856 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
857 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
858 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
859 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
860 * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
864 template<
typename NumericT>
867 unsigned int A_start1,
unsigned int A_start2,
868 unsigned int A_inc1,
unsigned int A_inc2,
869 unsigned int A_size1,
unsigned int A_size2,
870 unsigned int A_internal_size1,
unsigned int A_internal_size2,
873 unsigned int B_start1,
unsigned int B_start2,
874 unsigned int B_inc1,
unsigned int B_inc2,
875 unsigned int B_internal_size1,
unsigned int B_internal_size2,
878 unsigned int C_start1,
unsigned int C_start2,
879 unsigned int C_inc1,
unsigned int C_inc2,
880 unsigned int C_internal_size1,
unsigned int C_internal_size2,
882 unsigned int op_type)
884 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
885 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
889 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
890 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
891 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
892 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
893 / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
895 else if (op_type == 0)
897 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
898 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
899 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
900 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
901 * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
910 template<
typename NumericT>
913 unsigned int A_start1,
unsigned int A_start2,
914 unsigned int A_inc1,
unsigned int A_inc2,
915 unsigned int A_size1,
unsigned int A_size2,
916 unsigned int A_internal_size1,
unsigned int A_internal_size2,
919 unsigned int B_start1,
unsigned int B_start2,
920 unsigned int B_inc1,
unsigned int B_inc2,
921 unsigned int B_internal_size1,
unsigned int B_internal_size2)
923 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
924 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
926 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
927 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
928 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
933 template<
typename NumericT>
936 unsigned int A_start1,
unsigned int A_start2,
937 unsigned int A_inc1,
unsigned int A_inc2,
938 unsigned int A_size1,
unsigned int A_size2,
939 unsigned int A_internal_size1,
unsigned int A_internal_size2,
942 unsigned int B_start1,
unsigned int B_start2,
943 unsigned int B_inc1,
unsigned int B_inc2,
944 unsigned int B_internal_size1,
unsigned int B_internal_size2)
946 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
947 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
949 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
950 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
951 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
956 template<
typename NumericT>
959 unsigned int A_start1,
unsigned int A_start2,
960 unsigned int A_inc1,
unsigned int A_inc2,
961 unsigned int A_size1,
unsigned int A_size2,
962 unsigned int A_internal_size1,
unsigned int A_internal_size2,
965 unsigned int B_start1,
unsigned int B_start2,
966 unsigned int B_inc1,
unsigned int B_inc2,
967 unsigned int B_internal_size1,
unsigned int B_internal_size2)
969 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
970 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
972 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
973 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
974 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
979 template<
typename NumericT>
982 unsigned int A_start1,
unsigned int A_start2,
983 unsigned int A_inc1,
unsigned int A_inc2,
984 unsigned int A_size1,
unsigned int A_size2,
985 unsigned int A_internal_size1,
unsigned int A_internal_size2,
988 unsigned int B_start1,
unsigned int B_start2,
989 unsigned int B_inc1,
unsigned int B_inc2,
990 unsigned int B_internal_size1,
unsigned int B_internal_size2)
992 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
993 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
995 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
996 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
997 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1002 template<
typename NumericT>
1005 unsigned int A_start1,
unsigned int A_start2,
1006 unsigned int A_inc1,
unsigned int A_inc2,
1007 unsigned int A_size1,
unsigned int A_size2,
1008 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1011 unsigned int B_start1,
unsigned int B_start2,
1012 unsigned int B_inc1,
unsigned int B_inc2,
1013 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1015 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1016 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1018 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1019 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1020 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1025 template<
typename NumericT>
1028 unsigned int A_start1,
unsigned int A_start2,
1029 unsigned int A_inc1,
unsigned int A_inc2,
1030 unsigned int A_size1,
unsigned int A_size2,
1031 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1034 unsigned int B_start1,
unsigned int B_start2,
1035 unsigned int B_inc1,
unsigned int B_inc2,
1036 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1038 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1039 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1041 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1042 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1043 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1048 template<
typename NumericT>
1051 unsigned int A_start1,
unsigned int A_start2,
1052 unsigned int A_inc1,
unsigned int A_inc2,
1053 unsigned int A_size1,
unsigned int A_size2,
1054 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1057 unsigned int B_start1,
unsigned int B_start2,
1058 unsigned int B_inc1,
unsigned int B_inc2,
1059 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1061 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1062 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1064 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1065 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1066 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1071 template<
typename NumericT>
1074 unsigned int A_start1,
unsigned int A_start2,
1075 unsigned int A_inc1,
unsigned int A_inc2,
1076 unsigned int A_size1,
unsigned int A_size2,
1077 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1080 unsigned int B_start1,
unsigned int B_start2,
1081 unsigned int B_inc1,
unsigned int B_inc2,
1082 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1084 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1085 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1087 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1088 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1089 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1094 template<
typename NumericT>
1097 unsigned int A_start1,
unsigned int A_start2,
1098 unsigned int A_inc1,
unsigned int A_inc2,
1099 unsigned int A_size1,
unsigned int A_size2,
1100 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1103 unsigned int B_start1,
unsigned int B_start2,
1104 unsigned int B_inc1,
unsigned int B_inc2,
1105 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1107 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1108 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1110 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1111 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1112 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1117 template<
typename NumericT>
1120 unsigned int A_start1,
unsigned int A_start2,
1121 unsigned int A_inc1,
unsigned int A_inc2,
1122 unsigned int A_size1,
unsigned int A_size2,
1123 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1126 unsigned int B_start1,
unsigned int B_start2,
1127 unsigned int B_inc1,
unsigned int B_inc2,
1128 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1130 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1131 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1133 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1134 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1135 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1140 template<
typename NumericT>
1143 unsigned int A_start1,
unsigned int A_start2,
1144 unsigned int A_inc1,
unsigned int A_inc2,
1145 unsigned int A_size1,
unsigned int A_size2,
1146 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1149 unsigned int B_start1,
unsigned int B_start2,
1150 unsigned int B_inc1,
unsigned int B_inc2,
1151 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1153 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1154 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1156 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1157 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1158 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1163 template<
typename NumericT>
1166 unsigned int A_start1,
unsigned int A_start2,
1167 unsigned int A_inc1,
unsigned int A_inc2,
1168 unsigned int A_size1,
unsigned int A_size2,
1169 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1172 unsigned int B_start1,
unsigned int B_start2,
1173 unsigned int B_inc1,
unsigned int B_inc2,
1174 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1176 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1177 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1179 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1180 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1181 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1186 template<
typename NumericT>
1189 unsigned int A_start1,
unsigned int A_start2,
1190 unsigned int A_inc1,
unsigned int A_inc2,
1191 unsigned int A_size1,
unsigned int A_size2,
1192 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1195 unsigned int B_start1,
unsigned int B_start2,
1196 unsigned int B_inc1,
unsigned int B_inc2,
1197 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1199 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1200 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1202 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1203 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1204 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1209 template<
typename NumericT>
1212 unsigned int A_start1,
unsigned int A_start2,
1213 unsigned int A_inc1,
unsigned int A_inc2,
1214 unsigned int A_size1,
unsigned int A_size2,
1215 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1218 unsigned int B_start1,
unsigned int B_start2,
1219 unsigned int B_inc1,
unsigned int B_inc2,
1220 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1222 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1223 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1225 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1226 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1227 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1232 template<
typename NumericT>
1235 unsigned int A_start1,
unsigned int A_start2,
1236 unsigned int A_inc1,
unsigned int A_inc2,
1237 unsigned int A_size1,
unsigned int A_size2,
1238 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1241 unsigned int B_start1,
unsigned int B_start2,
1242 unsigned int B_inc1,
unsigned int B_inc2,
1243 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1245 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1246 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1248 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1249 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1250 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1255 template<
typename NumericT>
1258 unsigned int A_start1,
unsigned int A_start2,
1259 unsigned int A_inc1,
unsigned int A_inc2,
1260 unsigned int A_size1,
unsigned int A_size2,
1261 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1264 unsigned int B_start1,
unsigned int B_start2,
1265 unsigned int B_inc1,
unsigned int B_inc2,
1266 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1268 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1269 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1271 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1272 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1273 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1278 template<
typename NumericT>
1281 unsigned int A_start1,
unsigned int A_start2,
1282 unsigned int A_inc1,
unsigned int A_inc2,
1283 unsigned int A_size1,
unsigned int A_size2,
1284 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1287 unsigned int B_start1,
unsigned int B_start2,
1288 unsigned int B_inc1,
unsigned int B_inc2,
1289 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1291 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1292 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1294 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1295 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1296 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1305 template<
typename NumericT>
1308 unsigned int A_row_start,
1309 unsigned int A_col_start,
1310 unsigned int A_row_inc,
1311 unsigned int A_col_inc,
1312 unsigned int A_row_size,
1313 unsigned int A_col_size,
1314 unsigned int A_internal_rows,
1315 unsigned int A_internal_cols,
1317 unsigned int v_start,
1319 unsigned int v_size,
1321 unsigned int result_start,
1322 unsigned int result_inc,
1323 unsigned int result_size)
1325 __shared__ NumericT work[128];
1327 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1328 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1329 unsigned int lid = threadIdx.x;
1331 for (
unsigned int row = row_gid;
row < A_row_size;
row += gridDim.x)
1334 for (
unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
1335 dot_prod += A[(
row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
1341 work[lid] += work[lid+
stride];
1345 result[
row * result_inc + result_start] = work[0];
1350 template<
typename NumericT>
1353 unsigned int A_row_start,
1354 unsigned int A_col_start,
1355 unsigned int A_row_inc,
1356 unsigned int A_col_inc,
1357 unsigned int A_row_size,
1358 unsigned int A_col_size,
1359 unsigned int A_internal_rows,
1360 unsigned int A_internal_cols,
1362 unsigned int v_start,
1364 unsigned int v_size,
1366 unsigned int result_start,
1367 unsigned int result_inc,
1368 unsigned int result_size)
1370 for (
unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
row < A_col_size;
row += gridDim.x * blockDim.x)
1373 for (
unsigned int col = 0; col < A_row_size; ++col)
1374 dot_prod += A[(
row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
1375 result[
row * result_inc + result_start] =
dot_prod;
1392 template<
typename NumericT>
1395 unsigned int A_start1,
unsigned int A_start2,
1396 unsigned int A_inc1,
unsigned int A_inc2,
1397 unsigned int A_size1,
unsigned int A_size2,
1398 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1401 unsigned int options2,
1403 const NumericT * vec1,
1408 const NumericT * vec2,
1413 NumericT alpha = val;
1414 if (options2 & (1 << 0))
1416 if (options2 & (1 << 1))
1417 alpha = NumericT(1) / alpha;
1419 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1420 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1422 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1424 NumericT tmp = alpha * vec1[
row * inc1 +
start1];
1425 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1426 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 +
start2];
1432 template<
typename NumericT>
1435 unsigned int A_start1,
unsigned int A_start2,
1436 unsigned int A_inc1,
unsigned int A_inc2,
1437 unsigned int A_size1,
unsigned int A_size2,
1438 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1440 const NumericT * val,
1441 unsigned int options2,
1443 const NumericT * vec1,
1448 const NumericT * vec2,
1453 NumericT alpha = *val;
1454 if (options2 & (1 << 0))
1456 if (options2 & (1 << 1))
1457 alpha = NumericT(1) / alpha;
1459 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1460 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1462 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1464 NumericT tmp = alpha * vec1[
row * inc1 +
start1];
1465 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1466 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 +
start2];
__global__ void element_op_int_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
__global__ void matrix_row_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
result_of::size_type< T >::type start1(T const &obj)
__global__ void ambm_m_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
__global__ void matrix_row_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void trans_vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
__global__ void element_op_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__global__ void scaled_rank1_update_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void trans_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_internal_size1, unsigned int A_internal_size2, unsigned int A_size1, unsigned int A_size2, unsigned int A_stride1, unsigned int A_stride2, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_internal_size1, unsigned int B_internal_size2, unsigned int B_stride1, unsigned int B_stride2, bool data_major)
__global__ void matrix_row_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__device__ unsigned int row_major_index(unsigned int row, unsigned int col, unsigned int, unsigned int num_cols)
__global__ void matrix_row_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void matrix_row_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__device__ unsigned int col_major_index(unsigned int row, unsigned int col, unsigned int num_rows, unsigned int)
__global__ void matrix_row_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)