#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_

/** @file viennacl/linalg/cuda/vector_operations.hpp
    @brief Implementations of vector operations using CUDA
*/

#include <cmath>
#include <vector>

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/handle.hpp"
#include "viennacl/traits/stride.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          const NumericT * fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0)) // bit 0: flip the sign of alpha
    alpha = -alpha;

  if (options2 & (1 << 1)) // bit 1: divide by alpha instead of multiplying
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          NumericT fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  if (options2 & (1 << 1))
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}
template<typename NumericT, typename ScalarType1>
void av(vector_base<NumericT> & vec1,
        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;

  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                          static_cast<unsigned int>(viennacl::traits::size(vec1)),
                          detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                          options_alpha,
                          detail::cuda_arg<value_type>(vec2),
                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}
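/* Illustrative usage sketch (not part of the original header): av() realizes
   vec1 = alpha * vec2 for strided/offset vectors. A hypothetical call
   computing x = 2*y looks like

     viennacl::vector<float> x(100), y(100);
     viennacl::linalg::cuda::av(x, y, 2.0f, 1, false, false);

   detail::make_options() packs the two flags into the options bitfield read
   by av_kernel above: bit 0 requests a sign flip of alpha, bit 1 requests
   division by alpha (use of the reciprocal). */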
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv(vector_base<NumericT> & vec1,
          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;

  avbv_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                            static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                            detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                            options_alpha,
                            detail::cuda_arg<value_type>(vec2),
                            static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                            detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                            options_beta,
                            detail::cuda_arg<value_type>(vec3),
                            static_cast<unsigned int>(viennacl::traits::start(vec3)), static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
}
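/* Illustrative note: avbv() realizes vec1 = a * vec2 + b * vec3, where a and
   b may independently live on the host or the device and may each be negated
   or inverted. For example, x = y / 2.0 - 3.0 * z corresponds to alpha = 2.0
   with reciprocal_alpha = true and beta = 3.0 with flip_sign_beta = true. */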
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv_v(vector_base<NumericT> & vec1,
            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;

  avbv_v_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                              static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                              detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                              options_alpha,
                              detail::cuda_arg<value_type>(vec2),
                              static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                              detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                              options_beta,
                              detail::cuda_arg<value_type>(vec3),
                              static_cast<unsigned int>(viennacl::traits::start(vec3)), static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
}
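/* Illustrative note: avbv_v() is the accumulating variant, vec1 += a * vec2
   + b * vec3; the four avbv_v_kernel overloads above cover the combinations
   of device-resident (pointer) versus host-supplied (by-value) scalars. */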
template<typename NumericT>
__global__ void vector_assign_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int internal_size1,
                                     NumericT alpha)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < internal_size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = (i < size1) ? alpha : 0; // zero out the padding region
}
template<typename NumericT, typename ScalarT1>
void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
{
  typedef NumericT value_type;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size())
                                          : static_cast<unsigned int>(viennacl::traits::size(vec1));

  vector_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                     static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), size,
                                     static_cast<unsigned int>(vec1.internal_size()),
                                     detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
}
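/* Illustrative usage sketch: vector_assign() writes a constant into every
   entry of a vector (or range/slice); entries beyond the logical size are
   zeroed, and with up_to_internal_size = true the constant is written through
   the padding region as well:

     viennacl::vector<double> x(1000);
     viennacl::linalg::cuda::vector_assign(x, 1.0);  // x[i] = 1.0 for all i
*/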
template<typename NumericT>
__global__ void vector_swap_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                   NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT tmp;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
  {
    tmp = vec2[i*inc2+start2];
    vec2[i*inc2+start2] = vec1[i*inc1+start1];
    vec1[i*inc1+start1] = tmp;
  }
}
template<typename NumericT>
void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
{
  typedef NumericT value_type;

  vector_swap_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                   static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                   detail::cuda_arg<value_type>(vec2),
                                   static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
}
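/* Illustrative usage sketch: vector_swap() exchanges the contents of two
   vectors entirely on the device, e.g.

     viennacl::linalg::cuda::vector_swap(x, y);  // no host round-trip
*/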
template<typename NumericT>
__global__ void element_op_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                  NumericT const * vec2, unsigned int start2, unsigned int inc2,
                                  NumericT const * vec3, unsigned int start3, unsigned int inc3,
                                  unsigned int op_type)
{
  if (op_type == 2)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
  }
  else if (op_type == 1)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}
template<typename NumericT>
__global__ void element_op_int_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      NumericT const * vec2, unsigned int start2, unsigned int inc2,
                                      NumericT const * vec3, unsigned int start3, unsigned int inc3,
                                      unsigned int op_type)
{
  if (op_type == 1)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}
template<typename NumericT, typename OpT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
{
  typedef NumericT value_type;

  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_int_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                      detail::cuda_arg<value_type>(proxy.lhs()),
                                      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
                                      detail::cuda_arg<value_type>(proxy.rhs()),
                                      static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
                                      op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");
}
template<typename OpT>
void element_op(vector_base<float> & vec1,
                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
{
  typedef float value_type;

  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  detail::cuda_arg<value_type>(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
                                  detail::cuda_arg<value_type>(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
                                  op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}
template<typename OpT>
void element_op(vector_base<double> & vec1,
                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
{
  typedef double value_type;

  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  detail::cuda_arg<value_type>(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
                                  detail::cuda_arg<value_type>(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
                                  op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}
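/* Illustrative note on dispatch: the binary element_op() overloads map the
   operation tag OpT to the op_type argument of the kernels above -- 0 for the
   entrywise product, 1 for the entrywise division, 2 for the entrywise power.
   pow() is provided only in the float/double kernel (element_op_kernel); the
   integer kernel omits it. At the user-API level this backs expressions such
   as viennacl::linalg::element_prod(x, y) and element_div(x, y). */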
template<typename NumericT>
__global__ void vec_element_acos_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
}
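/* The unary element_op() overloads below all follow the pattern just shown
   for acos: a one-thread-per-entry kernel applying the respective CUDA math
   function, plus a host wrapper launching it on the expression argument
   proxy.lhs(). */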
template<typename NumericT>
__global__ void vec_element_asin_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
}
template<typename NumericT>
__global__ void vec_element_atan_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
}
template<typename NumericT>
__global__ void vec_element_ceil_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
}
template<typename NumericT>
__global__ void vec_element_cos_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       detail::cuda_arg<value_type>(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
}
template<typename NumericT>
__global__ void vec_element_cosh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
}
template<typename NumericT>
__global__ void vec_element_exp_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       detail::cuda_arg<value_type>(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
}
template<typename NumericT>
__global__ void vec_element_fabs_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
}
template<typename NumericT>
__global__ void vec_element_abs_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       detail::cuda_arg<value_type>(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
}
template<typename NumericT>
__global__ void vec_element_floor_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                         NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                         static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                         detail::cuda_arg<value_type>(proxy.lhs()),
                                         static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
}
template<typename NumericT>
__global__ void vec_element_log_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       detail::cuda_arg<value_type>(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
}
template<typename NumericT>
__global__ void vec_element_log10_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                         NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                         static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                         detail::cuda_arg<value_type>(proxy.lhs()),
                                         static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
}
template<typename NumericT>
__global__ void vec_element_sin_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       detail::cuda_arg<value_type>(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
}
template<typename NumericT>
__global__ void vec_element_sinh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
}
template<typename NumericT>
__global__ void vec_element_sqrt_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
}
template<typename NumericT>
__global__ void vec_element_tan_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       detail::cuda_arg<value_type>(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
}
template<typename NumericT>
__global__ void vec_element_tanh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
}

template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy)
{
  typedef NumericT value_type;

  vec_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        detail::cuda_arg<value_type>(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
}
template<typename NumericT>
__global__ void inner_prod_kernel(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                  const NumericT * vec2, unsigned int start2, unsigned int inc2, unsigned int size2,
                                  NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];
  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;

  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
                               - (  blockIdx.x * size1) / (gridDim.x);

  NumericT tmp = 0;
  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
    tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
  tmp_buffer[threadIdx.x] = tmp;

  // parallel reduction within the block:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
template<typename NumericT>
__global__ void vector_sum_kernel_floats(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                         unsigned int option, //0: use max of moduli, 1: just sum, 2: sum with final sqrt
                                         NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    if (option == 2)
      *result = sqrt(tmp_buffer[0]);
    else
      *result = tmp_buffer[0];
  }
}
template<typename NumericT>
__global__ void vector_sum_kernel_integers(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                           unsigned int option, //0: use max of moduli, 1: just sum
                                           NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
template<typename NumericT>
__global__ void vector_sum_kernel_unsigned_integers(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                                    unsigned int option, //0: use max, 1: just sum
                                                    NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
namespace detail
{
  struct vector_sum_kernel_launcher_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
                                             static_cast<unsigned int>(viennacl::traits::start(temp)), static_cast<unsigned int>(viennacl::traits::stride(temp)), static_cast<unsigned int>(viennacl::traits::size(temp)),
                                             static_cast<unsigned int>(option),
                                             detail::cuda_arg<value_type>(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_integers");
    }
  };

  struct vector_sum_kernel_launcher_unsigned_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_unsigned_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
                                                      static_cast<unsigned int>(viennacl::traits::start(temp)), static_cast<unsigned int>(viennacl::traits::stride(temp)), static_cast<unsigned int>(viennacl::traits::size(temp)),
                                                      static_cast<unsigned int>(option),
                                                      detail::cuda_arg<value_type>(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_unsigned_integers");
    }
  };

  struct vector_sum_kernel_launcher_floats
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_floats<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
                                           static_cast<unsigned int>(viennacl::traits::start(temp)), static_cast<unsigned int>(viennacl::traits::stride(temp)), static_cast<unsigned int>(viennacl::traits::size(temp)),
                                           static_cast<unsigned int>(option),
                                           detail::cuda_arg<value_type>(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_floats");
    }
  };

  template<typename NumericT>
  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned int>   : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};

  template<>
  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
} // namespace detail
template<typename NumericT, typename ScalarT>
void inner_prod_impl(vector_base<NumericT> const & vec1,
                     vector_base<NumericT> const & vec2,
                     ScalarT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  static viennacl::vector<value_type> temp(work_groups);

  inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  detail::cuda_arg<value_type>(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  detail::cuda_arg<value_type>(temp) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
}
template<typename NumericT>
void inner_prod_cpu(vector_base<NumericT> const & vec1,
                    vector_base<NumericT> const & vec2,
                    NumericT & result)
{
  typedef NumericT value_type;

  const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  detail::cuda_arg<value_type>(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  detail::cuda_arg<value_type>(temp) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // sum the partial results on the host:
  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}
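/* Illustrative usage sketch: inner_prod_impl() keeps both reduction stages on
   the device (writing to a viennacl::scalar), while inner_prod_cpu() copies
   the 128 partial sums back and finishes on the host:

     viennacl::vector<float> x(1000), y(1000);
     float s = viennacl::linalg::inner_prod(x, y);  // routed to this backend for CUDA contexts
*/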
#define VIENNACL_MDOT_WORKGROUP_SIZE  128
#define VIENNACL_MDOT_WORKGROUP_NUM   128
template<typename NumericT>
__global__ void inner_prod_2_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond the size of x

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
  }
  tmp_buffer[threadIdx.x]              = group_sum0;
  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;

  // parallel reduction of the two sums simultaneously:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
    }
  }

  // write result of this block to group_results:
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
  }
}
template<typename NumericT>
__global__ void inner_prod_3_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;

  // parallel reduction of the three sums simultaneously:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
    }
  }

  // write result of this block to group_results:
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
  }
}
template<typename NumericT>
__global__ void inner_prod_4_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;

  // parallel reduction of the four sums simultaneously:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
    }
  }

  // write result of this block to group_results:
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
  }
}
template<typename NumericT>
__global__ void inner_prod_8_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  NumericT group_sum4 = 0;
  NumericT group_sum5 = 0;
  NumericT group_sum6 = 0;
  NumericT group_sum7 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
    group_sum4 += entry_x * y4[i * stride4 + start4];
    group_sum5 += entry_x * y5[i * stride5 + start5];
    group_sum6 += entry_x * y6[i * stride6 + start6];
    group_sum7 += entry_x * y7[i * stride7 + start7];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;

  // parallel reduction of the eight sums simultaneously:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
    }
  }

  // write result of this block to group_results:
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
  }
}
template<typename NumericT>
__global__ void vector_multi_sum_kernel(NumericT const * vec1,
                                        NumericT * result,
                                        unsigned int start_result,
                                        unsigned int inc_result)
{
  __shared__ NumericT tmp_buffer[128];

  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * blockDim.x];

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
}
template<typename NumericT>
void inner_prod_impl(vector_base<NumericT> const & x,
                     vector_tuple<NumericT> const & vec_tuple,
                     vector_base<NumericT> & result)
{
  typedef NumericT value_type;

  static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);

  vcl_size_t current_index = 0;
  while (vec_tuple.const_size() > current_index)
  {
    switch (vec_tuple.const_size() - current_index)
    {
      case 7:
      case 6:
      case 5:
      case 4:
      {
        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
        vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);

        inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
                              detail::cuda_arg<value_type>(x),  static_cast<unsigned int>(viennacl::traits::start(x)),  static_cast<unsigned int>(viennacl::traits::stride(x)), static_cast<unsigned int>(viennacl::traits::size(x)),
                              detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)),
                              detail::cuda_arg<value_type>(y1), static_cast<unsigned int>(viennacl::traits::start(y1)), static_cast<unsigned int>(viennacl::traits::stride(y1)),
                              detail::cuda_arg<value_type>(y2), static_cast<unsigned int>(viennacl::traits::start(y2)), static_cast<unsigned int>(viennacl::traits::stride(y2)),
                              detail::cuda_arg<value_type>(y3), static_cast<unsigned int>(viennacl::traits::start(y3)), static_cast<unsigned int>(viennacl::traits::stride(y3)),
                              detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_4_kernel");
        vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
                                                                    detail::cuda_arg<value_type>(result),
                                                                    static_cast<unsigned int>(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                                                    static_cast<unsigned int>(viennacl::traits::stride(result)) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
      }
        current_index += 4;
        break;
      case 3:
      {
        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);

        inner_prod_3_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
                              detail::cuda_arg<value_type>(x),  static_cast<unsigned int>(viennacl::traits::start(x)),  static_cast<unsigned int>(viennacl::traits::stride(x)), static_cast<unsigned int>(viennacl::traits::size(x)),
                              detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)),
                              detail::cuda_arg<value_type>(y1), static_cast<unsigned int>(viennacl::traits::start(y1)), static_cast<unsigned int>(viennacl::traits::stride(y1)),
                              detail::cuda_arg<value_type>(y2), static_cast<unsigned int>(viennacl::traits::start(y2)), static_cast<unsigned int>(viennacl::traits::stride(y2)),
                              detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_3_kernel");
        vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
                                                                    detail::cuda_arg<value_type>(result),
                                                                    static_cast<unsigned int>(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                                                    static_cast<unsigned int>(viennacl::traits::stride(result)) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
      }
        current_index += 3;
        break;
      case 2:
      {
        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);

        inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
                              detail::cuda_arg<value_type>(x),  static_cast<unsigned int>(viennacl::traits::start(x)),  static_cast<unsigned int>(viennacl::traits::stride(x)), static_cast<unsigned int>(viennacl::traits::size(x)),
                              detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)),
                              detail::cuda_arg<value_type>(y1), static_cast<unsigned int>(viennacl::traits::start(y1)), static_cast<unsigned int>(viennacl::traits::stride(y1)),
                              detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_2_kernel");
        vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
                                                                    detail::cuda_arg<value_type>(result),
                                                                    static_cast<unsigned int>(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                                                    static_cast<unsigned int>(viennacl::traits::stride(result)) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
      }
        current_index += 2;
        break;
      case 1:
      {
        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);

        inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(x),
                                        static_cast<unsigned int>(viennacl::traits::start(x)),  static_cast<unsigned int>(viennacl::traits::stride(x)),  static_cast<unsigned int>(viennacl::traits::size(x)),
                                        detail::cuda_arg<value_type>(y0),
                                        static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)), static_cast<unsigned int>(viennacl::traits::size(y0)),
                                        detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
        vector_multi_sum_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
                                            detail::cuda_arg<value_type>(result),
                                            static_cast<unsigned int>(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                            static_cast<unsigned int>(viennacl::traits::stride(result)) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
      }
        current_index += 1;
        break;
      default: // eight or more vectors remaining
      {
        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
        vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
        vector_base<NumericT> const & y4 = vec_tuple.const_at(current_index + 4);
        vector_base<NumericT> const & y5 = vec_tuple.const_at(current_index + 5);
        vector_base<NumericT> const & y6 = vec_tuple.const_at(current_index + 6);
        vector_base<NumericT> const & y7 = vec_tuple.const_at(current_index + 7);

        inner_prod_8_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
                              detail::cuda_arg<value_type>(x),  static_cast<unsigned int>(viennacl::traits::start(x)),  static_cast<unsigned int>(viennacl::traits::stride(x)), static_cast<unsigned int>(viennacl::traits::size(x)),
                              detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)),
                              detail::cuda_arg<value_type>(y1), static_cast<unsigned int>(viennacl::traits::start(y1)), static_cast<unsigned int>(viennacl::traits::stride(y1)),
                              detail::cuda_arg<value_type>(y2), static_cast<unsigned int>(viennacl::traits::start(y2)), static_cast<unsigned int>(viennacl::traits::stride(y2)),
                              detail::cuda_arg<value_type>(y3), static_cast<unsigned int>(viennacl::traits::start(y3)), static_cast<unsigned int>(viennacl::traits::stride(y3)),
                              detail::cuda_arg<value_type>(y4), static_cast<unsigned int>(viennacl::traits::start(y4)), static_cast<unsigned int>(viennacl::traits::stride(y4)),
                              detail::cuda_arg<value_type>(y5), static_cast<unsigned int>(viennacl::traits::start(y5)), static_cast<unsigned int>(viennacl::traits::stride(y5)),
                              detail::cuda_arg<value_type>(y6), static_cast<unsigned int>(viennacl::traits::start(y6)), static_cast<unsigned int>(viennacl::traits::stride(y6)),
                              detail::cuda_arg<value_type>(y7), static_cast<unsigned int>(viennacl::traits::start(y7)), static_cast<unsigned int>(viennacl::traits::stride(y7)),
                              detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_8_kernel");
        vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
                                                                    detail::cuda_arg<value_type>(result),
                                                                    static_cast<unsigned int>(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                                                    static_cast<unsigned int>(viennacl::traits::stride(result)) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
      }
        current_index += 8;
        break;
    }
  }
}
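/* Illustrative note: the vector_tuple overload evaluates several inner
   products <x, y_i> while streaming x only once per kernel launch. It peels
   off chunks of 8, 4, 3, 2 or 1 right-hand sides per iteration, so a tuple
   of 13 vectors leads to launches handling 8, then 4, then 1 of them. */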
#undef VIENNACL_MDOT_WORKGROUP_NUM
#undef VIENNACL_MDOT_WORKGROUP_SIZE
template<typename NumericT>
__global__ void norm_kernel_floats(const NumericT * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                   unsigned int norm_selector,
                                   NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += fabs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 2) //norm_2
  {
    NumericT vec_entry = 0;
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
    {
      vec_entry = vec[i*inc1 + start1];
      tmp += vec_entry * vec_entry;
    }
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction: sum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //parallel reduction: minimum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //parallel reduction: maximum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf: maximum of moduli
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
template<typename NumericT>
__global__ void norm_kernel_integers(const NumericT * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                     unsigned int norm_selector,
                                     NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += abs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction: sum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //parallel reduction: minimum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //parallel reduction: maximum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
template<typename NumericT>
__global__ void norm_kernel_unsigned_integers(const NumericT * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                              unsigned int norm_selector,
                                              NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += vec[i*inc1 + start1];
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction: sum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //parallel reduction: minimum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //parallel reduction: maximum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
namespace detail
{
  struct norm_kernel_launcher_integers
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      typedef NumericT value_type;
      norm_kernel_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                         static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                         static_cast<unsigned int>(option),
                                         detail::cuda_arg<value_type>(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_integers");
    }
  };

  struct norm_kernel_launcher_unsigned_integers
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      typedef NumericT value_type;
      norm_kernel_unsigned_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                                  static_cast<unsigned int>(option),
                                                  detail::cuda_arg<value_type>(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_unsigned_integers");
    }
  };

  struct norm_kernel_launcher_floats
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      typedef NumericT value_type;
      norm_kernel_floats<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       static_cast<unsigned int>(option),
                                       detail::cuda_arg<value_type>(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_floats");
    }
  };

  template<typename NumericT>
  struct norm_kernel_launcher : public norm_kernel_launcher_integers {};

  template<>
  struct norm_kernel_launcher<unsigned char>  : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned short> : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned int>   : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned long>  : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<float>  : public norm_kernel_launcher_floats {};

  template<>
  struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {};
} // namespace detail
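/* Summary of the norm_selector argument consumed by the norm kernels above
   (recoverable from their branches): 0: sup-norm (maximum of moduli),
   1: l^1-norm (sum of moduli), 2: l^2-norm (sum of squares, floats only),
   3: minimum, 4: maximum. The launcher dispatch mirrors
   vector_sum_kernel_launcher: signed integers are the default, with
   specializations for the unsigned types and for float/double. */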
template<typename NumericT>
void norm_1_impl(vector_base<NumericT> const & vec1,
                 scalar<NumericT> & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
}
template<typename NumericT>
void norm_1_cpu(vector_base<NumericT> const & vec1,
                NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}
template<typename NumericT>
void norm_2_impl(vector_base<NumericT> const & vec1,
                 scalar<NumericT> & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);

  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 2, result);
}
template<typename NumericT>
void norm_2_cpu(vector_base<NumericT> const & vec1,
                NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
  result = std::sqrt(result);
}
template<typename NumericT>
void norm_inf_impl(vector_base<NumericT> const & vec1,
                   scalar<NumericT> & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 0, result);
}
template<typename NumericT>
void norm_inf_cpu(vector_base<NumericT> const & vec1,
                  NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = std::max(result, *it);
}
template<typename NumericT>
__global__ void vector_maxmin_kernel(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                     unsigned int option, //0: maximum, 1: minimum
                                     NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_minmax = vec1[start1];
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0) //min
      thread_minmax = (vec1[i*inc1+start1] < thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
    else //max
      thread_minmax = (vec1[i*inc1+start1] > thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
  }

  tmp_buffer[threadIdx.x] = thread_minmax;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0) //min
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
      else //max
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
template<typename NumericT>
void max_impl(vector_base<NumericT> const & vec1,
              scalar<NumericT> & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);

  vector_maxmin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                     static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                     static_cast<unsigned int>(0),
                                     detail::cuda_arg<value_type>(result) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_maxmin_kernel");
}
template<typename NumericT>
void max_cpu(vector_base<NumericT> const & vec1,
             NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = temp_cpu[0];
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = std::max(result, *it);
}
template<typename NumericT>
void min_impl(vector_base<NumericT> const & vec1,
              scalar<NumericT> & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);

  vector_maxmin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                     static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                     static_cast<unsigned int>(1),
                                     detail::cuda_arg<value_type>(result) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_maxmin_kernel");
}
template<typename NumericT>
void min_cpu(vector_base<NumericT> const & vec1,
             NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = temp_cpu[0];
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = std::min(result, *it);
}
template<typename NumericT>
__device__ NumericT cuda_abs(NumericT val) { return (val < 0) ? -val : val; }
__device__ inline unsigned long  cuda_abs(unsigned long  val) { return val; }
__device__ inline unsigned int   cuda_abs(unsigned int   val) { return val; }
__device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
__device__ inline unsigned char  cuda_abs(unsigned char  val) { return val; }
template<typename NumericT>
__global__ void index_norm_inf_kernel(const NumericT * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      unsigned int * result)
{
  __shared__ NumericT float_buffer[128];
  __shared__ unsigned int index_buffer[128];

  float_buffer[threadIdx.x] = 0;
  index_buffer[threadIdx.x] = 0;

  //step 1: fill buffer with the per-thread maximum of moduli:
  NumericT cur_max = NumericT(0);
  NumericT tmp;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    tmp = vec[i*inc1+start1];
    tmp = cuda_abs(tmp);
    if (cur_max < tmp)
    {
      float_buffer[threadIdx.x] = tmp;
      index_buffer[threadIdx.x] = i;
      cur_max = tmp;
    }
  }

  //step 2: parallel reduction, carrying the index along:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
      {
        index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
        float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
      }
    }
  }

  if (threadIdx.x == 0)
    *result = index_buffer[0];
}
template<typename NumericT>
vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
{
  typedef NumericT value_type;

  viennacl::backend::mem_handle h;
  viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));

  index_norm_inf_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(vec1),
                                    static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                    reinterpret_cast<unsigned int *>(h.cuda_handle().get()) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel");

  unsigned int ret = 0;
  viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
  return static_cast<vcl_size_t>(ret);
}
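/* Illustrative usage sketch: index_norm_inf() returns the index of the entry
   with the largest modulus (the BLAS i*amax operation):

     vcl_size_t idx = viennacl::linalg::index_norm_inf(x);
*/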
template<typename NumericT>
__global__ void plane_rotation_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      NumericT * vec2, unsigned int start2, unsigned int inc2, unsigned int size2,
                                      NumericT alpha, NumericT beta)
{
  NumericT tmp1 = 0;
  NumericT tmp2 = 0;

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
  {
    tmp1 = vec1[i*inc1+start1];
    tmp2 = vec2[i*inc2+start2];

    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
  }
}
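/* The kernel above applies a plane (Givens) rotation to the vector pair:
   (x_i, y_i) <- (alpha * x_i + beta * y_i, alpha * y_i - beta * x_i),
   i.e. the BLAS *rot operation with c = alpha and s = beta. */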
template<typename NumericT>
void plane_rotation(vector_base<NumericT> & vec1,
                    vector_base<NumericT> & vec2,
                    NumericT alpha, NumericT beta)
{
  typedef NumericT value_type;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<value_type>::value)
    temporary_alpha = alpha;

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<value_type>::value)
    temporary_beta = beta;

  plane_rotation_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
                                      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                      detail::cuda_arg<value_type>(vec2),
                                      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                      detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                                      detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel");
}

} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif
iterator begin()
Returns an iterator pointing to the beginning of the vector (STL like)
Tuple class holding pointers to multiple vectors. Mainly used as a temporary object returned from vie...
void norm_inf_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the supremum-norm of a vector.
void inner_prod_cpu(vector_base< NumericT > const &vec1, vector_base< NumericT > const &vec2, NumericT &result)
Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1...
result_of::size_type< T >::type start(T const &obj)
__global__ void vec_element_atan_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_sum_kernel_floats(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
void avbv_v(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, vector_base< NumericT > const &vec3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
__global__ void vec_element_log_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_sum_kernel_integers(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void vec_element_cosh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Helper metafunction for checking whether the provided type is viennacl::op_div (for division) ...
__global__ void av_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2)
__global__ void vec_element_acos_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vec_element_ceil_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
void min_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the minimum of a vector, first reduction stage on the GPU, second stage on the CPU...
__global__ void norm_kernel_integers(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
void norm_1_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the l^1-norm of a vector.
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vcl_size_t index_norm_inf(vector_base< NumericT > const &vec1)
Computes the index of the first entry that is equal to the supremum-norm in modulus.
__global__ void element_op_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2, NumericT const *vec3, unsigned int start3, unsigned int inc3, unsigned int op_type)
__global__ void vec_element_exp_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void plane_rotation(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2, NumericT alpha, NumericT beta)
Computes a plane rotation of two vectors.
void min_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the minimum of a vector, both reduction stages run on the GPU.
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
void element_op(matrix_base< NumericT, SizeT > &A, matrix_expression< const matrix_base< NumericT, SizeT >, const matrix_base< NumericT, SizeT >, op_element_binary< OpT > > const &proxy)
void inner_prod_impl(vector_base< NumericT > const &vec1, vector_base< NumericT > const &vec2, ScalarT &result)
Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1...
__global__ void vector_swap_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT *vec2, unsigned int start2, unsigned int inc2)
void norm_inf_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the supremum-norm of a vector.
__global__ void avbv_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2, const NumericT *fac3, unsigned int options3, const NumericT *vec3, unsigned int start3, unsigned int inc3)
__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, NumericT *group_results)
__global__ void vec_element_sin_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
VectorType const & const_at(vcl_size_t i) const
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
__global__ void vec_element_cos_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_assign_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int internal_size1, NumericT alpha)
void memory_create(mem_handle &handle, vcl_size_t size_in_bytes, viennacl::context const &ctx, const void *host_ptr=NULL)
Creates an array of the specified size. If the second argument is provided, the buffer is initialized...
__global__ void vec_element_log10_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
T min(const T &lhs, const T &rhs)
Minimum.
size_type internal_size() const
Returns the internal length of the vector, which is given by size() plus the extra memory due to padd...
iterator end()
Returns an iterator pointing to the end of the vector (STL like)
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multipl...
__global__ void vec_element_floor_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
A tag class representing element-wise unary operations (like sin()) on vectors or matrices...
Implementation of the ViennaCL scalar class.
void norm_2_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the l^2-norm of a vector - implementation.
void avbv(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, vector_base< NumericT > const &vec3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, const NumericT *y3, unsigned int start3, unsigned int stride3, const NumericT *y4, unsigned int start4, unsigned int stride4, const NumericT *y5, unsigned int start5, unsigned int stride5, const NumericT *y6, unsigned int start6, unsigned int stride6, const NumericT *y7, unsigned int start7, unsigned int stride7, NumericT *group_results)
__global__ void vec_element_sinh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< NumericT > &s, OtherT)
__device__ NumericT cuda_abs(NumericT val)
Simple enable-if variant that uses the SFINAE pattern.
NumericT min(std::vector< NumericT > const &v1)
__global__ void element_op_int_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2, NumericT const *vec3, unsigned int start3, unsigned int inc3, unsigned int op_type)
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)
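The *_impl/*_cpu reductions listed above are backend implementations; user code typically reaches them through the generic free functions in viennacl::linalg, which dispatch on the active memory domain. A minimal sketch, assuming a CUDA-enabled build (variable names and values are illustrative):

#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/norm_2.hpp"

int main()
{
  viennacl::vector<float> u = viennacl::scalar_vector<float>(16, 3.0f);
  viennacl::vector<float> v = viennacl::scalar_vector<float>(16, 2.0f);

  // Dispatches to inner_prod_impl() / inner_prod_cpu() on the CUDA backend
  float dot = viennacl::linalg::inner_prod(u, v);  // 16 * 3.0 * 2.0 = 96.0

  // Dispatches to norm_2_impl() / norm_2_cpu()
  float nrm2 = viennacl::linalg::norm_2(u);        // sqrt(16 * 9.0) = 12.0

  return (dot == 96.0f && nrm2 == 12.0f) ? 0 : 1;  // both results are exact here
}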