1 #ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
// Fragment (extraction elided lines; embedded numbers are artifacts of the dump).
// CSR per-row info kernel plus its host wrapper. The grid-stride loop visits
// each sparse row; the per-row loops below appear to accumulate, depending on
// an info selector whose branch scaffolding is not visible here:
//   sup-norm (max/fabs), 1-norm (sum of fabs), squared 2-norm, or the
//   diagonal entry (column == row) -- TODO confirm against the full source.
46 template<
typename NumericT>
48 const unsigned int * row_indices,
49 const unsigned int * column_indices,
50 const NumericT * elements,
// Grid-stride loop over rows (loop condition elided in this fragment).
55 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
57 row += gridDim.x * blockDim.x)
60 unsigned int row_end = row_indices[
row+1];
// sup-norm accumulation over the row's nonzeros
65 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
66 value =
max(value, fabs(elements[i]));
// 1-norm accumulation
70 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
71 value += fabs(elements[i]);
// squared 2-norm accumulation (square root presumably taken later -- elided)
75 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
76 value += elements[i] * elements[i];
// diagonal search: pick the element whose column index equals the row
81 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
83 if (column_indices[i] ==
row)
// Host wrapper: launches the kernel with a fixed <<<128, 128>>> configuration.
// NOTE(review): template parameter 'AligmentV' looks like a typo for
// 'AlignmentV'; no launch-error check is visible in this fragment (may be elided).
99 template<
typename NumericT,
unsigned int AligmentV>
104 csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle1().cuda_handle()),
105 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
106 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
107 detail::cuda_arg<NumericT>(vec),
108 static_cast<unsigned int>(mat.
size1()),
109 static_cast<unsigned int>(info_selector)
// Fragment: CSR sparse-matrix * vector kernel plus its host wrapper.
// One thread handles one sparse row (grid-stride); each thread computes the
// dot product of its row with x and writes one strided result entry.
117 template<
typename NumericT>
119 const unsigned int * row_indices,
120 const unsigned int * column_indices,
121 const NumericT * elements,
123 unsigned int start_x,
126 unsigned int start_result,
127 unsigned int inc_result,
128 unsigned int size_result)
// Grid-stride loop over rows (condition elided in this fragment).
130 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
132 row += gridDim.x * blockDim.x)
135 unsigned int row_end = row_indices[
row+1];
// Row dot product: gather x entries through the column-index array.
136 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
137 dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
138 result[
row * inc_result + start_result] =
dot_prod;
// Host wrapper: CSR SpMV launch with start/stride of both vectors forwarded
// so vector slices/ranges are handled in the kernel.
154 template<
class NumericT,
unsigned int AlignmentV>
159 compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle1().cuda_handle()),
160 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
161 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
162 detail::cuda_arg<NumericT>(vec),
163 static_cast<unsigned int>(vec.
start()),
164 static_cast<unsigned int>(vec.
stride()),
165 detail::cuda_arg<NumericT>(result),
166 static_cast<unsigned int>(result.
start()),
167 static_cast<unsigned int>(result.
stride()),
168 static_cast<unsigned int>(result.
size())
177 template<
typename LayoutT>
// Maps a logical sub-matrix entry (i, j) to the linear offset inside the
// padded storage of a row-major dense matrix: rows are contiguous runs of
// internal_cols elements. internal_rows is unused here but kept so both
// layout mappers share one signature.
static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                     unsigned int row_start, unsigned int row_inc,
                                     unsigned int col_start, unsigned int col_inc,
                                     unsigned int internal_rows, unsigned int internal_cols)
{
  unsigned int effective_row = row_start + i * row_inc;
  unsigned int effective_col = col_start + j * col_inc;
  return effective_row * internal_cols + effective_col;
}
// Maps a logical sub-matrix entry (i, j) to the linear offset inside the
// padded storage of a column-major dense matrix: columns are contiguous runs
// of internal_rows elements. internal_cols is unused here but kept so both
// layout mappers share one signature.
static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                     unsigned int row_start, unsigned int row_inc,
                                     unsigned int col_start, unsigned int col_inc,
                                     unsigned int internal_rows, unsigned int internal_cols)
{
  unsigned int effective_row = row_start + i * row_inc;
  unsigned int effective_col = col_start + j * col_inc;
  return effective_col * internal_rows + effective_row;
}
// Fragment: CSR sparse-matrix * dense-matrix product kernel plus its host
// wrapper. One block per sparse row (grid-stride over blocks); threads of a
// block stride over result columns. DMatIndexT/ResultIndexT abstract
// row-major vs. column-major addressing of the dense operands.
204 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
206 const unsigned int * sp_mat_row_indices,
207 const unsigned int * sp_mat_col_indices,
208 const NumericT * sp_mat_elements,
209 const NumericT * d_mat,
210 unsigned int d_mat_row_start,
211 unsigned int d_mat_col_start,
212 unsigned int d_mat_row_inc,
213 unsigned int d_mat_col_inc,
214 unsigned int d_mat_row_size,
215 unsigned int d_mat_col_size,
216 unsigned int d_mat_internal_rows,
217 unsigned int d_mat_internal_cols,
219 unsigned int result_row_start,
220 unsigned int result_col_start,
221 unsigned int result_row_inc,
222 unsigned int result_col_inc,
223 unsigned int result_row_size,
224 unsigned int result_col_size,
225 unsigned int result_internal_rows,
226 unsigned int result_internal_cols)
228 for (
unsigned int row = blockIdx.x;
row < result_row_size;
row += gridDim.x)
230 unsigned int row_start = sp_mat_row_indices[
row];
231 unsigned int row_end = sp_mat_row_indices[
row+1];
233 for (
unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
// Accumulate row(sparse) . column(dense); accumulator declaration elided.
237 for (
unsigned int k = row_start; k < row_end; k++)
239 unsigned int j = sp_mat_col_indices[k];
240 NumericT x = sp_mat_elements[k];
// Dense factor accessed at (j, col): plain (non-transposed) product.
241 NumericT y = d_mat[ DMatIndexT::apply(j, col,
242 d_mat_row_start, d_mat_row_inc,
243 d_mat_col_start, d_mat_col_inc,
244 d_mat_internal_rows, d_mat_internal_cols) ];
249 result[ResultIndexT::apply(
row, col,
250 result_row_start, result_row_inc,
251 result_col_start, result_col_inc,
252 result_internal_rows, result_internal_cols)] = r;
// Host wrapper: four launches visible, presumably dispatching on the
// row-major/column-major combination of d_mat and result -- the surrounding
// if/else and launch lines are elided in this fragment.
266 template<
typename NumericT,
unsigned int AlignmentV>
274 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
275 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
276 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
278 detail::cuda_arg<NumericT>(d_mat),
284 detail::cuda_arg<NumericT>(result),
295 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
296 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
297 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
299 detail::cuda_arg<NumericT>(d_mat),
305 detail::cuda_arg<NumericT>(result),
316 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
317 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
318 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
320 detail::cuda_arg<NumericT>(d_mat),
326 detail::cuda_arg<NumericT>(result),
337 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
338 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
339 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
341 detail::cuda_arg<NumericT>(d_mat),
347 detail::cuda_arg<NumericT>(result),
// Fragment: CSR sparse-matrix * TRANSPOSED dense-matrix product kernel plus
// its host wrapper. Identical structure to the non-transposed version above,
// except the dense factor is indexed as (col, j) instead of (j, col),
// i.e. trans(d_mat) is multiplied without materializing the transpose.
358 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
360 const unsigned int * sp_mat_row_indices,
361 const unsigned int * sp_mat_col_indices,
362 const NumericT * sp_mat_elements,
363 const NumericT * d_mat,
364 unsigned int d_mat_row_start,
365 unsigned int d_mat_col_start,
366 unsigned int d_mat_row_inc,
367 unsigned int d_mat_col_inc,
368 unsigned int d_mat_row_size,
369 unsigned int d_mat_col_size,
370 unsigned int d_mat_internal_rows,
371 unsigned int d_mat_internal_cols,
373 unsigned int result_row_start,
374 unsigned int result_col_start,
375 unsigned int result_row_inc,
376 unsigned int result_col_inc,
377 unsigned int result_row_size,
378 unsigned int result_col_size,
379 unsigned int result_internal_rows,
380 unsigned int result_internal_cols)
382 for (
unsigned int row = blockIdx.x;
row < result_row_size;
row += gridDim.x)
384 unsigned int row_start = sp_mat_row_indices[
row];
385 unsigned int row_end = sp_mat_row_indices[
row+1];
387 for (
unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
391 for (
unsigned int k = row_start; k < row_end; k++)
393 unsigned int j = sp_mat_col_indices[k];
394 NumericT x = sp_mat_elements[k];
// (col, j): the swap relative to the non-transposed kernel realizes trans(d_mat)
395 NumericT y = d_mat[ DMatIndexT::apply(col, j,
396 d_mat_row_start, d_mat_row_inc,
397 d_mat_col_start, d_mat_col_inc,
398 d_mat_internal_rows, d_mat_internal_cols) ];
403 result [ ResultIndexT::apply(
row, col,
404 result_row_start, result_row_inc,
405 result_col_start, result_col_inc,
406 result_internal_rows, result_internal_cols) ] = r;
// Host wrapper: dispatch over row-major/column-major combinations of the
// (wrapped, transposed) dense operand and the result; launch lines elided.
421 template<
typename NumericT,
unsigned int AlignmentV>
429 if (d_mat.lhs().row_major() && result.
row_major())
432 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
433 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
434 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
436 detail::cuda_arg<NumericT>(d_mat.lhs()),
442 detail::cuda_arg<NumericT>(result),
450 else if (d_mat.lhs().row_major() && !result.
row_major())
453 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
454 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
455 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
457 detail::cuda_arg<NumericT>(d_mat.lhs()),
463 detail::cuda_arg<NumericT>(result),
471 else if (!d_mat.lhs().row_major() && result.
row_major())
474 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
475 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
476 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
478 detail::cuda_arg<NumericT>(d_mat.lhs()),
484 detail::cuda_arg<NumericT>(result),
495 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
496 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
497 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
499 detail::cuda_arg<NumericT>(d_mat.lhs()),
505 detail::cuda_arg<NumericT>(result),
// Fragment: CSR diagonal-extraction kernel. One thread per row (grid-stride);
// scans the row's nonzeros for the entry with column index == row.
// 'diag' is initialized to zero, so rows without a stored diagonal presumably
// yield 0 -- the write-back and loop condition are elided in this fragment.
520 template<
typename NumericT>
522 const unsigned int * row_indices,
523 const unsigned int * column_indices,
524 const NumericT * elements,
528 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
530 row += gridDim.x * blockDim.x)
532 NumericT
diag = NumericT(0);
533 unsigned int row_end = row_indices[
row+1];
534 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
536 unsigned int col_index = column_indices[i];
537 if (col_index ==
row)
// Fragment: four host wrappers for in-place triangular solves on a CSR
// matrix, judging by the kernel names: unit-lower, lower, unit-upper and
// upper substitution. Each launches a single block (<<<1, 128>>>), which is
// consistent with the sequential row dependencies of substitution.
// No launch-error checks are visible in this fragment (may be elided).
553 template<
typename SparseMatrixT,
typename NumericT>
559 csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
560 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
561 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
562 detail::cuda_arg<NumericT>(vec),
563 static_cast<unsigned int>(mat.size1())
// Forward substitution with a non-unit diagonal.
574 template<
typename SparseMatrixT,
typename NumericT>
580 csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
581 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
582 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
583 detail::cuda_arg<NumericT>(vec),
584 static_cast<unsigned int>(mat.size1())
// Backward substitution, unit diagonal.
596 template<
typename SparseMatrixT,
typename NumericT>
602 csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
603 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
604 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
605 detail::cuda_arg<NumericT>(vec),
606 static_cast<unsigned int>(mat.size1())
// Backward substitution with a non-unit diagonal.
617 template<
typename SparseMatrixT,
typename NumericT>
623 csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
624 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
625 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
626 detail::cuda_arg<NumericT>(vec),
627 static_cast<unsigned int>(mat.size1())
// Fragment: host wrappers for in-place triangular solves with a TRANSPOSED
// CSR matrix ('mat.lhs()' suggests a transpose expression wrapper). The
// non-unit-diagonal variants first extract the diagonal into 'diagonal' via
// compressed_matrix_diagonal_kernel, then run the actual substitution.
641 template<
typename SparseMatrixT,
typename NumericT>
647 csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
648 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
649 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
650 detail::cuda_arg<NumericT>(vec),
651 static_cast<unsigned int>(mat.
lhs().size1())
// Transposed forward solve, non-unit diagonal: diagonal extraction first.
662 template<
typename SparseMatrixT,
typename NumericT>
670 compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
671 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
672 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
673 detail::cuda_arg<NumericT>(diagonal),
// NOTE(review): uses mat.size1() here but mat.lhs().size1() below -- likely
// equivalent for a square transpose expression, worth confirming.
674 static_cast<unsigned int>(mat.
size1())
677 csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
678 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
679 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
680 detail::cuda_arg<NumericT>(diagonal),
681 detail::cuda_arg<NumericT>(vec),
682 static_cast<unsigned int>(mat.
lhs().size1())
// Transposed backward solve, unit diagonal.
693 template<
typename SparseMatrixT,
typename NumericT>
699 csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
700 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
701 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
702 detail::cuda_arg<NumericT>(vec),
703 static_cast<unsigned int>(mat.
lhs().size1())
// Transposed backward solve, non-unit diagonal: diagonal extraction first.
714 template<
typename SparseMatrixT,
typename NumericT>
722 compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
723 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
724 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
725 detail::cuda_arg<NumericT>(diagonal),
726 static_cast<unsigned int>(mat.
size1())
729 csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
730 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
731 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
732 detail::cuda_arg<NumericT>(diagonal),
733 detail::cuda_arg<NumericT>(vec),
734 static_cast<unsigned int>(mat.
lhs().size1())
// Fragment: host wrappers for block-wise triangular solves (block-ILU style):
// one CUDA block per independent row block, boundaries given by block_indices.
744 template<
typename NumericT,
unsigned int AlignmentV>
753 csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
754 detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
755 detail::cuda_arg<NumericT>(L.lhs().handle().cuda_handle()),
756 detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
757 detail::cuda_arg<NumericT>(vec),
758 static_cast<unsigned int>(L.lhs().size1())
// Backward block solve against U, using its separately stored diagonal.
763 template<
typename NumericT,
unsigned int AlignmentV>
772 csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
773 detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
774 detail::cuda_arg<NumericT>(U.lhs().handle().cuda_handle()),
775 detail::cuda_arg<NumericT>(U_diagonal.
handle().cuda_handle()),
776 detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
777 detail::cuda_arg<NumericT>(vec),
778 static_cast<unsigned int>(U.lhs().size1())
// Fragment: SpMV for the "compressed compressed" format (CSR storing only
// the nonzero rows): row_jumper indexes into the element arrays per stored
// row, row_indices maps stored rows back to their global row numbers.
// First loop zeroes the full result vector, second loop scatters row dot
// products to their true positions. Followed by the host wrapper.
790 template<
typename NumericT>
792 const unsigned int * row_jumper,
793 const unsigned int * row_indices,
794 const unsigned int * column_indices,
795 const NumericT * elements,
796 unsigned int nonzero_rows,
798 unsigned int start_x,
801 unsigned int start_result,
802 unsigned int inc_result,
803 unsigned int size_result)
// Zero the result (loop condition elided; presumably i < size_result).
805 for (
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
807 i += gridDim.x * blockDim.x)
809 result[i * inc_result + start_result] = 0;
// Compute dot products for the stored (nonzero) rows only.
812 for (
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
814 i += gridDim.x * blockDim.x)
817 unsigned int row_end = row_jumper[i+1];
818 for (
unsigned int j = row_jumper[i]; j < row_end; ++j)
819 dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
// Scatter through row_indices: stored row i lives at global row row_indices[i].
820 result[row_indices[i] * inc_result + start_result] =
dot_prod;
// Host wrapper. Note the handle order: handle3() carries the row-index map.
833 template<
typename NumericT>
838 compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle1().cuda_handle()),
839 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
840 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
841 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
842 static_cast<unsigned int>(mat.
nnz1()),
843 detail::cuda_arg<NumericT>(vec),
844 static_cast<unsigned int>(vec.
start()),
845 static_cast<unsigned int>(vec.
stride()),
846 detail::cuda_arg<NumericT>(result),
847 static_cast<unsigned int>(result.
start()),
848 static_cast<unsigned int>(result.
stride()),
849 static_cast<unsigned int>(result.
size())
// Fragment: COO (coordinate format) row-info kernel using a segmented
// reduction in shared memory. Nonzeros are processed in work groups bounded
// by group_boundaries; within each chunk of blockDim.x entries a shared-memory
// scan combines values belonging to the same row, and thread 0 carries the
// running value across chunk boundaries. 'option' selects the combine op
// (max/abs for sup-norm, abs-sum, squared sum with sqrt at write-out,
// option 3 keeps only diagonal entries, tmp.x == tmp.y).
862 template<
typename NumericT>
864 const NumericT * elements,
865 const unsigned int * group_boundaries,
// Shared buffers: 128 matches the launch below (<<<64, 128>>>); barriers
// between writes and reads are elided in this fragment.
869 __shared__
unsigned int shared_rows[128];
870 __shared__ NumericT inter_results[128];
874 unsigned int last_index = blockDim.x - 1;
875 unsigned int group_start = group_boundaries[blockIdx.x];
876 unsigned int group_end = group_boundaries[blockIdx.x + 1];
// Ceil-division: number of blockDim.x-sized chunks covering this group.
877 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
879 unsigned int local_index = 0;
881 for (
unsigned int k = 0; k < k_end; ++k)
883 local_index = group_start + k * blockDim.x + threadIdx.x;
// Each COO entry is a packed (row, col) pair read as uint2.
885 tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
886 val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
// Carry logic: thread 0 merges with (or flushes) the last partial result of
// the previous chunk, depending on whether the row continues.
889 if (threadIdx.x == 0 && k > 0)
891 if (tmp.x == shared_rows[last_index])
897 val =
max(val, fabs(inter_results[last_index]));
901 val = fabs(val) + inter_results[last_index];
905 val = sqrt(val * val + inter_results[last_index]);
919 result[shared_rows[last_index]] = inter_results[last_index];
923 result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
// Stage this chunk into shared memory (transform depends on option).
932 shared_rows[threadIdx.x] = tmp.x;
937 inter_results[threadIdx.x] = val;
940 inter_results[threadIdx.x] = fabs(val);
943 inter_results[threadIdx.x] = val * val;
// Inclusive segmented scan over the chunk (stride loop elided).
952 left = (threadIdx.x >=
stride && tmp.x == shared_rows[threadIdx.x -
stride]) ? inter_results[threadIdx.x -
stride] : 0;
958 inter_results[threadIdx.x] =
max(inter_results[threadIdx.x], left);
962 inter_results[threadIdx.x] += left;
966 inter_results[threadIdx.x] += left;
// Write out rows that are complete within this chunk.
976 if (threadIdx.x != last_index &&
977 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
978 inter_results[threadIdx.x] != 0)
980 result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
// Final flush after the last chunk.
986 if (threadIdx.x == last_index && inter_results[last_index] != 0)
987 result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index];
// Host wrapper: handle12() holds packed (row, col) pairs, handle3() the
// group boundaries.
990 template<
typename NumericT,
unsigned int AlignmentV>
995 coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle12().cuda_handle()),
996 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
997 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
998 detail::cuda_arg<NumericT>(vec),
999 static_cast<unsigned int>(info_selector)
// Fragment: COO sparse-matrix * vector kernel, same segmented shared-memory
// reduction scheme as the row-info kernel above, but each entry contributes
// elements[i] * x[col] and results go to a strided output vector.
// Host wrapper follows.
1007 template<
typename NumericT>
1009 const NumericT * elements,
1010 const unsigned int * group_boundaries,
1012 unsigned int start_x,
1015 unsigned int start_result,
1016 unsigned int inc_result
1019 __shared__
unsigned int shared_rows[128];
1020 __shared__ NumericT inter_results[128];
1024 unsigned int group_start = group_boundaries[blockIdx.x];
1025 unsigned int group_end = group_boundaries[blockIdx.x + 1];
1026 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
1028 unsigned int local_index = 0;
1030 for (
unsigned int k = 0; k < k_end; ++k)
1032 local_index = group_start + k * blockDim.x + threadIdx.x;
1034 tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
// Per-entry contribution: A(row, col) * x(col), with vector stride applied.
1035 val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
// Carry the last partial sum of the previous chunk, or flush it.
1038 if (threadIdx.x == 0 && k > 0)
1040 if (tmp.x == shared_rows[blockDim.x-1])
1041 val += inter_results[blockDim.x-1];
1043 result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
1048 shared_rows[threadIdx.x] = tmp.x;
1049 inter_results[threadIdx.x] = val;
// Segmented inclusive scan (stride loop elided).
1055 left = (threadIdx.x >=
stride && tmp.x == shared_rows[threadIdx.x -
stride]) ? inter_results[threadIdx.x -
stride] : 0;
1057 inter_results[threadIdx.x] += left;
// Write out rows completed inside this chunk.
1062 if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1063 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1065 result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
// Final flush for the very last entry of the group.
1071 if (local_index + 1 == group_end)
1072 result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
// Host wrapper: COO SpMV launch.
1084 template<
typename NumericT,
unsigned int AlignmentV>
1091 coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle12().cuda_handle()),
1092 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
1093 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
1094 detail::cuda_arg<NumericT>(vec),
1095 static_cast<unsigned int>(vec.
start()),
1096 static_cast<unsigned int>(vec.
stride()),
1097 detail::cuda_arg<NumericT>(result),
1098 static_cast<unsigned int>(result.
start()),
1099 static_cast<unsigned int>(result.
stride())
// Fragment: COO sparse-matrix * dense-matrix kernel plus host wrapper.
// Reuses the segmented-scan SpMV scheme, wrapped in an outer loop over the
// result columns (one full pass over the COO data per column).
1107 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
1109 const NumericT * elements,
1110 const unsigned int * group_boundaries,
1111 const NumericT * d_mat,
1112 unsigned int d_mat_row_start,
1113 unsigned int d_mat_col_start,
1114 unsigned int d_mat_row_inc,
1115 unsigned int d_mat_col_inc,
1116 unsigned int d_mat_row_size,
1117 unsigned int d_mat_col_size,
1118 unsigned int d_mat_internal_rows,
1119 unsigned int d_mat_internal_cols,
1121 unsigned int result_row_start,
1122 unsigned int result_col_start,
1123 unsigned int result_row_inc,
1124 unsigned int result_col_inc,
1125 unsigned int result_row_size,
1126 unsigned int result_col_size,
1127 unsigned int result_internal_rows,
1128 unsigned int result_internal_cols)
1130 __shared__
unsigned int shared_rows[128];
1131 __shared__ NumericT inter_results[128];
1135 unsigned int group_start = group_boundaries[blockIdx.x];
1136 unsigned int group_end = group_boundaries[blockIdx.x + 1];
1137 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
1139 unsigned int local_index = 0;
// One segmented-scan pass per result column.
1141 for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1143 for (
unsigned int k = 0; k < k_end; ++k)
1145 local_index = group_start + k * blockDim.x + threadIdx.x;
1147 tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
// Dense factor read at (col-of-entry, result_col): plain product.
1148 val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
1149 d_mat_row_start, d_mat_row_inc,
1150 d_mat_col_start, d_mat_col_inc,
1151 d_mat_internal_rows, d_mat_internal_cols) ] : 0;
1154 if (threadIdx.x == 0 && k > 0)
1156 if (tmp.x == shared_rows[blockDim.x-1])
1157 val += inter_results[blockDim.x-1];
1159 result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
1160 result_row_start, result_row_inc,
1161 result_col_start, result_col_inc,
1162 result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
1167 shared_rows[threadIdx.x] = tmp.x;
1168 inter_results[threadIdx.x] = val;
1174 left = (threadIdx.x >=
stride && tmp.x == shared_rows[threadIdx.x -
stride]) ? inter_results[threadIdx.x -
stride] : 0;
1176 inter_results[threadIdx.x] += left;
1181 if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1182 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1184 result[ResultIndexT::apply(tmp.x, result_col,
1185 result_row_start, result_row_inc,
1186 result_col_start, result_col_inc,
1187 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
1193 if (local_index + 1 == group_end)
1194 result[ResultIndexT::apply(tmp.x, result_col,
1195 result_row_start, result_row_inc,
1196 result_col_start, result_col_inc,
1197 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
// Host wrapper: four layout-dispatched launches (dispatch conditions and
// launch lines elided in this fragment).
1210 template<
typename NumericT,
unsigned int AlignmentV>
1218 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1219 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1220 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1222 detail::cuda_arg<NumericT>(d_mat),
1228 detail::cuda_arg<NumericT>(result),
1239 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1240 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1241 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1243 detail::cuda_arg<NumericT>(d_mat),
1249 detail::cuda_arg<NumericT>(result),
1260 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1261 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1262 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1264 detail::cuda_arg<NumericT>(d_mat),
1270 detail::cuda_arg<NumericT>(result),
1281 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1282 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1283 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1285 detail::cuda_arg<NumericT>(d_mat),
1291 detail::cuda_arg<NumericT>(result),
// Fragment: COO sparse-matrix * TRANSPOSED dense-matrix kernel plus host
// wrapper. Identical to the non-transposed COO version except the dense
// factor is indexed as (result_col, tmp.y) -- the index swap realizes
// trans(d_mat) without materializing it.
1302 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
1304 const NumericT * elements,
1305 const unsigned int * group_boundaries,
1306 const NumericT * d_mat,
1307 unsigned int d_mat_row_start,
1308 unsigned int d_mat_col_start,
1309 unsigned int d_mat_row_inc,
1310 unsigned int d_mat_col_inc,
1311 unsigned int d_mat_row_size,
1312 unsigned int d_mat_col_size,
1313 unsigned int d_mat_internal_rows,
1314 unsigned int d_mat_internal_cols,
1316 unsigned int result_row_start,
1317 unsigned int result_col_start,
1318 unsigned int result_row_inc,
1319 unsigned int result_col_inc,
1320 unsigned int result_row_size,
1321 unsigned int result_col_size,
1322 unsigned int result_internal_rows,
1323 unsigned int result_internal_cols)
1325 __shared__
unsigned int shared_rows[128];
1326 __shared__ NumericT inter_results[128];
1330 unsigned int group_start = group_boundaries[blockIdx.x];
1331 unsigned int group_end = group_boundaries[blockIdx.x + 1];
1332 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
1334 unsigned int local_index = 0;
1336 for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1338 for (
unsigned int k = 0; k < k_end; ++k)
1340 local_index = group_start + k * blockDim.x + threadIdx.x;
1342 tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
// (result_col, tmp.y): transposed access of the dense factor.
1343 val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
1344 d_mat_row_start, d_mat_row_inc,
1345 d_mat_col_start, d_mat_col_inc,
1346 d_mat_internal_rows, d_mat_internal_cols)] : 0;
1349 if (threadIdx.x == 0 && k > 0)
1351 if (tmp.x == shared_rows[blockDim.x-1])
1352 val += inter_results[blockDim.x-1];
1354 result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
1355 result_row_start, result_row_inc,
1356 result_col_start, result_col_inc,
1357 result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
1362 shared_rows[threadIdx.x] = tmp.x;
1363 inter_results[threadIdx.x] = val;
1369 left = (threadIdx.x >=
stride && tmp.x == shared_rows[threadIdx.x -
stride]) ? inter_results[threadIdx.x -
stride] : 0;
1371 inter_results[threadIdx.x] += left;
1376 if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1377 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1379 result[ ResultIndexT::apply(tmp.x, result_col,
1380 result_row_start, result_row_inc,
1381 result_col_start, result_col_inc,
1382 result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
1388 if (local_index + 1 == group_end)
1389 result[ ResultIndexT::apply(tmp.x, result_col,
1390 result_row_start, result_row_inc,
1391 result_col_start, result_col_inc,
1392 result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
// Host wrapper: dispatch on row-major/column-major of the wrapped
// (transposed) dense operand and the result.
1404 template<
typename NumericT,
unsigned int AlignmentV>
1411 if (d_mat.lhs().row_major() && result.
row_major())
1414 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1415 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1416 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1418 detail::cuda_arg<NumericT>(d_mat.lhs()),
1424 detail::cuda_arg<NumericT>(result),
1432 else if (d_mat.lhs().row_major() && !result.
row_major())
1435 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1436 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1437 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1439 detail::cuda_arg<NumericT>(d_mat.lhs()),
1445 detail::cuda_arg<NumericT>(result),
1453 else if (!d_mat.lhs().row_major() && result.
row_major())
1456 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1457 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1458 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1460 detail::cuda_arg<NumericT>(d_mat.lhs()),
1466 detail::cuda_arg<NumericT>(result),
1477 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1478 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1479 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1481 detail::cuda_arg<NumericT>(d_mat.lhs()),
1487 detail::cuda_arg<NumericT>(result),
// Fragment: ELL sparse-matrix * vector kernel plus its host wrapper.
// ELL stores a fixed number of entries per row, column-major
// (offset advances by internal_row_num per item), so consecutive threads
// read consecutive addresses. Zero padding entries are skipped via the
// val != 0 test.
1502 template<
typename NumericT>
1504 const NumericT * elements,
1506 unsigned int start_x,
1509 unsigned int start_result,
1510 unsigned int inc_result,
1511 unsigned int row_num,
1512 unsigned int col_num,
1513 unsigned int internal_row_num,
1514 unsigned int items_per_row,
1515 unsigned int aligned_items_per_row
1518 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1519 unsigned int glb_sz = gridDim.x * blockDim.x;
// Grid-stride loop: one thread per matrix row.
1521 for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1525 unsigned int offset = row_id;
1526 for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1528 NumericT val = elements[offset];
// Padding slots hold zeros; skip them (also skips genuine zeros, harmless).
1530 if (val != NumericT(0))
1532 int col = coords[offset];
1533 sum += x[col * inc_x + start_x] * val;
1537 result[row_id * inc_result + start_result] =
sum;
// Host wrapper: ELL SpMV launch.
1550 template<
typename NumericT,
unsigned int AlignmentV>
1555 ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
1556 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
1557 detail::cuda_arg<NumericT>(vec),
1558 static_cast<unsigned int>(vec.
start()),
1559 static_cast<unsigned int>(vec.
stride()),
1560 detail::cuda_arg<NumericT>(result),
1561 static_cast<unsigned int>(result.
start()),
1562 static_cast<unsigned int>(result.
stride()),
1563 static_cast<unsigned int>(mat.
size1()),
1564 static_cast<unsigned int>(mat.
size2()),
1566 static_cast<unsigned int>(mat.
maxnnz()),
// Fragment: ELL sparse-matrix * dense-matrix kernel plus its host wrapper.
// Each thread owns one (row, col) cell of the result, linearized as
// rc = col * sp_mat_row_num + row, and walks the fixed-width ELL row.
1572 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
1574 const NumericT * sp_mat_elements,
1575 unsigned int sp_mat_row_num,
1576 unsigned int sp_mat_col_num,
1577 unsigned int sp_mat_internal_row_num,
1578 unsigned int sp_mat_items_per_row,
1579 unsigned int sp_mat_aligned_items_per_row,
1580 const NumericT * d_mat,
1581 unsigned int d_mat_row_start,
1582 unsigned int d_mat_col_start,
1583 unsigned int d_mat_row_inc,
1584 unsigned int d_mat_col_inc,
1585 unsigned int d_mat_row_size,
1586 unsigned int d_mat_col_size,
1587 unsigned int d_mat_internal_rows,
1588 unsigned int d_mat_internal_cols,
1590 unsigned int result_row_start,
1591 unsigned int result_col_start,
1592 unsigned int result_row_inc,
1593 unsigned int result_col_inc,
1594 unsigned int result_row_size,
1595 unsigned int result_col_size,
1596 unsigned int result_internal_rows,
1597 unsigned int result_internal_cols)
1599 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1600 unsigned int glb_sz = gridDim.x * blockDim.x;
// Grid-stride loop over all result cells.
1602 for (
unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
1604 unsigned int row = rc % sp_mat_row_num;
1605 unsigned int col = rc / sp_mat_row_num;
// Column-major ELL walk: offset advances by internal_row_num per item.
1607 unsigned int offset =
row;
1608 NumericT r = (NumericT)0;
1610 for (
unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1612 unsigned int j = sp_mat_coords[offset];
1613 NumericT x =
static_cast<NumericT
>(sp_mat_elements[offset]);
// Skip zero padding entries.
1615 if (x != (NumericT)0)
1617 NumericT y = d_mat[ DMatIndexT::apply(j, col,
1618 d_mat_row_start, d_mat_row_inc,
1619 d_mat_col_start, d_mat_col_inc,
1620 d_mat_internal_rows, d_mat_internal_cols) ];
1625 result [ ResultIndexT::apply(row, col,
1626 result_row_start, result_row_inc,
1627 result_col_start, result_col_inc,
1628 result_internal_rows, result_internal_cols) ] = r;
// Host wrapper: four layout-dispatched launches (dispatch conditions and
// launch lines elided in this fragment).
1642 template<
typename NumericT,
unsigned int AlignmentV>
1650 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1651 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1652 static_cast<unsigned int>(sp_mat.
size1()),
1653 static_cast<unsigned int>(sp_mat.
size2()),
1655 static_cast<unsigned int>(sp_mat.
maxnnz()),
1657 detail::cuda_arg<NumericT>(d_mat),
1663 detail::cuda_arg<NumericT>(result),
1674 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1675 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1676 static_cast<unsigned int>(sp_mat.
size1()),
1677 static_cast<unsigned int>(sp_mat.
size2()),
1679 static_cast<unsigned int>(sp_mat.
maxnnz()),
1681 detail::cuda_arg<NumericT>(d_mat),
1687 detail::cuda_arg<NumericT>(result),
1698 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1699 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1700 static_cast<unsigned int>(sp_mat.
size1()),
1701 static_cast<unsigned int>(sp_mat.
size2()),
1703 static_cast<unsigned int>(sp_mat.
maxnnz()),
1705 detail::cuda_arg<NumericT>(d_mat),
1711 detail::cuda_arg<NumericT>(result),
1722 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1723 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1724 static_cast<unsigned int>(sp_mat.
size1()),
1725 static_cast<unsigned int>(sp_mat.
size2()),
1727 static_cast<unsigned int>(sp_mat.
maxnnz()),
1729 detail::cuda_arg<NumericT>(d_mat),
1735 detail::cuda_arg<NumericT>(result),
1745 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT >
1747 const NumericT * sp_mat_elements,
1748 unsigned int sp_mat_row_num,
1749 unsigned int sp_mat_col_num,
1750 unsigned int sp_mat_internal_row_num,
1751 unsigned int sp_mat_items_per_row,
1752 unsigned int sp_mat_aligned_items_per_row,
1753 const NumericT * d_mat,
1754 unsigned int d_mat_row_start,
1755 unsigned int d_mat_col_start,
1756 unsigned int d_mat_row_inc,
1757 unsigned int d_mat_col_inc,
1758 unsigned int d_mat_row_size,
1759 unsigned int d_mat_col_size,
1760 unsigned int d_mat_internal_rows,
1761 unsigned int d_mat_internal_cols,
1763 unsigned int result_row_start,
1764 unsigned int result_col_start,
1765 unsigned int result_row_inc,
1766 unsigned int result_col_inc,
1767 unsigned int result_row_size,
1768 unsigned int result_col_size,
1769 unsigned int result_internal_rows,
1770 unsigned int result_internal_cols)
1772 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1773 unsigned int glb_sz = gridDim.x * blockDim.x;
1775 for (
unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
1777 unsigned int row = rc % sp_mat_row_num;
1778 unsigned int col = rc / sp_mat_row_num;
1780 unsigned int offset =
row;
1781 NumericT r = (NumericT)0;
1783 for (
unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1785 unsigned int j = sp_mat_coords[offset];
1786 NumericT x =
static_cast<NumericT
>(sp_mat_elements[offset]);
1788 if (x != (NumericT)0)
1790 NumericT y = d_mat[ DMatIndexT::apply(col, j,
1791 d_mat_row_start, d_mat_row_inc,
1792 d_mat_col_start, d_mat_col_inc,
1793 d_mat_internal_rows, d_mat_internal_cols) ];
1798 result [ ResultIndexT::apply(row, col,
1799 result_row_start, result_row_inc,
1800 result_col_start, result_col_inc,
1801 result_internal_rows, result_internal_cols) ] = r;
1815 template<
typename NumericT,
unsigned int AlignmentV>
1822 if (d_mat.lhs().row_major() && result.
row_major())
1825 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1826 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1827 static_cast<unsigned int>(sp_mat.
size1()),
1828 static_cast<unsigned int>(sp_mat.
size2()),
1830 static_cast<unsigned int>(sp_mat.
maxnnz()),
1833 detail::cuda_arg<NumericT>(d_mat.lhs()),
1839 detail::cuda_arg<NumericT>(result),
1847 else if (d_mat.lhs().row_major() && !result.
row_major())
1850 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1851 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1852 static_cast<unsigned int>(sp_mat.
size1()),
1853 static_cast<unsigned int>(sp_mat.
size2()),
1855 static_cast<unsigned int>(sp_mat.
maxnnz()),
1858 detail::cuda_arg<NumericT>(d_mat.lhs()),
1864 detail::cuda_arg<NumericT>(result),
1872 else if (!d_mat.lhs().row_major() && result.
row_major())
1875 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1876 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1877 static_cast<unsigned int>(sp_mat.
size1()),
1878 static_cast<unsigned int>(sp_mat.
size2()),
1880 static_cast<unsigned int>(sp_mat.
maxnnz()),
1883 detail::cuda_arg<NumericT>(d_mat.lhs()),
1889 detail::cuda_arg<NumericT>(result),
1900 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1901 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1902 static_cast<unsigned int>(sp_mat.
size1()),
1903 static_cast<unsigned int>(sp_mat.
size2()),
1905 static_cast<unsigned int>(sp_mat.
maxnnz()),
1908 detail::cuda_arg<NumericT>(d_mat.lhs()),
1914 detail::cuda_arg<NumericT>(result),
1928 template<
typename NumericT>
1930 const unsigned int * column_indices,
1931 const unsigned int * block_start,
1932 const NumericT * elements,
1934 unsigned int start_x,
1936 unsigned int size_x,
1938 unsigned int start_result,
1939 unsigned int inc_result,
1940 unsigned int size_result)
1942 unsigned int local_id = threadIdx.x;
1943 unsigned int local_size = blockDim.x;
1944 unsigned int num_rows = size_result;
1946 for (
unsigned int block_idx = blockIdx.x; block_idx <= num_rows / local_size; block_idx += gridDim.x)
1948 unsigned int row = block_idx * local_size + local_id;
1949 unsigned int offset = block_start[block_idx];
1950 unsigned int num_columns = columns_per_block[block_idx];
1953 for (
unsigned int item_id = 0; item_id < num_columns; item_id++)
1955 unsigned int index = offset + item_id * local_size + local_id;
1956 NumericT val = elements[index];
1958 sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
1962 result[row * inc_result + start_result] =
sum;
1974 template<
typename NumericT,
typename IndexT>
1980 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
1981 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
1982 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
1983 detail::cuda_arg<NumericT>(vec),
1984 static_cast<unsigned int>(vec.
start()),
1985 static_cast<unsigned int>(vec.
stride()),
1986 static_cast<unsigned int>(vec.
size()),
1987 detail::cuda_arg<NumericT>(result),
1988 static_cast<unsigned int>(result.
start()),
1989 static_cast<unsigned int>(result.
stride()),
1990 static_cast<unsigned int>(result.
size())
2001 template<
typename NumericT>
2003 const NumericT * ell_elements,
2004 const unsigned int * csr_rows,
2005 const unsigned int * csr_cols,
2006 const NumericT * csr_elements,
2008 unsigned int start_x,
2011 unsigned int start_result,
2012 unsigned int inc_result,
2013 unsigned int row_num,
2014 unsigned int internal_row_num,
2015 unsigned int items_per_row,
2016 unsigned int aligned_items_per_row
2019 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2020 unsigned int glb_sz = gridDim.x * blockDim.x;
2022 for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2026 unsigned int offset = row_id;
2027 for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2029 NumericT val = ell_elements[offset];
2032 if (val != NumericT(0))
2034 int col = ell_coords[offset];
2035 sum += (x[col * inc_x + start_x] * val);
2039 unsigned int col_begin = csr_rows[row_id];
2040 unsigned int col_end = csr_rows[row_id + 1];
2042 for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
2043 sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];
2045 result[row_id * inc_result + start_result] =
sum;
2059 template<
typename NumericT,
unsigned int AlignmentV>
2064 hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2065 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2066 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2067 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2068 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2069 detail::cuda_arg<NumericT>(vec),
2070 static_cast<unsigned int>(vec.
start()),
2071 static_cast<unsigned int>(vec.
stride()),
2072 detail::cuda_arg<NumericT>(result),
2073 static_cast<unsigned int>(result.
start()),
2074 static_cast<unsigned int>(result.
stride()),
2075 static_cast<unsigned int>(mat.
size1()),
2077 static_cast<unsigned int>(mat.
ell_nnz()),
2085 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
2087 const NumericT * ell_elements,
2088 const unsigned int * csr_rows,
2089 const unsigned int * csr_cols,
2090 const NumericT * csr_elements,
2091 unsigned int row_num,
2092 unsigned int internal_row_num,
2093 unsigned int items_per_row,
2094 unsigned int aligned_items_per_row,
2095 const NumericT * d_mat,
2096 unsigned int d_mat_row_start,
2097 unsigned int d_mat_col_start,
2098 unsigned int d_mat_row_inc,
2099 unsigned int d_mat_col_inc,
2100 unsigned int d_mat_row_size,
2101 unsigned int d_mat_col_size,
2102 unsigned int d_mat_internal_rows,
2103 unsigned int d_mat_internal_cols,
2105 unsigned int result_row_start,
2106 unsigned int result_col_start,
2107 unsigned int result_row_inc,
2108 unsigned int result_col_inc,
2109 unsigned int result_row_size,
2110 unsigned int result_col_size,
2111 unsigned int result_internal_rows,
2112 unsigned int result_internal_cols)
2114 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2115 unsigned int glb_sz = gridDim.x * blockDim.x;
2117 for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2119 for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2123 unsigned int offset = row_id;
2124 for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2126 NumericT val = ell_elements[offset];
2130 sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
2131 d_mat_row_start, d_mat_row_inc,
2132 d_mat_col_start, d_mat_col_inc,
2133 d_mat_internal_rows, d_mat_internal_cols)] * val;
2137 unsigned int col_begin = csr_rows[row_id];
2138 unsigned int col_end = csr_rows[row_id + 1];
2140 for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
2142 sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
2143 d_mat_row_start, d_mat_row_inc,
2144 d_mat_col_start, d_mat_col_inc,
2145 d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2148 result[ResultIndexT::apply(row_id, result_col,
2149 result_row_start, result_row_inc,
2150 result_col_start, result_col_inc,
2151 result_internal_rows, result_internal_cols)] =
sum;
2166 template<
typename NumericT,
unsigned int AlignmentV>
2174 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2175 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2176 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2177 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2178 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2179 static_cast<unsigned int>(mat.
size1()),
2181 static_cast<unsigned int>(mat.
ell_nnz()),
2184 detail::cuda_arg<NumericT>(d_mat),
2190 detail::cuda_arg<NumericT>(result),
2201 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2202 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2203 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2204 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2205 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2206 static_cast<unsigned int>(mat.
size1()),
2208 static_cast<unsigned int>(mat.
ell_nnz()),
2211 detail::cuda_arg<NumericT>(d_mat),
2217 detail::cuda_arg<NumericT>(result),
2228 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2229 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2230 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2231 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2232 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2233 static_cast<unsigned int>(mat.
size1()),
2235 static_cast<unsigned int>(mat.
ell_nnz()),
2238 detail::cuda_arg<NumericT>(d_mat),
2244 detail::cuda_arg<NumericT>(result),
2255 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2256 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2257 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2258 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2259 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2260 static_cast<unsigned int>(mat.
size1()),
2262 static_cast<unsigned int>(mat.
ell_nnz()),
2265 detail::cuda_arg<NumericT>(d_mat),
2271 detail::cuda_arg<NumericT>(result),
2283 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
2285 const NumericT * ell_elements,
2286 const unsigned int * csr_rows,
2287 const unsigned int * csr_cols,
2288 const NumericT * csr_elements,
2289 unsigned int row_num,
2290 unsigned int internal_row_num,
2291 unsigned int items_per_row,
2292 unsigned int aligned_items_per_row,
2293 const NumericT * d_mat,
2294 unsigned int d_mat_row_start,
2295 unsigned int d_mat_col_start,
2296 unsigned int d_mat_row_inc,
2297 unsigned int d_mat_col_inc,
2298 unsigned int d_mat_row_size,
2299 unsigned int d_mat_col_size,
2300 unsigned int d_mat_internal_rows,
2301 unsigned int d_mat_internal_cols,
2303 unsigned int result_row_start,
2304 unsigned int result_col_start,
2305 unsigned int result_row_inc,
2306 unsigned int result_col_inc,
2307 unsigned int result_row_size,
2308 unsigned int result_col_size,
2309 unsigned int result_internal_rows,
2310 unsigned int result_internal_cols)
2312 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2313 unsigned int glb_sz = gridDim.x * blockDim.x;
2315 for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2317 for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2321 unsigned int offset = row_id;
2322 for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2324 NumericT val = ell_elements[offset];
2328 sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
2329 d_mat_row_start, d_mat_row_inc,
2330 d_mat_col_start, d_mat_col_inc,
2331 d_mat_internal_rows, d_mat_internal_cols)] * val;
2335 unsigned int col_begin = csr_rows[row_id];
2336 unsigned int col_end = csr_rows[row_id + 1];
2338 for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
2340 sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
2341 d_mat_row_start, d_mat_row_inc,
2342 d_mat_col_start, d_mat_col_inc,
2343 d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2346 result[ResultIndexT::apply(row_id, result_col,
2347 result_row_start, result_row_inc,
2348 result_col_start, result_col_inc,
2349 result_internal_rows, result_internal_cols)] =
sum;
2364 template<
typename NumericT,
unsigned int AlignmentV>
2371 if (d_mat.lhs().row_major() && result.
row_major())
2374 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2375 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2376 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2377 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2378 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2379 static_cast<unsigned int>(mat.
size1()),
2381 static_cast<unsigned int>(mat.
ell_nnz()),
2384 detail::cuda_arg<NumericT>(d_mat.lhs()),
2390 detail::cuda_arg<NumericT>(result),
2398 else if (d_mat.lhs().row_major() && !result.
row_major())
2401 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2402 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2403 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2404 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2405 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2406 static_cast<unsigned int>(mat.
size1()),
2408 static_cast<unsigned int>(mat.
ell_nnz()),
2411 detail::cuda_arg<NumericT>(d_mat.lhs()),
2417 detail::cuda_arg<NumericT>(result),
2425 else if (!d_mat.lhs().row_major() && result.
row_major())
2428 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2429 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2430 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2431 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2432 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2433 static_cast<unsigned int>(mat.
size1()),
2435 static_cast<unsigned int>(mat.
ell_nnz()),
2438 detail::cuda_arg<NumericT>(d_mat.lhs()),
2444 detail::cuda_arg<NumericT>(result),
2455 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2456 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2457 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2458 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2459 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2460 static_cast<unsigned int>(mat.
size1()),
2462 static_cast<unsigned int>(mat.
ell_nnz()),
2465 detail::cuda_arg<NumericT>(d_mat.lhs()),
2471 detail::cuda_arg<NumericT>(result),
vcl_size_t internal_ellnnz() const
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Simple enable-if variant that uses the SFINAE pattern.
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
const handle_type & handle3() const
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
const handle_type & handle() const
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
A tag class representing a lower triangular matrix.
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t internal_size1() const
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Expression template class for representing a tree of expressions which ultimately result in a matrix...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
const handle_type & handle4() const
vcl_size_t rows_per_block() const
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
statement sum(scalar< NumericT > const *s, vector_base< NumericT > const *x)
vcl_size_t size1() const
Returns the size of the result vector.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Helper struct for accessing an element of a row- or column-major matrix.
vcl_size_t internal_size1() const
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const
Returns the stride within the buffer (in multiples of sizeof(NumericT))
const handle_type & handle2() const
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
Sparse matrix class using the ELLPACK format for storing the nonzeros.
A tag class representing an upper triangular matrix.
void inplace_solve(const matrix_base< NumericT > &A, bool trans_A, matrix_base< NumericT > &B, bool trans_B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
Sparse matrix class using the sliced ELLPACK format with parameters C, σ.
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const
Returns the OpenCL handle to the row index array.
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows c...
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
vcl_size_t maxnnz() const
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
Implementations of direct triangular solvers for sparse matrices using CUDA.
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result)
void clear()
Resets all entries to zero. Does not change the size of the vector.
Common routines for CUDA execution.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
NumericT max(std::vector< NumericT > const &v1)
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const
Returns the length of the vector (cf. std::vector)
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
vcl_size_t ell_nnz() const
A tag class representing a lower triangular matrix with unit diagonal.
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
A tag class representing transposed matrices.
A sparse square matrix in compressed sparse rows format.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
A tag for column-major storage of a dense matrix.
const handle_type & handle5() const
LHS & lhs() const
Get left hand side operand.
size_type start() const
Returns the offset within the buffer.
vcl_size_t internal_maxnnz() const
Implementation of the ViennaCL scalar class.
const handle_type & handle() const
Returns the memory handle.
A tag class representing an upper triangular matrix with unit diagonal.
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)