#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
    @brief Implementations of operations on sparse matrices using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
template<typename NumericT>
__global__ void csr_row_info_extractor_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size,
          unsigned int option)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT value = 0;
    unsigned int row_end = row_indices[row+1];

    switch (option)
    {
    case 0: //inf-norm
      for (unsigned int i = row_indices[row]; i < row_end; ++i)
        value = max(value, fabs(elements[i]));
      break;

    case 1: //1-norm
      for (unsigned int i = row_indices[row]; i < row_end; ++i)
        value += fabs(elements[i]);
      break;

    case 2: //2-norm
      for (unsigned int i = row_indices[row]; i < row_end; ++i)
        value += elements[i] * elements[i];
      value = sqrt(value);
      break;

    case 3: //diagonal entry
      for (unsigned int i = row_indices[row]; i < row_end; ++i)
      {
        if (column_indices[i] == row)
        {
          value = elements[i];
          break;
        }
      }
      break;

    default:
      break;
    }
    result[row] = value;
  }
}
template<typename NumericT, unsigned int AlignmentV>
void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
              vector_base<NumericT> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                              detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                              detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                              detail::cuda_arg<NumericT>(vec),
                                              static_cast<unsigned int>(mat.size1()),
                                              static_cast<unsigned int>(info_selector)
                                             );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
}
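// Usage sketch for the wrapper above (illustrative only; assumes a valid CUDA
// context and the usual ViennaCL host API, and assumes the row_info_types
// enumerators such as SPARSE_ROW_NORM_1 which correspond to option codes 0-3
// of the kernel):
//
//   viennacl::compressed_matrix<float> A;        // filled via viennacl::copy()
//   viennacl::vector<float> norms(A.size1());
//   viennacl::linalg::cuda::row_info(A, norms, viennacl::linalg::detail::SPARSE_ROW_NORM_1);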
template<typename NumericT>
__global__ void compressed_matrix_vec_mul_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size_result;
                    row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
    result[row * inc_result + start_result] = dot_prod;
  }
}
template<typename NumericT>
__global__ void compressed_matrix_vec_mul_adaptive_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const unsigned int * row_blocks,
          const NumericT * elements,
          unsigned int num_blocks,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start = row_blocks[block_id];
    unsigned int row_stop  = row_blocks[block_id + 1];
    unsigned int element_start = row_indices[row_start];
    unsigned int element_stop = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream: several rows per block
    {
      // load the entries of all rows in this block to the shared buffer:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];

      __syncthreads();

      // sum the entries of each row, one thread per row:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        result[row * inc_result + start_result] = dot_prod;
      }
    }
    else  // CSR vector: a single row processed by the full block
    {
      // load and sum to shared buffer:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];

      // reduction to obtain the final result:
      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x + stride];
      }

      if (threadIdx.x == 0)
        result[row_start * inc_result + start_result] = shared_elements[0];
    }

    __syncthreads();  // avoid races with the next block of rows
  }
}
/** @brief Carries out matrix-vector multiplication with a compressed_matrix,
*          i.e. the implementation behind result = viennacl::linalg::prod(mat, vec);
*/
template<class NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  compressed_matrix_vec_mul_adaptive_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                          detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                          static_cast<unsigned int>(mat.blocks1()),
                                                          detail::cuda_arg<NumericT>(vec),
                                                          static_cast<unsigned int>(vec.start()),
                                                          static_cast<unsigned int>(vec.stride()),
                                                          detail::cuda_arg<NumericT>(result),
                                                          static_cast<unsigned int>(result.start()),
                                                          static_cast<unsigned int>(result.stride()),
                                                          static_cast<unsigned int>(result.size())
                                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
}
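// Usage sketch (illustrative; user code normally goes through
// viennacl::linalg::prod() rather than calling prod_impl() directly):
//
//   viennacl::compressed_matrix<float> A;                 // m x n, filled via viennacl::copy()
//   viennacl::vector<float> x(A.size2()), y(A.size1());
//   y = viennacl::linalg::prod(A, x);                     // dispatches to the kernel above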
/** @brief Helper struct for accessing an element of a row- or column-major matrix
*          stored in a padded linear buffer.
*/
template<typename LayoutT>
struct mat_mult_matrix_index
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
  }
};

/** @brief Specialization for column-major matrices */
template<>
struct mat_mult_matrix_index<viennacl::column_major>
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
  }
};
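// Worked example for the row-major indexer above (values chosen for illustration):
// with row_start = col_start = 0, row_inc = col_inc = 1 and internal_cols = 8,
// element (i, j) = (2, 3) maps to linear offset (0 + 2*1) * 8 + 0 + 3*1 = 19.
// The internal_* arguments are the padded buffer dimensions, which may exceed
// the logical matrix size.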
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }

      result[ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = r;
    }
  }
}
/** @brief Carries out sparse-matrix times dense-matrix multiplication, i.e. result = prod(sp_mat, d_mat) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
}
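// Usage sketch (illustrative; the public entry point is viennacl::linalg::prod()):
//
//   viennacl::compressed_matrix<float> A;        // sparse, m x k
//   viennacl::matrix<float> B(k, n), C(m, n);    // dense
//   C = viennacl::linalg::prod(A, B);            // selects one of the four layout branches above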
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        // transposed access: entry (j, col) of B^T is entry (col, j) of B
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }

      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }
}
/** @brief Carries out sparse-matrix times transposed dense-matrix multiplication, i.e. result = prod(sp_mat, trans(d_mat)) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & sp_mat,
               viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                            const viennacl::matrix_base<NumericT>,
                                            viennacl::op_trans > const & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
}
template<typename NumericT>
__global__ void compressed_matrix_diagonal_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT diag = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}
/** @brief Inplace solution of a unit lower triangular system. Overwrites vec by the solution. */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                         detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                         detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                         detail::cuda_arg<NumericT>(vec),
                                         static_cast<unsigned int>(mat.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}
/** @brief Inplace solution of a lower triangular system. Overwrites vec by the solution. */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                    detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                    detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                    detail::cuda_arg<NumericT>(vec),
                                    static_cast<unsigned int>(mat.size1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}
/** @brief Inplace solution of a unit upper triangular system. Overwrites vec by the solution. */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}
/** @brief Inplace solution of an upper triangular system. Overwrites vec by the solution. */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                     detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                     detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                     detail::cuda_arg<NumericT>(vec),
                                     static_cast<unsigned int>(mat.size1())
                                    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}
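// Usage sketch for the four solvers above (illustrative; the public entry point
// is viennacl::linalg::inplace_solve()):
//
//   viennacl::compressed_matrix<float> A;     // triangular, n x n
//   viennacl::vector<float> rhs(A.size1());   // right-hand side, overwritten by the solution
//   viennacl::linalg::inplace_solve(A, rhs, viennacl::linalg::lower_tag());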
/** @brief Inplace solution of a unit lower triangular system with a transposed matrix. Overwrites vec by the solution. */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                               detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                               detail::cuda_arg<NumericT>(vec),
                                               static_cast<unsigned int>(mat.lhs().size1())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}
/** @brief Inplace solution of a lower triangular system with a transposed matrix. Overwrites vec by the solution. */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(diagonal),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(mat.lhs().size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}
/** @brief Inplace solution of a unit upper triangular system with a transposed matrix. Overwrites vec by the solution. */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}
/** @brief Inplace solution of an upper triangular system with a transposed matrix. Overwrites vec by the solution. */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                           detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                           detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                           detail::cuda_arg<NumericT>(diagonal),
                                           detail::cuda_arg<NumericT>(vec),
                                           static_cast<unsigned int>(mat.lhs().size1())
                                          );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}
template<typename NumericT, unsigned int AlignmentV>
void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                 const compressed_matrix<NumericT, AlignmentV>,
                                                 op_trans> & L,
                         viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                         vector_base<NumericT> const & /* L_diagonal */,  //ignored
                         vector_base<NumericT> & vec,
                         viennacl::linalg::unit_lower_tag)
{
  csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
                                                       detail::cuda_arg<NumericT>(L.lhs().handle().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                       detail::cuda_arg<NumericT>(vec),
                                                       static_cast<unsigned int>(L.lhs().size1())
                                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_block_trans_unit_lu_forward");
}
template<typename NumericT, unsigned int AlignmentV>
void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                 const compressed_matrix<NumericT, AlignmentV>,
                                                 op_trans> & U,
                         viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                         vector_base<NumericT> const & U_diagonal,
                         vector_base<NumericT> & vec,
                         viennacl::linalg::upper_tag)
{
  csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
                                                   detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
                                                   detail::cuda_arg<NumericT>(U.lhs().handle().cuda_handle()),
                                                   detail::cuda_arg<NumericT>(U_diagonal.handle().cuda_handle()),
                                                   detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                   detail::cuda_arg<NumericT>(vec),
                                                   static_cast<unsigned int>(U.lhs().size1())
                                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_block_trans_lu_backward");
}
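// Note: the two block solvers above run one CUDA block per diagonal block (the
// block boundaries are passed via block_indices) and are used for the
// substitution steps of ViennaCL's block-ILU preconditioners; they are not
// intended to be called directly by user code.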
template<typename NumericT>
__global__ void compressed_compressed_matrix_vec_mul_kernel(
          const unsigned int * row_jumper,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          unsigned int nonzero_rows,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  // set the full result vector to zero first, since only nonzero rows are written below:
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < size_result;
                    i += gridDim.x * blockDim.x)
    result[i * inc_result + start_result] = 0;

  // compute the entries for the nonzero rows:
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
    result[row_indices[i] * inc_result + start_result] = dot_prod;
  }
}
/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix */
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                            detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            detail::cuda_arg<NumericT>(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            detail::cuda_arg<NumericT>(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size())
                                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}
template<typename NumericT>
__global__ void coo_row_info_extractor(const unsigned int * coords, //(row_index, column_index)
                                       const NumericT * elements,
                                       const unsigned int * group_boundaries,
                                       NumericT * result,
                                       unsigned int option)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int last_index  = blockDim.x - 1;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

    // check for carry from the previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[last_index])
      {
        switch (option)
        {
        case 0: //inf-norm
        case 3: //diagonal entry
          val = max(val, fabs(inter_results[last_index])); break;
        case 1: //1-norm
          val = fabs(val) + inter_results[last_index]; break;
        case 2: //2-norm
          val = sqrt(val * val + inter_results[last_index]); break;
        default: break;
        }
      }
      else
      {
        switch (option)
        {
        case 0: //inf-norm
        case 1: //1-norm
        case 3: //diagonal entry
          result[shared_rows[last_index]] = inter_results[last_index]; break;
        case 2: //2-norm
          result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
        default: break;
        }
      }
    }

    // segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    switch (option)
    {
    case 0:
    case 3: inter_results[threadIdx.x] = val; break;
    case 1: inter_results[threadIdx.x] = fabs(val); break;
    case 2: inter_results[threadIdx.x] = val * val;
    default: break;
    }
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      switch (option)
      {
      case 0: //inf-norm
      case 3: //diagonal entry
        inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left); break;
      case 1: //1-norm
      case 2: //2-norm
        inter_results[threadIdx.x] += left; break;
      default: break;
      }
      __syncthreads();
    }
    // segmented parallel reduction end

    if (threadIdx.x != last_index &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
        inter_results[threadIdx.x] != 0)
      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];

    __syncthreads();
  } //for k

  if (threadIdx.x == last_index && inter_results[last_index] != 0)
    result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index];
}
template<typename NumericT, unsigned int AlignmentV>
void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
              vector_base<NumericT> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                      detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                      detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                      detail::cuda_arg<NumericT>(vec),
                                      static_cast<unsigned int>(info_selector)
                                     );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
}
template<typename NumericT>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                 const NumericT * elements,
                                                 const unsigned int * group_boundaries,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    // check for carry from the previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
    }

    // segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    // segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end)
    result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
}
/** @brief Carries out matrix-vector multiplication with a coordinate_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  result.clear();

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                detail::cuda_arg<NumericT>(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}
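// Usage sketch (illustrative): coordinate_matrix stores sorted (row, column)
// pairs split into roughly equally sized work groups (handle3() holds the group
// boundaries), so the product is computed by the segmented reduction above:
//
//   viennacl::coordinate_matrix<float> A;   // filled via viennacl::copy()
//   viennacl::vector<float> x(A.size2()), y(A.size1());
//   y = viennacl::linalg::prod(A, x);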
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                   const NumericT * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      // check for carry from the previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      // segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      // segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  }
}
/** @brief Carries out coordinate_matrix times dense-matrix multiplication, i.e. result = prod(sp_mat, d_mat) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,
                                                      unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,
                                                      unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,
                                                      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows,
                                                      unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,
                                                      unsigned int result_col_start,
                                                      unsigned int result_row_inc,
                                                      unsigned int result_col_inc,
                                                      unsigned int result_row_size,
                                                      unsigned int result_col_size,
                                                      unsigned int result_internal_rows,
                                                      unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      // transposed access: entry (tmp.y, result_col) of B^T is entry (result_col, tmp.y) of B
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols)] : 0;

      // check for carry from the previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
      }

      // segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      // segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
        result[ ResultIndexT::apply(tmp.x, result_col,
                                    result_row_start, result_row_inc,
                                    result_col_start, result_col_inc,
                                    result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ ResultIndexT::apply(tmp.x, result_col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
  }
}
/** @brief Carries out coordinate_matrix times transposed dense-matrix multiplication, i.e. result = prod(sp_mat, trans(d_mat)) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::coordinate_matrix<NumericT, AlignmentV> const & sp_mat,
               viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                            const viennacl::matrix_base<NumericT>,
                                            viennacl::op_trans > const & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
}
template<typename NumericT>
__global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
                                          const NumericT * elements,
                                          const NumericT * x,
                                          unsigned int start_x,
                                          unsigned int inc_x,
                                          NumericT * result,
                                          unsigned int start_result,
                                          unsigned int inc_result,
                                          unsigned int row_num,
                                          unsigned int col_num,
                                          unsigned int internal_row_num,
                                          unsigned int items_per_row,
                                          unsigned int aligned_items_per_row)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row_id;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = elements[offset];

      if (val != NumericT(0))
      {
        int col = coords[offset];
        sum += x[col * inc_x + start_x] * val;
      }
    }

    result[row_id * inc_result + start_result] = sum;
  }
}
/** @brief Carries out matrix-vector multiplication with an ell_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(vec.start()),
                                          static_cast<unsigned int>(vec.stride()),
                                          detail::cuda_arg<NumericT>(result),
                                          static_cast<unsigned int>(result.start()),
                                          static_cast<unsigned int>(result.stride()),
                                          static_cast<unsigned int>(mat.size1()),
                                          static_cast<unsigned int>(mat.size2()),
                                          static_cast<unsigned int>(mat.internal_size1()),
                                          static_cast<unsigned int>(mat.maxnnz()),
                                          static_cast<unsigned int>(mat.internal_maxnnz())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
}
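// Usage sketch (illustrative): ell_matrix pads every row to mat.maxnnz() entries
// and stores them column-wise, so the loads in the kernel above are coalesced:
//
//   viennacl::ell_matrix<float> A;          // filled via viennacl::copy()
//   viennacl::vector<float> x(A.size2()), y(A.size1());
//   y = viennacl::linalg::prod(A, x);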
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
                                            const NumericT * sp_mat_elements,
                                            unsigned int sp_mat_row_num,
                                            unsigned int sp_mat_col_num,
                                            unsigned int sp_mat_internal_row_num,
                                            unsigned int sp_mat_items_per_row,
                                            unsigned int sp_mat_aligned_items_per_row,
                                            const NumericT * d_mat,
                                            unsigned int d_mat_row_start,
                                            unsigned int d_mat_col_start,
                                            unsigned int d_mat_row_inc,
                                            unsigned int d_mat_col_inc,
                                            unsigned int d_mat_row_size,
                                            unsigned int d_mat_col_size,
                                            unsigned int d_mat_internal_rows,
                                            unsigned int d_mat_internal_cols,
                                            NumericT * result,
                                            unsigned int result_row_start,
                                            unsigned int result_col_start,
                                            unsigned int result_row_inc,
                                            unsigned int result_col_inc,
                                            unsigned int result_row_size,
                                            unsigned int result_col_size,
                                            unsigned int result_internal_rows,
                                            unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
  {
    unsigned int row = rc % sp_mat_row_num;
    unsigned int col = rc / sp_mat_row_num;

    unsigned int offset = row;
    NumericT r = (NumericT)0;

    for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
    {
      unsigned int j = sp_mat_coords[offset];
      NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);

      if (x != (NumericT)0)
      {
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
    }
    result [ ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols) ] = r;
  }
}
/** @brief Carries out ell_matrix times dense-matrix multiplication, i.e. result = prod(sp_mat, d_mat) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
  }
  else
  {
    ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
  }
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
                                               const NumericT * sp_mat_elements,
                                               unsigned int sp_mat_row_num,
                                               unsigned int sp_mat_col_num,
                                               unsigned int sp_mat_internal_row_num,
                                               unsigned int sp_mat_items_per_row,
                                               unsigned int sp_mat_aligned_items_per_row,
                                               const NumericT * d_mat,
                                               unsigned int d_mat_row_start,
                                               unsigned int d_mat_col_start,
                                               unsigned int d_mat_row_inc,
                                               unsigned int d_mat_col_inc,
                                               unsigned int d_mat_row_size,
                                               unsigned int d_mat_col_size,
                                               unsigned int d_mat_internal_rows,
                                               unsigned int d_mat_internal_cols,
                                               NumericT * result,
                                               unsigned int result_row_start,
                                               unsigned int result_col_start,
                                               unsigned int result_row_inc,
                                               unsigned int result_col_inc,
                                               unsigned int result_row_size,
                                               unsigned int result_col_size,
                                               unsigned int result_internal_rows,
                                               unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
  {
    unsigned int row = rc % sp_mat_row_num;
    unsigned int col = rc / sp_mat_row_num;

    unsigned int offset = row;
    NumericT r = (NumericT)0;

    for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
    {
      unsigned int j = sp_mat_coords[offset];
      NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);

      if (x != (NumericT)0)
      {
        // transposed access: entry (j, col) of B^T is entry (col, j) of B
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
    }
    result [ ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols) ] = r;
  }
}
/** @brief Carries out ell_matrix times transposed dense-matrix multiplication, i.e. result = prod(sp_mat, trans(d_mat)) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::ell_matrix<NumericT, AlignmentV> const & sp_mat,
               viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                            const viennacl::matrix_base<NumericT>,
                                            viennacl::op_trans > const & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
  }
}
template<typename NumericT>
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int * columns_per_block,
                                                 const unsigned int * column_indices,
                                                 const unsigned int * block_start,
                                                 const NumericT * elements,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 unsigned int size_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result,
                                                 unsigned int size_result)
{
  unsigned int local_id = threadIdx.x;
  unsigned int local_size = blockDim.x;
  unsigned int num_rows = size_result;

  for (unsigned int block_idx = blockIdx.x; block_idx <= num_rows / local_size; block_idx += gridDim.x)
  {
    unsigned int row         = block_idx * local_size + local_id;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    NumericT sum = 0;
    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
    {
      unsigned int index = offset + item_id * local_size + local_id;
      NumericT val = elements[index];

      sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
    }

    if (row < num_rows)
      result[row * inc_result + start_result] = sum;
  }
}
/** @brief Carries out matrix-vector multiplication with a sliced_ell_matrix */
template<typename NumericT, typename IndexT>
void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  // Note: the block size must match the number of rows per block used when the
  // matrix was assembled (mat.rows_per_block()); 128 is the reconstructed value here.
  sliced_ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                 detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                 detail::cuda_arg<NumericT>(vec),
                                                 static_cast<unsigned int>(vec.start()),
                                                 static_cast<unsigned int>(vec.stride()),
                                                 static_cast<unsigned int>(vec.size()),
                                                 detail::cuda_arg<NumericT>(result),
                                                 static_cast<unsigned int>(result.start()),
                                                 static_cast<unsigned int>(result.stride()),
                                                 static_cast<unsigned int>(result.size())
                                                );
  VIENNACL_CUDA_LAST_ERROR_CHECK("sliced_ell_matrix_vec_mul_kernel");
}
template<typename NumericT>
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
                                          const NumericT * ell_elements,
                                          const unsigned int * csr_rows,
                                          const unsigned int * csr_cols,
                                          const NumericT * csr_elements,
                                          const NumericT * x,
                                          unsigned int start_x,
                                          unsigned int inc_x,
                                          NumericT * result,
                                          unsigned int start_result,
                                          unsigned int inc_result,
                                          unsigned int row_num,
                                          unsigned int internal_row_num,
                                          unsigned int items_per_row,
                                          unsigned int aligned_items_per_row)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
  {
    NumericT sum = 0;

    // ELL part:
    unsigned int offset = row_id;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = ell_elements[offset];

      if (val != NumericT(0))
      {
        int col = ell_coords[offset];
        sum += (x[col * inc_x + start_x] * val);
      }
    }

    // CSR part:
    unsigned int col_begin = csr_rows[row_id];
    unsigned int col_end   = csr_rows[row_id + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];

    result[row_id * inc_result + start_result] = sum;
  }
}
/** @brief Carries out matrix-vector multiplication with a hyb_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(vec.start()),
                                          static_cast<unsigned int>(vec.stride()),
                                          detail::cuda_arg<NumericT>(result),
                                          static_cast<unsigned int>(result.start()),
                                          static_cast<unsigned int>(result.stride()),
                                          static_cast<unsigned int>(mat.size1()),
                                          static_cast<unsigned int>(mat.internal_size1()),
                                          static_cast<unsigned int>(mat.ell_nnz()),
                                          static_cast<unsigned int>(mat.internal_ellnnz())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
}
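// Usage sketch (illustrative): hyb_matrix splits the nonzeros into a regular ELL
// part (handle2 and the main handle) and a CSR remainder (handles 3 to 5), which
// the kernel above processes one after the other:
//
//   viennacl::hyb_matrix<float> A;          // filled via viennacl::copy()
//   viennacl::vector<float> x(A.size2()), y(A.size1());
//   y = viennacl::linalg::prod(A, x);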
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
                                            const NumericT * ell_elements,
                                            const unsigned int * csr_rows,
                                            const unsigned int * csr_cols,
                                            const NumericT * csr_elements,
                                            unsigned int row_num,
                                            unsigned int internal_row_num,
                                            unsigned int items_per_row,
                                            unsigned int aligned_items_per_row,
                                            const NumericT * d_mat,
                                            unsigned int d_mat_row_start,
                                            unsigned int d_mat_col_start,
                                            unsigned int d_mat_row_inc,
                                            unsigned int d_mat_col_inc,
                                            unsigned int d_mat_row_size,
                                            unsigned int d_mat_col_size,
                                            unsigned int d_mat_internal_rows,
                                            unsigned int d_mat_internal_cols,
                                            NumericT * result,
                                            unsigned int result_row_start,
                                            unsigned int result_col_start,
                                            unsigned int result_row_inc,
                                            unsigned int result_col_inc,
                                            unsigned int result_row_size,
                                            unsigned int result_col_size,
                                            unsigned int result_internal_rows,
                                            unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
    {
      NumericT sum = 0;

      // ELL part:
      unsigned int offset = row_id;
      for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      {
        NumericT val = ell_elements[offset];

        if (val != NumericT(0))
        {
          sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
                                         d_mat_row_start, d_mat_row_inc,
                                         d_mat_col_start, d_mat_col_inc,
                                         d_mat_internal_rows, d_mat_internal_cols)] * val;
        }
      }

      // CSR part:
      unsigned int col_begin = csr_rows[row_id];
      unsigned int col_end   = csr_rows[row_id + 1];

      for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      {
        sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
      }

      result[ResultIndexT::apply(row_id, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = sum;
    }
  }
}
/** @brief Carries out hyb_matrix times dense-matrix multiplication, i.e. result = prod(mat, d_mat) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major> ><<<256, 128>>>
      (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
       static_cast<unsigned int>(mat.size1()),
       static_cast<unsigned int>(mat.internal_size1()),
       static_cast<unsigned int>(mat.ell_nnz()),
       static_cast<unsigned int>(mat.internal_ellnnz()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major> ><<<256, 128>>>
      (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
       static_cast<unsigned int>(mat.size1()),
       static_cast<unsigned int>(mat.internal_size1()),
       static_cast<unsigned int>(mat.ell_nnz()),
       static_cast<unsigned int>(mat.internal_ellnnz()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major> ><<<256, 128>>>
      (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
       static_cast<unsigned int>(mat.size1()),
       static_cast<unsigned int>(mat.internal_size1()),
       static_cast<unsigned int>(mat.ell_nnz()),
       static_cast<unsigned int>(mat.internal_ellnnz()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
  }
  else
  {
    hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major> ><<<256, 128>>>
      (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
       static_cast<unsigned int>(mat.size1()),
       static_cast<unsigned int>(mat.internal_size1()),
       static_cast<unsigned int>(mat.ell_nnz()),
       static_cast<unsigned int>(mat.internal_ellnnz()),
       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),  static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),   static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
  }
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
                                               const NumericT * ell_elements,
                                               const unsigned int * csr_rows,
                                               const unsigned int * csr_cols,
                                               const NumericT * csr_elements,
                                               unsigned int row_num,
                                               unsigned int internal_row_num,
                                               unsigned int items_per_row,
                                               unsigned int aligned_items_per_row,
                                               const NumericT * d_mat,
                                               unsigned int d_mat_row_start,
                                               unsigned int d_mat_col_start,
                                               unsigned int d_mat_row_inc,
                                               unsigned int d_mat_col_inc,
                                               unsigned int d_mat_row_size,
                                               unsigned int d_mat_col_size,
                                               unsigned int d_mat_internal_rows,
                                               unsigned int d_mat_internal_cols,
                                               NumericT * result,
                                               unsigned int result_row_start,
                                               unsigned int result_col_start,
                                               unsigned int result_row_inc,
                                               unsigned int result_col_inc,
                                               unsigned int result_row_size,
                                               unsigned int result_col_size,
                                               unsigned int result_internal_rows,
                                               unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
    {
      NumericT sum = 0;

      // ELL part (transposed access to the dense factor):
      unsigned int offset = row_id;
      for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      {
        NumericT val = ell_elements[offset];

        if (val != NumericT(0))
        {
          sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
                                         d_mat_row_start, d_mat_row_inc,
                                         d_mat_col_start, d_mat_col_inc,
                                         d_mat_internal_rows, d_mat_internal_cols)] * val;
        }
      }

      // CSR part (transposed access to the dense factor):
      unsigned int col_begin = csr_rows[row_id];
      unsigned int col_end   = csr_rows[row_id + 1];

      for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      {
        sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
      }

      result[ResultIndexT::apply(row_id, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = sum;
    }
  }
}
/** @brief Carries out hyb_matrix times transposed dense-matrix multiplication, i.e. result = prod(mat, trans(d_mat)) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                            const viennacl::matrix_base<NumericT>,
                                            viennacl::op_trans > const & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major> ><<<256, 128>>>
      (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
       static_cast<unsigned int>(mat.size1()),
       static_cast<unsigned int>(mat.internal_size1()),
       static_cast<unsigned int>(mat.ell_nnz()),
       static_cast<unsigned int>(mat.internal_ellnnz()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major> ><<<256, 128>>>
      (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
       static_cast<unsigned int>(mat.size1()),
       static_cast<unsigned int>(mat.internal_size1()),
       static_cast<unsigned int>(mat.ell_nnz()),
       static_cast<unsigned int>(mat.internal_ellnnz()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major> ><<<256, 128>>>
      (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
       static_cast<unsigned int>(mat.size1()),
       static_cast<unsigned int>(mat.internal_size1()),
       static_cast<unsigned int>(mat.ell_nnz()),
       static_cast<unsigned int>(mat.internal_ellnnz()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major> ><<<256, 128>>>
      (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
       detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
       detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
       static_cast<unsigned int>(mat.size1()),
       static_cast<unsigned int>(mat.internal_size1()),
       static_cast<unsigned int>(mat.ell_nnz()),
       static_cast<unsigned int>(mat.internal_ellnnz()),
       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),   static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),  static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),   static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
  }
}

} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Simple enable-if variant that uses the SFINAE pattern.
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
const handle_type & handle4() const
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const unsigned int *row_blocks, const NumericT *elements, unsigned int num_blocks, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
A tag class representing a lower triangular matrix.
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
vcl_size_t internal_ellnnz() const
Expression template class for representing a tree of expressions which ultimately result in a matrix...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
vcl_size_t rows_per_block() const
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
statement sum(scalar< NumericT > const *s, vector_base< NumericT > const *x)
vcl_size_t size1() const
Returns the size of the result vector.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Helper struct for accessing an element of a row- or column-major matrix.
vcl_size_t internal_size1() const
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const
Returns the stride within the buffer (in multiples of sizeof(NumericT))
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
vcl_size_t ell_nnz() const
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
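The size()/size1()/size2() free functions in viennacl::traits let generic code query dimensions uniformly across the supported types. A brief sketch, assuming the usual traits headers:

#include <iostream>
#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/traits/size.hpp"

// Works unchanged for any type the traits layer knows about.
template<typename MatrixT, typename VectorT>
void print_dims(MatrixT const & A, VectorT const & v)
{
  std::cout << viennacl::traits::size1(A) << " x "       // number of rows
            << viennacl::traits::size2(A) << ", "        // number of columns
            << viennacl::traits::size(v)  << std::endl;  // vector length
}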
result_of::size_type< T >::type start2(T const &obj)
Sparse matrix class using the ELLPACK format for storing the nonzeros.
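In the ELLPACK format every row is padded to items_per_row entries, and the padded arrays are stored column-major over internal_row_num rows, so the k-th nonzeros of consecutive rows sit next to each other in memory and loads coalesce. The helper below is hypothetical (not part of ViennaCL) and only illustrates the indexing used by ell_matrix_vec_mul_kernel:

// Entry k of row 'row' lives at offset k * internal_row_num + row.
// Padded slots carry a dummy column index and a zero value.
template<typename NumericT>
NumericT ell_entry(NumericT const * elements,
                   unsigned int const * coords,
                   unsigned int internal_row_num,
                   unsigned int row,
                   unsigned int k,
                   unsigned int & col)
{
  unsigned int offset = k * internal_row_num + row;
  col = coords[offset];
  return elements[offset];
}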
const handle_type & handle2() const
A tag class representing an upper triangular matrix.
vcl_size_t internal_size1() const
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
Sparse matrix class using the sliced ELLPACK format with parameters C, σ.
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const
Returns the OpenCL handle to the row index array.
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
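The compressed_compressed_matrix_vec_mul_kernel signature above reveals the extra indirection of this doubly-compressed format: row_jumper holds CSR-style offsets for the nonzero_rows stored rows only, while row_indices maps each stored row back to its global row index. A hedged host-side reference (the roles of the two index arrays are inferred from the signature):

// y must be zero-initialized; rows absent from row_indices stay zero.
template<typename NumericT>
void cc_spmv_reference(unsigned int const * row_jumper,    // size: nonzero_rows + 1
                       unsigned int const * row_indices,   // size: nonzero_rows
                       unsigned int const * column_indices,
                       NumericT const * elements,
                       unsigned int nonzero_rows,
                       NumericT const * x,
                       NumericT * y)
{
  for (unsigned int r = 0; r < nonzero_rows; ++r)
  {
    NumericT dot = NumericT(0);
    for (unsigned int i = row_jumper[r]; i < row_jumper[r + 1]; ++i)
      dot += elements[i] * x[column_indices[i]];
    y[row_indices[r]] = dot;  // scatter to the global row
  }
}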
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
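diag() returns a lazily evaluated vector expression for the k-th diagonal of a dense matrix. A short usage sketch, assuming the usual matrix/vector headers:

#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"

// d receives the main diagonal of A; k = 1 would select the first
// superdiagonal, k = -1 the first subdiagonal.
void diag_example(viennacl::matrix<double> const & A,
                  viennacl::vector<double> & d)
{
  d = viennacl::diag(A, 0);
}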
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
vcl_size_t maxnnz() const
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
Implementations of direct triangular solvers for sparse matrices using CUDA.
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
const handle_type & handle3() const
Returns the OpenCL handle to the row block array.
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result)
void clear()
Resets all entries to zero. Does not change the size of the vector.
Common routines for CUDA execution.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations such as norms and inner products are located in linalg.hpp.
NumericT max(std::vector< NumericT > const &v1)
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const
Returns the length of the vector (cf. std::vector).
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
const handle_type & handle() const
A tag class representing a lower triangular matrix with unit diagonal.
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
A tag class representing transposed matrices.
A sparse square matrix in compressed sparse rows format.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
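The expansion of VIENNACL_CUDA_LAST_ERROR_CHECK is not reproduced on this page. The sketch below shows the common shape of such a check after a kernel launch; it is illustrative only and not necessarily ViennaCL's exact definition:

#include <cstdio>
#include <stdexcept>
#include <cuda_runtime.h>

// Hypothetical stand-in: query the last launch error and report it.
#define CUDA_LAST_ERROR_CHECK_SKETCH(message)                         \
  do {                                                                \
    cudaError_t err_ = cudaGetLastError();                            \
    if (err_ != cudaSuccess) {                                        \
      std::fprintf(stderr, "CUDA error after '%s': %s\n",             \
                   (message), cudaGetErrorString(err_));              \
      throw std::runtime_error("CUDA kernel launch failed");          \
    }                                                                 \
  } while (0)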
A tag for column-major storage of a dense matrix.
LHS & lhs() const
Get left hand side operand.
size_type start() const
Returns the offset within the buffer.
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
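The row-block array groups consecutive CSR rows so that all nonzeros of a block fit into the adaptive kernel's shared-memory staging buffer (1024 entries in the kernel shown earlier); a single row with more nonzeros than that forms its own block and is served by the kernel's reduction path instead. The builder below is a hypothetical host-side sketch of this partitioning, not ViennaCL's implementation:

#include <vector>

inline std::vector<unsigned int>
build_row_blocks(std::vector<unsigned int> const & row_indices,  // CSR offsets, size rows + 1
                 unsigned int rows,
                 unsigned int shared_buffer_size = 1024)
{
  std::vector<unsigned int> row_blocks(1, 0);
  unsigned int block_start = 0;
  unsigned int row = 0;
  while (row < rows)
  {
    unsigned int nnz = row_indices[row + 1] - row_indices[block_start];
    if (nnz > shared_buffer_size)
    {
      // Close the block before 'row'; an oversized single row gets a
      // block of its own (handled by the reduction branch).
      unsigned int end = (row == block_start) ? row + 1 : row;
      row_blocks.push_back(end);
      block_start = end;
      row = end;
    }
    else
      ++row;
  }
  if (row_blocks.back() != rows)
    row_blocks.push_back(rows);
  return row_blocks;
}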
vcl_size_t internal_maxnnz() const
Implementation of the ViennaCL scalar class.
const handle_type & handle() const
Returns the memory handle.
const handle_type & handle3() const
A tag class representing an upper triangular matrix with unit diagonal.
const handle_type & handle5() const
A sparse square matrix, where entries are stored as triplets (i, j, val), where i and j are the row and column indices, respectively.
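For reference, a naive host-side COO product is shown below; the actual coordinate_matrix_vec_mul_kernel instead walks the slice [group_boundaries[g], group_boundaries[g+1]) per block and merges partial row sums in shared memory. The interleaved (row, column) layout of coords is an assumption made for this sketch:

// y must be zero-initialized; coords is assumed to store index pairs
// back to back: coords[2*i] = row, coords[2*i + 1] = column.
template<typename NumericT>
void coo_spmv_reference(unsigned int const * coords,
                        NumericT const * elements,
                        unsigned int nnz,
                        NumericT const * x,
                        NumericT * y)
{
  for (unsigned int i = 0; i < nnz; ++i)
    y[coords[2 * i]] += elements[i] * x[coords[2 * i + 1]];
}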
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)