1 #ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
2 #define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
40 template<
typename NumericT>
43 unsigned int A_start1,
unsigned int A_start2,
44 unsigned int A_inc1,
unsigned int A_inc2,
45 unsigned int A_size1,
unsigned int A_size2,
46 unsigned int A_internal_size1,
unsigned int A_internal_size2,
51 unsigned int B_start1,
unsigned int B_start2,
52 unsigned int B_inc1,
unsigned int B_inc2,
53 unsigned int B_size1,
unsigned int B_size2,
54 unsigned int B_internal_size1,
unsigned int B_internal_size2,
63 for (
unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
65 unsigned int row = A_size1 - 1 - row_cnt;
73 if (row_major_B && transpose_B)
74 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
75 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
76 else if (row_major_B && !transpose_B)
77 B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
78 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
79 else if (!row_major_B && transpose_B)
80 B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
81 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
83 B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
84 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
90 if (row_major_B && transpose_B)
91 temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)];
92 else if (row_major_B && !transpose_B)
93 temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
94 else if (!row_major_B && transpose_B)
95 temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1];
97 temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
100 for (
unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
102 if (row_major_A && transpose_A)
103 entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
104 else if (row_major_A && !transpose_A)
105 entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
106 else if (!row_major_A && transpose_A)
107 entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
109 entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
111 if (row_major_B && transpose_B)
112 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
113 else if (row_major_B && !transpose_B)
114 B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
115 else if (!row_major_B && transpose_B)
116 B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
118 B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
126 template<
typename NumericT>
129 unsigned int A_start1,
unsigned int A_start2,
130 unsigned int A_inc1,
unsigned int A_inc2,
131 unsigned int A_size1,
unsigned int A_size2,
132 unsigned int A_internal_size1,
unsigned int A_internal_size2,
137 unsigned int B_start1,
unsigned int B_start2,
138 unsigned int B_inc1,
unsigned int B_inc2,
139 unsigned int B_size1,
unsigned int B_size2,
140 unsigned int B_internal_size1,
unsigned int B_internal_size2,
149 for (
unsigned int row = 0;
row < A_size1; ++
row)
156 if (threadIdx.x == 0)
158 if (row_major_B && transpose_B)
159 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (
row * B_inc2 + B_start2)] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
160 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
161 else if (row_major_B && !transpose_B)
162 B[(
row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
163 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
164 else if (!row_major_B && transpose_B)
165 B[(blockIdx.x * B_inc1 + B_start1) + (
row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
166 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
168 B[(
row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
169 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
175 if (row_major_B && transpose_B)
176 temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (
row * B_inc2 + B_start2)];
177 else if (row_major_B && !transpose_B)
178 temp = B[(
row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
179 else if (!row_major_B && transpose_B)
180 temp = B[(blockIdx.x * B_inc1 + B_start1) + (
row * B_inc2 + B_start2) * B_internal_size1];
182 temp = B[(
row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
185 for (
unsigned int elim =
row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
187 if (row_major_A && transpose_A)
188 entry_A = A[(
row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
189 else if (row_major_A && !transpose_A)
190 entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)];
191 else if (!row_major_A && transpose_A)
192 entry_A = A[(
row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
194 entry_A = A[(elim * A_inc1 + A_start1) + (
row * A_inc2 + A_start2) * A_internal_size1];
196 if (row_major_B && transpose_B)
197 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
198 else if (row_major_B && !transpose_B)
199 B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
200 else if (!row_major_B && transpose_B)
201 B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
203 B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
216 template<
typename TagT>
222 template<
typename TagT>
228 template<
typename Matrix1T,
typename Matrix2T,
typename SolverTagT>
230 Matrix2T & B,
bool transpose_B,
231 SolverTagT
const & tag)
236 dim3 grid( transpose_B ? B.size1() : B.size2() );
240 matrix_matrix_upper_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
248 detail::cuda_arg<value_type>(B),
261 matrix_matrix_lower_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
269 detail::cuda_arg<value_type>(B),
298 template<
typename NumericT,
typename SolverTagT>
312 template<
typename NumericT>
315 unsigned int A_start1,
unsigned int A_start2,
316 unsigned int A_inc1,
unsigned int A_inc2,
317 unsigned int A_size1,
unsigned int A_size2,
318 unsigned int A_internal_size1,
unsigned int A_internal_size2,
320 unsigned int v_start,
324 unsigned int options)
327 unsigned int unit_diagonal_flag = (options & (1 << 0));
328 unsigned int transposed_access_A = (options & (1 << 1));
329 unsigned int is_lower_solve = (options & (1 << 2));
331 for (
unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)
333 row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
334 if (!unit_diagonal_flag)
337 if (threadIdx.x == 0)
338 v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
343 temp = v[row * v_inc + v_start];
345 for (
int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
346 elim < (is_lower_solve ? A_size1 : row);
348 v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2))
349 : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))];
354 template<
typename NumericT>
357 unsigned int A_start1,
unsigned int A_start2,
358 unsigned int A_inc1,
unsigned int A_inc2,
359 unsigned int A_size1,
unsigned int A_size2,
360 unsigned int A_internal_size1,
unsigned int A_internal_size2,
362 unsigned int v_start,
365 unsigned int options)
368 unsigned int unit_diagonal_flag = (options & (1 << 0));
369 unsigned int transposed_access_A = (options & (1 << 1));
370 unsigned int is_lower_solve = (options & (1 << 2));
372 for (
unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)
374 row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
375 if (!unit_diagonal_flag)
378 if (threadIdx.x == 0)
379 v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
384 temp = v[row * v_inc + v_start];
386 for (
int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
387 elim < (is_lower_solve ? A_size1 : row);
389 v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1)
390 : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)];
402 template<
typename MatrixT,
typename VectorT>
405 unsigned int options)
411 triangular_substitute_inplace_row_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
416 detail::cuda_arg<value_type>(vec),
425 triangular_substitute_inplace_col_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
430 detail::cuda_arg<value_type>(vec),
447 template<
typename NumericT,
typename SolverTagT>
void inplace_solve_impl(Matrix1T const &A, bool transpose_A, Matrix2T &B, bool transpose_B, SolverTagT const &tag)
void inplace_solve_vector_impl(MatrixT const &mat, VectorT &vec, unsigned int options)
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
__global__ void triangular_substitute_inplace_row_kernel(NumericT const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
Implementation of the dense matrix class.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
A tag class representing a lower triangular matrix.
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
__global__ void matrix_matrix_upper_solve_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, bool transpose_A, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool transpose_B, bool unit_diagonal)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
bool is_unit_solve(TagT const &tag)
__global__ void matrix_matrix_lower_solve_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, bool transpose_A, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool transpose_B, bool unit_diagonal)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
A tag class representing an upper triangular matrix.
void inplace_solve(const matrix_base< NumericT > &A, bool trans_A, matrix_base< NumericT > &B, bool trans_B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
result_of::size_type< T >::type start(T const &obj)
__global__ void triangular_substitute_inplace_col_kernel(NumericT const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Common routines for CUDA execution.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
A tag class representing a lower triangular matrix with unit diagonal.
unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)
A tag class representing an upper triangular matrix with unit diagonal.
bool is_upper_solve(TagT const &tag)