ViennaCL - The Vienna Computing Library  1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
sparse_matrix_operations.hpp
#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
    @brief Implementations of operations on sparse matrices using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
//
// Compressed matrix
//

namespace detail
{

  template<typename NumericT>
  __global__ void csr_row_info_extractor_kernel(
            const unsigned int * row_indices,
            const unsigned int * column_indices,
            const NumericT * elements,
            NumericT * result,
            unsigned int size,
            unsigned int option)
  {
    for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                      row  < size;
                      row += gridDim.x * blockDim.x)
    {
      NumericT value = 0;
      unsigned int row_end = row_indices[row+1];

      switch (option)
      {
        case 0: //inf-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value = max(value, fabs(elements[i]));
          break;

        case 1: //1-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value += fabs(elements[i]);
          break;

        case 2: //2-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value += elements[i] * elements[i];
          value = sqrt(value);
          break;

        case 3: //diagonal entry
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
          {
            if (column_indices[i] == row)
            {
              value = elements[i];
              break;
            }
          }
          break;

        default:
          break;
      }
      result[row] = value;
    }
  }


  template<typename NumericT, unsigned int AlignmentV>
  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(mat.size1()),
                                                static_cast<unsigned int>(info_selector)
                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
  }

} //namespace detail


template<typename NumericT>
__global__ void compressed_matrix_vec_mul_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size_result;
                    row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
    result[row * inc_result + start_result] = dot_prod;
  }
}


/** @brief Carries out matrix-vector multiplication with a compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<class NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                 detail::cuda_arg<NumericT>(vec),
                                                 static_cast<unsigned int>(vec.start()),
                                                 static_cast<unsigned int>(vec.stride()),
                                                 detail::cuda_arg<NumericT>(result),
                                                 static_cast<unsigned int>(result.start()),
                                                 static_cast<unsigned int>(result.stride()),
                                                 static_cast<unsigned int>(result.size())
                                                );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
}
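
// Usage sketch (editor's illustration, not part of the original file): the kernel above
// is reached through the convenience function viennacl::linalg::prod(). A minimal
// host-side sparse matrix-vector product, assuming a CUDA-enabled build and
// hypothetical names A, x, y:
//
//   #include "viennacl/compressed_matrix.hpp"
//   #include "viennacl/vector.hpp"
//   #include "viennacl/linalg/prod.hpp"
//
//   viennacl::compressed_matrix<float> A(m, n);   // fill from host data via viennacl::copy()
//   viennacl::vector<float> x(n), y(m);
//   y = viennacl::linalg::prod(A, x);             // dispatches to prod_impl() above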

/** @brief Helper struct for computing the linear storage index of an element of a row- or column-major dense matrix. */
template<typename LayoutT>
struct mat_mult_matrix_index
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
  }
};

template<>
struct mat_mult_matrix_index<viennacl::column_major>
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
  }
};
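
// Worked example (editor's illustration): with row_start = col_start = 0 and unit
// increments, element (i, j) = (2, 3) of a matrix with internal_cols == 8 maps to
// linear index 2*8 + 3 == 19 under mat_mult_matrix_index<row_major>, but to
// 2 + 3*internal_rows under mat_mult_matrix_index<viennacl::column_major>.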

template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = r;
    }
  }
}


/** @brief Carries out sparse-matrix times dense-matrix multiplication, where the sparse matrix is a compressed_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat   The sparse matrix
* @param d_mat    The dense matrix
* @param result   The result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
}
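
// Usage sketch (editor's illustration, not part of the original file): this dispatch is
// reached through viennacl::linalg::prod() with a dense right-hand-side factor
// (hypothetical names):
//
//   viennacl::compressed_matrix<float> A(m, k);
//   viennacl::matrix<float> B(k, n), C(m, n);
//   C = viennacl::linalg::prod(A, B);   // picks the row-/column-major kernel variant above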


template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }
}

/** @brief Carries out matrix-trans(matrix) multiplication, where the sparse matrix is a compressed_matrix and the dense matrix is transposed
*
* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
*
* @param sp_mat   The sparse matrix
* @param d_mat    The transposed dense matrix
* @param result   The result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{

  if (d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
}
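
// Usage sketch (editor's illustration): the transposed variant is requested by wrapping
// the dense factor in viennacl::trans(); B below is hypothetical and of size n x k:
//
//   C = viennacl::linalg::prod(A, viennacl::trans(B));   // C = A * B^T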


//
// triangular solves for compressed_matrix
//

template<typename NumericT>
__global__ void compressed_matrix_diagonal_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT diag = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}


/** @brief Carries out an inplace solve with a lower triangular matrix with unit diagonal
*
* @param mat  The (sparse) system matrix
* @param vec  The right-hand-side vector; overwritten with the solution
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                         detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                         detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                         detail::cuda_arg<NumericT>(vec),
                                         static_cast<unsigned int>(mat.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}


/** @brief Carries out an inplace solve with a lower triangular matrix
*
* @param mat  The (sparse) system matrix
* @param vec  The right-hand-side vector; overwritten with the solution
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                    detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                    detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                    detail::cuda_arg<NumericT>(vec),
                                    static_cast<unsigned int>(mat.size1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}



/** @brief Carries out an inplace solve with an upper triangular matrix with unit diagonal
*
* @param mat  The (sparse) system matrix
* @param vec  The right-hand-side vector; overwritten with the solution
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}


/** @brief Carries out an inplace solve with an upper triangular matrix
*
* @param mat  The (sparse) system matrix
* @param vec  The right-hand-side vector; overwritten with the solution
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                     detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                     detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                     detail::cuda_arg<NumericT>(vec),
                                     static_cast<unsigned int>(mat.size1())
                                    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}
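
// Usage sketch (editor's illustration): the four overloads above are selected via tag
// dispatch from viennacl::linalg::inplace_solve() (hypothetical names A, rhs):
//
//   viennacl::linalg::inplace_solve(A, rhs, viennacl::linalg::unit_lower_tag());
//   viennacl::linalg::inplace_solve(A, rhs, viennacl::linalg::upper_tag());
//
// rhs is overwritten with the solution in place.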



// transposed

/** @brief Carries out an inplace solve of trans(mat) with a lower triangular matrix with unit diagonal
*
* @param mat  The transposed (sparse) system matrix
* @param vec  The right-hand-side vector; overwritten with the solution
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                               detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                               detail::cuda_arg<NumericT>(vec),
                                               static_cast<unsigned int>(mat.lhs().size1())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}


/** @brief Carries out an inplace solve of trans(mat) with a lower triangular matrix
*
* @param mat  The transposed (sparse) system matrix
* @param vec  The right-hand-side vector; overwritten with the solution
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(diagonal),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(mat.lhs().size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}


/** @brief Carries out an inplace solve of trans(mat) with an upper triangular matrix with unit diagonal
*
* @param mat  The transposed (sparse) system matrix
* @param vec  The right-hand-side vector; overwritten with the solution
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}


/** @brief Carries out an inplace solve of trans(mat) with an upper triangular matrix
*
* @param mat  The transposed (sparse) system matrix
* @param vec  The right-hand-side vector; overwritten with the solution
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                           detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                           detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                           detail::cuda_arg<NumericT>(diagonal),
                                           detail::cuda_arg<NumericT>(vec),
                                           static_cast<unsigned int>(mat.lhs().size1())
                                          );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}
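
// Usage sketch (editor's illustration): the transposed overloads are selected by passing
// a trans() expression (hypothetical names):
//
//   viennacl::linalg::inplace_solve(viennacl::trans(A), rhs, viennacl::linalg::lower_tag());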

namespace detail
{
  //
  // block solves
  //
  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_lower_tag)
  {
    csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
                                                         detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
                                                         detail::cuda_arg<NumericT>(L.lhs().handle().cuda_handle()),
                                                         detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                         detail::cuda_arg<NumericT>(vec),
                                                         static_cast<unsigned int>(L.lhs().size1())
                                                        );
  }


  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & U_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::upper_tag)
  {
    csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
                                                     detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(U.lhs().handle().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(U_diagonal.handle().cuda_handle()),
                                                     detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                     detail::cuda_arg<NumericT>(vec),
                                                     static_cast<unsigned int>(U.lhs().size1())
                                                    );
  }


} //namespace detail


//
// Compressed Compressed Matrix
//

template<typename NumericT>
__global__ void compressed_compressed_matrix_vec_mul_kernel(
          const unsigned int * row_jumper,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          unsigned int nonzero_rows,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < size_result;
                    i += gridDim.x * blockDim.x)
  {
    result[i * inc_result + start_result] = 0;
  }

  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
    result[row_indices[i] * inc_result + start_result] = dot_prod;
  }
}


/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                            detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            detail::cuda_arg<NumericT>(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            detail::cuda_arg<NumericT>(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size())
                                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}
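
// Usage sketch (editor's illustration): a compressed_compressed_matrix stores only rows
// with at least one nonzero, which is why the kernel above first zeros the full result
// vector. User code is the same prod() call (hypothetical names):
//
//   viennacl::compressed_compressed_matrix<float> A(m, n);
//   viennacl::vector<float> x(n), y(m);
//   y = viennacl::linalg::prod(A, x);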

//
// Coordinate Matrix
//


namespace detail
{

  template<typename NumericT>
  __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
                                          const NumericT * elements,
                                          const unsigned int * group_boundaries,
                                          NumericT * result,
                                          unsigned int option)
  {
    __shared__ unsigned int shared_rows[128];
    __shared__ NumericT inter_results[128];

    uint2 tmp;
    NumericT val;
    unsigned int last_index  = blockDim.x - 1;
    unsigned int group_start = group_boundaries[blockIdx.x];
    unsigned int group_end   = group_boundaries[blockIdx.x + 1];
    unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

    unsigned int local_index = 0;

    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[last_index])
        {
          switch (option)
          {
            case 0: //inf-norm
            case 3: //diagonal entry
              val = max(val, fabs(inter_results[last_index]));
              break;

            case 1: //1-norm
              val = fabs(val) + inter_results[last_index];
              break;

            case 2: //2-norm
              val = sqrt(val * val + inter_results[last_index]);
              break;

            default:
              break;
          }
        }
        else
        {
          switch (option)
          {
            case 0: //inf-norm
            case 1: //1-norm
            case 3: //diagonal entry
              result[shared_rows[last_index]] = inter_results[last_index];
              break;

            case 2: //2-norm
              result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
            default:
              break;
          }
        }
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      switch (option)
      {
        case 0:
        case 3:
          inter_results[threadIdx.x] = val;
          break;
        case 1:
          inter_results[threadIdx.x] = fabs(val);
          break;
        case 2:
          inter_results[threadIdx.x] = val * val;
        default:
          break;
      }
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        switch (option)
        {
          case 0: //inf-norm
          case 3: //diagonal entry
            inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
            break;

          case 1: //1-norm
            inter_results[threadIdx.x] += left;
            break;

          case 2: //2-norm
            inter_results[threadIdx.x] += left;
            break;

          default:
            break;
        }
        __syncthreads();
      }
      //segmented parallel reduction end

      if (threadIdx.x != last_index &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
          inter_results[threadIdx.x] != 0)
      {
        result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (threadIdx.x == last_index && inter_results[last_index] != 0)
      result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index];
  }

  template<typename NumericT, unsigned int AlignmentV>
  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                        detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                        detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                        detail::cuda_arg<NumericT>(vec),
                                        static_cast<unsigned int>(info_selector)
                                       );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
  }

} //namespace detail


template<typename NumericT>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                 const NumericT * elements,
                                                 const unsigned int * group_boundaries,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result
                                                )
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    //segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end)
    result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
}


/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  result.clear();

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                detail::cuda_arg<NumericT>(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}
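
// Usage sketch (editor's illustration): the COO kernel above writes rows via a segmented
// reduction and therefore requires the result.clear() seen in prod_impl(); from user
// code the call is again (hypothetical names):
//
//   viennacl::coordinate_matrix<float> A(m, n);
//   viennacl::vector<float> x(n), y(m);
//   y = viennacl::linalg::prod(A, x);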




template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                   const NumericT * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  }
}


/** @brief Carries out sparse-matrix times dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat   The sparse matrix
* @param d_mat    The dense matrix
* @param result   The result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
}

template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,
                                                      unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,
                                                      unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,
                                                      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows,
                                                      unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,
                                                      unsigned int result_col_start,
                                                      unsigned int result_row_inc,
                                                      unsigned int result_col_inc,
                                                      unsigned int result_row_size,
                                                      unsigned int result_col_size,
                                                      unsigned int result_internal_rows,
                                                      unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols)] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ ResultIndexT::apply(tmp.x, result_col,
                                    result_row_start, result_row_inc,
                                    result_col_start, result_col_inc,
                                    result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ ResultIndexT::apply(tmp.x, result_col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
  }
}

/** @brief Carries out matrix-trans(matrix) multiplication, where the sparse matrix is a coordinate_matrix and the dense matrix is transposed
*
* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
*
* @param sp_mat   The sparse matrix
* @param d_mat    The transposed dense matrix
* @param result   The result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
1411  if (d_mat.lhs().row_major() && result.row_major())
1412  {
1413  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
1414  (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
1415  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1416  detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
1417 
1418  detail::cuda_arg<NumericT>(d_mat.lhs()),
1419  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1420  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1421  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1422  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1423 
1424  detail::cuda_arg<NumericT>(result),
1425  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1426  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1427  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1428  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1429  );
1430  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1431  }
1432  else if (d_mat.lhs().row_major() && !result.row_major())
1433  {
1434  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1435  (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
1436  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1437  detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
1438 
1439  detail::cuda_arg<NumericT>(d_mat.lhs()),
1440  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1441  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1442  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1443  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1444 
1445  detail::cuda_arg<NumericT>(result),
1446  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1447  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1448  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1449  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1450  );
1451  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1452  }
1453  else if (!d_mat.lhs().row_major() && result.row_major())
1454  {
1455  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
1456  (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
1457  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1458  detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
1459 
1460  detail::cuda_arg<NumericT>(d_mat.lhs()),
1461  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1462  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1463  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1464  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1465 
1466  detail::cuda_arg<NumericT>(result),
1467  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1468  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1469  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1470  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1471  );
1472  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1473  }
1474  else
1475  {
1476  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1477  (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
1478  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1479  detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
1480 
1481  detail::cuda_arg<NumericT>(d_mat.lhs()),
1482  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1483  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1484  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1485  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1486 
1487  detail::cuda_arg<NumericT>(result),
1488  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1489  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1490  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1491  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1492  );
1493  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1494  }
1495 }
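// Note on the dispatch pattern above (illustrative, not part of the original
// source): all four branches launch the same kernel and differ only in the
// <DMatIndexT, ResultIndexT> template arguments. The
// mat_mult_matrix_index<row_major> helper maps a logical (i, j) to a linear
// buffer offset roughly as
//
//   offset = (row_start + i * row_inc) * internal_cols + (col_start + j * col_inc)
//
// while the column_major variant swaps the roles of rows and columns.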
1496 
1497 
1498 //
1499 // ELL Matrix
1500 //
1501 
1502 template<typename NumericT>
1503 __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
1504  const NumericT * elements,
1505  const NumericT * x,
1506  unsigned int start_x,
1507  unsigned int inc_x,
1508  NumericT * result,
1509  unsigned int start_result,
1510  unsigned int inc_result,
1511  unsigned int row_num,
1512  unsigned int col_num,
1513  unsigned int internal_row_num,
1514  unsigned int items_per_row,
1515  unsigned int aligned_items_per_row
1516  )
1517 {
1518  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1519  unsigned int glb_sz = gridDim.x * blockDim.x;
1520 
1521  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1522  {
1523  NumericT sum = 0;
1524 
1525  unsigned int offset = row_id;
1526  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1527  {
1528  NumericT val = elements[offset];
1529 
1530  if (val != NumericT(0))
1531  {
1532  int col = coords[offset];
1533  sum += x[col * inc_x + start_x] * val;
1534  }
1535  }
1536 
1537  result[row_id * inc_result + start_result] = sum;
1538  }
1539 }
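// Illustrative example (not part of the original source) of the ELL layout the
// kernel above walks: every row is padded to items_per_row entries and the
// entries are stored column-by-column, so entry k of row r sits at index
// r + k * internal_row_num. For a hypothetical 3x3 matrix with
// items_per_row == 2 and internal_row_num == 3:
//
//   [10  0 20]
//   [ 0 30  0]   ==>   coords   = { 0, 1, 0,   2, 0, 2 }    // pad uses column 0
//   [40  0 50]         elements = { 10, 30, 40,   20, 0, 50 }
//
// The padding entries carry the value 0 and are skipped by the
// "val != NumericT(0)" test in the kernel.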
1540 
1541 
1541 /** @brief Carries out matrix-vector multiplication with an ell_matrix
1542 *
1543 * Implementation of the convenience expression result = prod(mat, vec);
1544 *
1545 * @param mat    The matrix
1546 * @param vec    The vector
1547 * @param result The result vector
1548 */
1549 
1550 template<typename NumericT, unsigned int AlignmentV>
1551 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
1552  const viennacl::vector_base<NumericT> & vec,
1553  viennacl::vector_base<NumericT> & result)
1554 {
1555  ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
1556  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
1557  detail::cuda_arg<NumericT>(vec),
1558  static_cast<unsigned int>(vec.start()),
1559  static_cast<unsigned int>(vec.stride()),
1560  detail::cuda_arg<NumericT>(result),
1561  static_cast<unsigned int>(result.start()),
1562  static_cast<unsigned int>(result.stride()),
1563  static_cast<unsigned int>(mat.size1()),
1564  static_cast<unsigned int>(mat.size2()),
1565  static_cast<unsigned int>(mat.internal_size1()),
1566  static_cast<unsigned int>(mat.maxnnz()),
1567  static_cast<unsigned int>(mat.internal_maxnnz())
1568  );
1569  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
1570 }
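// Usage sketch (illustrative; assumes a populated matrix and suitably sized
// vectors, not a definitive recipe from this header):
//
//   viennacl::ell_matrix<float> A;          // filled via viennacl::copy(host_A, A)
//   viennacl::vector<float> x(A.size2());
//   viennacl::vector<float> y(A.size1());
//   y = viennacl::linalg::prod(A, x);       // dispatches to the prod_impl() above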
1571 
1572 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
1573 __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
1574  const NumericT * sp_mat_elements,
1575  unsigned int sp_mat_row_num,
1576  unsigned int sp_mat_col_num,
1577  unsigned int sp_mat_internal_row_num,
1578  unsigned int sp_mat_items_per_row,
1579  unsigned int sp_mat_aligned_items_per_row,
1580  const NumericT * d_mat,
1581  unsigned int d_mat_row_start,
1582  unsigned int d_mat_col_start,
1583  unsigned int d_mat_row_inc,
1584  unsigned int d_mat_col_inc,
1585  unsigned int d_mat_row_size,
1586  unsigned int d_mat_col_size,
1587  unsigned int d_mat_internal_rows,
1588  unsigned int d_mat_internal_cols,
1589  NumericT * result,
1590  unsigned int result_row_start,
1591  unsigned int result_col_start,
1592  unsigned int result_row_inc,
1593  unsigned int result_col_inc,
1594  unsigned int result_row_size,
1595  unsigned int result_col_size,
1596  unsigned int result_internal_rows,
1597  unsigned int result_internal_cols)
1598 {
1599  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1600  unsigned int glb_sz = gridDim.x * blockDim.x;
1601 
1602  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
1603  {
1604  unsigned int row = rc % sp_mat_row_num;
1605  unsigned int col = rc / sp_mat_row_num;
1606 
1607  unsigned int offset = row;
1608  NumericT r = (NumericT)0;
1609 
1610  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1611  {
1612  unsigned int j = sp_mat_coords[offset];
1613  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1614 
1615  if (x != (NumericT)0)
1616  {
1617  NumericT y = d_mat[ DMatIndexT::apply(j, col,
1618  d_mat_row_start, d_mat_row_inc,
1619  d_mat_col_start, d_mat_col_inc,
1620  d_mat_internal_rows, d_mat_internal_cols) ];
1621 
1622  r += x*y;
1623  }
1624  }
1625  result [ ResultIndexT::apply(row, col,
1626  result_row_start, result_row_inc,
1627  result_col_start, result_col_inc,
1628  result_internal_rows, result_internal_cols) ] = r;
1629  }
1630 
1631 }
1632 
1633 /** @brief Carries out Sparse Matrix(ELL)-Dense Matrix multiplication
1634 *
1635 * Implementation of the convenience expression result = prod(sp_mat, d_mat);
1636 *
1637 * @param sp_mat The sparse matrix (ELL format)
1638 * @param d_mat  The dense matrix
1639 * @param result The result dense matrix
1640 */
1641 
1642 template<typename NumericT, unsigned int AlignmentV>
1643 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1644  const viennacl::matrix_base<NumericT> & d_mat,
1645  viennacl::matrix_base<NumericT> & result)
1646 {
1647  if (d_mat.row_major() && result.row_major())
1648  {
1649  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1650  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1651  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1652  static_cast<unsigned int>(sp_mat.size1()),
1653  static_cast<unsigned int>(sp_mat.size2()),
1654  static_cast<unsigned int>(sp_mat.internal_size1()),
1655  static_cast<unsigned int>(sp_mat.maxnnz()),
1656  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1657  detail::cuda_arg<NumericT>(d_mat),
1658  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1659  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1660  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1661  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1662 
1663  detail::cuda_arg<NumericT>(result),
1664  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1665  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1666  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1667  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1668  );
1669  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1670  }
1671  else if (d_mat.row_major() && !result.row_major())
1672  {
1673  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1674  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1675  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1676  static_cast<unsigned int>(sp_mat.size1()),
1677  static_cast<unsigned int>(sp_mat.size2()),
1678  static_cast<unsigned int>(sp_mat.internal_size1()),
1679  static_cast<unsigned int>(sp_mat.maxnnz()),
1680  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1681  detail::cuda_arg<NumericT>(d_mat),
1682  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1683  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1684  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1685  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1686 
1687  detail::cuda_arg<NumericT>(result),
1688  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1689  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1690  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1691  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1692  );
1693  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1694  }
1695  else if (!d_mat.row_major() && result.row_major())
1696  {
1697  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1698  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1699  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1700  static_cast<unsigned int>(sp_mat.size1()),
1701  static_cast<unsigned int>(sp_mat.size2()),
1702  static_cast<unsigned int>(sp_mat.internal_size1()),
1703  static_cast<unsigned int>(sp_mat.maxnnz()),
1704  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1705  detail::cuda_arg<NumericT>(d_mat),
1706  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1707  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1708  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1709  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1710 
1711  detail::cuda_arg<NumericT>(result),
1712  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1713  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1714  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1715  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1716  );
1717  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1718  }
1719  else
1720  {
1721  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1722  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1723  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1724  static_cast<unsigned int>(sp_mat.size1()),
1725  static_cast<unsigned int>(sp_mat.size2()),
1726  static_cast<unsigned int>(sp_mat.internal_size1()),
1727  static_cast<unsigned int>(sp_mat.maxnnz()),
1728  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1729  detail::cuda_arg<NumericT>(d_mat),
1730  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1731  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1732  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1733  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1734 
1735  detail::cuda_arg<NumericT>(result),
1736  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1737  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1738  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1739  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1740  );
1741  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1742  }
1743 }
1744 
1745 template<typename DMatIndexT, typename ResultIndexT, typename NumericT >
1746 __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
1747  const NumericT * sp_mat_elements,
1748  unsigned int sp_mat_row_num,
1749  unsigned int sp_mat_col_num,
1750  unsigned int sp_mat_internal_row_num,
1751  unsigned int sp_mat_items_per_row,
1752  unsigned int sp_mat_aligned_items_per_row,
1753  const NumericT * d_mat,
1754  unsigned int d_mat_row_start,
1755  unsigned int d_mat_col_start,
1756  unsigned int d_mat_row_inc,
1757  unsigned int d_mat_col_inc,
1758  unsigned int d_mat_row_size,
1759  unsigned int d_mat_col_size,
1760  unsigned int d_mat_internal_rows,
1761  unsigned int d_mat_internal_cols,
1762  NumericT * result,
1763  unsigned int result_row_start,
1764  unsigned int result_col_start,
1765  unsigned int result_row_inc,
1766  unsigned int result_col_inc,
1767  unsigned int result_row_size,
1768  unsigned int result_col_size,
1769  unsigned int result_internal_rows,
1770  unsigned int result_internal_cols)
1771 {
1772  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1773  unsigned int glb_sz = gridDim.x * blockDim.x;
1774 
1775  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
1776  {
1777  unsigned int row = rc % sp_mat_row_num;
1778  unsigned int col = rc / sp_mat_row_num;
1779 
1780  unsigned int offset = row;
1781  NumericT r = (NumericT)0;
1782 
1783  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1784  {
1785  unsigned int j = sp_mat_coords[offset];
1786  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1787 
1788  if (x != (NumericT)0)
1789  {
1790  NumericT y = d_mat[ DMatIndexT::apply(col, j,
1791  d_mat_row_start, d_mat_row_inc,
1792  d_mat_col_start, d_mat_col_inc,
1793  d_mat_internal_rows, d_mat_internal_cols) ];
1794 
1795  r += x*y;
1796  }
1797  }
1798  result [ ResultIndexT::apply(row, col,
1799  result_row_start, result_row_inc,
1800  result_col_start, result_col_inc,
1801  result_internal_rows, result_internal_cols) ] = r;
1802  }
1803 
1804 }
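// Illustrative note (not from the original source): the only difference to
// ell_matrix_d_mat_mul_kernel above is the swapped index order in
// DMatIndexT::apply(col, j, ...), which reads element (col, j) of d_mat,
// i.e. element (j, col) of trans(d_mat), so no explicit transpose is formed.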
1805 
1806 /** @brief Carries out Sparse Matrix(ELL)-Dense Transposed Matrix multiplication
1807 *
1808 * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
1809 *
1810 * @param sp_mat The sparse matrix (ELL format)
1811 * @param d_mat  The transposed dense matrix
1812 * @param result The result dense matrix
1813 */
1814 
1815 template<typename NumericT, unsigned int AlignmentV>
1816 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1817  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
1818  const viennacl::matrix_base<NumericT>,
1819  viennacl::op_trans > & d_mat,
1820  viennacl::matrix_base<NumericT> & result)
1821 {
1822  if (d_mat.lhs().row_major() && result.row_major())
1823  {
1824  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1825  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1826  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1827  static_cast<unsigned int>(sp_mat.size1()),
1828  static_cast<unsigned int>(sp_mat.size2()),
1829  static_cast<unsigned int>(sp_mat.internal_size1()),
1830  static_cast<unsigned int>(sp_mat.maxnnz()),
1831  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1832 
1833  detail::cuda_arg<NumericT>(d_mat.lhs()),
1834  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1835  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1836  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1837  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1838 
1839  detail::cuda_arg<NumericT>(result),
1840  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1841  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1842  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1843  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1844  );
1845  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1846  }
1847  else if (d_mat.lhs().row_major() && !result.row_major())
1848  {
1849  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1850  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1851  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1852  static_cast<unsigned int>(sp_mat.size1()),
1853  static_cast<unsigned int>(sp_mat.size2()),
1854  static_cast<unsigned int>(sp_mat.internal_size1()),
1855  static_cast<unsigned int>(sp_mat.maxnnz()),
1856  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1857 
1858  detail::cuda_arg<NumericT>(d_mat.lhs()),
1859  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1860  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1861  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1862  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1863 
1864  detail::cuda_arg<NumericT>(result),
1865  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1866  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1867  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1868  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1869  );
1870  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1871  }
1872  else if (!d_mat.lhs().row_major() && result.row_major())
1873  {
1874  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1875  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1876  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1877  static_cast<unsigned int>(sp_mat.size1()),
1878  static_cast<unsigned int>(sp_mat.size2()),
1879  static_cast<unsigned int>(sp_mat.internal_size1()),
1880  static_cast<unsigned int>(sp_mat.maxnnz()),
1881  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1882 
1883  detail::cuda_arg<NumericT>(d_mat.lhs()),
1884  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1885  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1886  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1887  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1888 
1889  detail::cuda_arg<NumericT>(result),
1890  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1891  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1892  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1893  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1894  );
1895  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1896  }
1897  else
1898  {
1899  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1900  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1901  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1902  static_cast<unsigned int>(sp_mat.size1()),
1903  static_cast<unsigned int>(sp_mat.size2()),
1904  static_cast<unsigned int>(sp_mat.internal_size1()),
1905  static_cast<unsigned int>(sp_mat.maxnnz()),
1906  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1907 
1908  detail::cuda_arg<NumericT>(d_mat.lhs()),
1909  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1910  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1911  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1912  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1913 
1914  detail::cuda_arg<NumericT>(result),
1915  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1916  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1917  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1918  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1919  );
1920  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1921  }
1922 }
1923 
1924 //
1925 // SELL-C-σ Matrix
1926 //
1927 
1928 template<typename NumericT>
1929 __global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int * columns_per_block,
1930  const unsigned int * column_indices,
1931  const unsigned int * block_start,
1932  const NumericT * elements,
1933  const NumericT * x,
1934  unsigned int start_x,
1935  unsigned int inc_x,
1936  unsigned int size_x,
1937  NumericT * result,
1938  unsigned int start_result,
1939  unsigned int inc_result,
1940  unsigned int size_result)
1941 {
1942  unsigned int local_id = threadIdx.x;
1943  unsigned int local_size = blockDim.x;
1944  unsigned int num_rows = size_result;
1945 
1946  for (unsigned int block_idx = blockIdx.x; block_idx <= num_rows / local_size; block_idx += gridDim.x)
1947  {
1948  unsigned int row = block_idx * local_size + local_id;
1949  unsigned int offset = block_start[block_idx];
1950  unsigned int num_columns = columns_per_block[block_idx];
1951 
1952  NumericT sum = 0;
1953  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
1954  {
1955  unsigned int index = offset + item_id * local_size + local_id;
1956  NumericT val = elements[index];
1957 
1958  sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
1959  }
1960 
1961  if (row < num_rows)
1962  result[row * inc_result + start_result] = sum;
1963  }
1964 }
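// Illustrative example (not part of the original source) of the SELL-C-σ
// layout the kernel above assumes: rows are grouped into blocks of
// C = blockDim.x rows; block b holds columns_per_block[b] padded entries per
// row, stored column-major starting at block_start[b], so thread t reads
// elements[block_start[b] + item_id * C + t]. With C == 2:
//
//   rows {0,1}, at most 3 nonzeros each  ->  columns_per_block[0] = 3
//   rows {2,3}, at most 1 nonzero  each  ->  columns_per_block[1] = 1
//   block_start = { 0, 6 }   // block 0 occupies 2*3 = 6 padded entries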
1965 
1965 /** @brief Carries out matrix-vector multiplication with a sliced_ell_matrix
1966 *
1967 * Implementation of the convenience expression result = prod(mat, vec);
1968 *
1969 * @param mat    The matrix
1970 * @param vec    The vector
1971 * @param result The result vector
1972 */
1973 
1974 template<typename NumericT, typename IndexT>
1975 void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
1976  const viennacl::vector_base<NumericT> & vec,
1977  viennacl::vector_base<NumericT> & result)
1978 {
1979  sliced_ell_matrix_vec_mul_kernel<<<128, mat.rows_per_block()>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
1980  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
1981  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
1982  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
1983  detail::cuda_arg<NumericT>(vec),
1984  static_cast<unsigned int>(vec.start()),
1985  static_cast<unsigned int>(vec.stride()),
1986  static_cast<unsigned int>(vec.size()),
1987  detail::cuda_arg<NumericT>(result),
1988  static_cast<unsigned int>(result.start()),
1989  static_cast<unsigned int>(result.stride()),
1990  static_cast<unsigned int>(result.size())
1991  );
1992  VIENNACL_CUDA_LAST_ERROR_CHECK("sliced_ell_matrix_vec_mul_kernel");
1993 }
1994 
1995 
1996 //
1997 // Hybrid Matrix
1998 //
1999 
2000 
2001 template<typename NumericT>
2002 __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
2003  const NumericT * ell_elements,
2004  const unsigned int * csr_rows,
2005  const unsigned int * csr_cols,
2006  const NumericT * csr_elements,
2007  const NumericT * x,
2008  unsigned int start_x,
2009  unsigned int inc_x,
2010  NumericT * result,
2011  unsigned int start_result,
2012  unsigned int inc_result,
2013  unsigned int row_num,
2014  unsigned int internal_row_num,
2015  unsigned int items_per_row,
2016  unsigned int aligned_items_per_row
2017  )
2018 {
2019  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2020  unsigned int glb_sz = gridDim.x * blockDim.x;
2021 
2022  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2023  {
2024  NumericT sum = 0;
2025 
2026  unsigned int offset = row_id;
2027  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2028  {
2029  NumericT val = ell_elements[offset];
2030 
2031 
2032  if (val != NumericT(0))
2033  {
2034  int col = ell_coords[offset];
2035  sum += (x[col * inc_x + start_x] * val);
2036  }
2037  }
2038 
2039  unsigned int col_begin = csr_rows[row_id];
2040  unsigned int col_end = csr_rows[row_id + 1];
2041 
2042  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2043  sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];
2044 
2045  result[row_id * inc_result + start_result] = sum;
2046  }
2047 }
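// Illustrative example (not from the original source) of the HYB split the
// kernel above traverses: each row keeps up to items_per_row entries in a
// regular ELL part (coalesced loads) and spills the remaining entries into a
// small CSR part. For a hypothetical 2x4 matrix with items_per_row == 1:
//
//   [1 2 3 0]        ELL: ell_coords = { 0, 1 },  ell_elements = { 1, 4 }
//   [0 4 0 0]  ==>   CSR: csr_rows = { 0, 2, 2 }, csr_cols = { 1, 2 },
//                         csr_elements = { 2, 3 }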
2048 
2049 
2050 
2051 /** @brief Carries out matrix-vector multiplication with a hyb_matrix
2052 *
2053 * Implementation of the convenience expression result = prod(mat, vec);
2054 *
2055 * @param mat    The matrix
2056 * @param vec    The vector
2057 * @param result The result vector
2058 */
2059 template<typename NumericT, unsigned int AlignmentV>
2060 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2061  const viennacl::vector_base<NumericT> & vec,
2062  viennacl::vector_base<NumericT> & result)
2063 {
2064  hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2065  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2066  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2067  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2068  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2069  detail::cuda_arg<NumericT>(vec),
2070  static_cast<unsigned int>(vec.start()),
2071  static_cast<unsigned int>(vec.stride()),
2072  detail::cuda_arg<NumericT>(result),
2073  static_cast<unsigned int>(result.start()),
2074  static_cast<unsigned int>(result.stride()),
2075  static_cast<unsigned int>(mat.size1()),
2076  static_cast<unsigned int>(mat.internal_size1()),
2077  static_cast<unsigned int>(mat.ell_nnz()),
2078  static_cast<unsigned int>(mat.internal_ellnnz())
2079  );
2080  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
2081 }
2082 
2083 
2084 
2085 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2086 __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
2087  const NumericT * ell_elements,
2088  const unsigned int * csr_rows,
2089  const unsigned int * csr_cols,
2090  const NumericT * csr_elements,
2091  unsigned int row_num,
2092  unsigned int internal_row_num,
2093  unsigned int items_per_row,
2094  unsigned int aligned_items_per_row,
2095  const NumericT * d_mat,
2096  unsigned int d_mat_row_start,
2097  unsigned int d_mat_col_start,
2098  unsigned int d_mat_row_inc,
2099  unsigned int d_mat_col_inc,
2100  unsigned int d_mat_row_size,
2101  unsigned int d_mat_col_size,
2102  unsigned int d_mat_internal_rows,
2103  unsigned int d_mat_internal_cols,
2104  NumericT * result,
2105  unsigned int result_row_start,
2106  unsigned int result_col_start,
2107  unsigned int result_row_inc,
2108  unsigned int result_col_inc,
2109  unsigned int result_row_size,
2110  unsigned int result_col_size,
2111  unsigned int result_internal_rows,
2112  unsigned int result_internal_cols)
2113 {
2114  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2115  unsigned int glb_sz = gridDim.x * blockDim.x;
2116 
2117  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2118  {
2119  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2120  {
2121  NumericT sum = 0;
2122 
2123  unsigned int offset = row_id;
2124  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2125  {
2126  NumericT val = ell_elements[offset];
2127 
2128  if (val != NumericT(0))
2129  {
2130  sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
2131  d_mat_row_start, d_mat_row_inc,
2132  d_mat_col_start, d_mat_col_inc,
2133  d_mat_internal_rows, d_mat_internal_cols)] * val;
2134  }
2135  }
2136 
2137  unsigned int col_begin = csr_rows[row_id];
2138  unsigned int col_end = csr_rows[row_id + 1];
2139 
2140  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2141  {
2142  sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
2143  d_mat_row_start, d_mat_row_inc,
2144  d_mat_col_start, d_mat_col_inc,
2145  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2146  }
2147 
2148  result[ResultIndexT::apply(row_id, result_col,
2149  result_row_start, result_row_inc,
2150  result_col_start, result_col_inc,
2151  result_internal_rows, result_internal_cols)] = sum;
2152  }
2153  }
2154 }
2155 
2156 
2157 
2158 /** @brief Carries out Sparse Matrix(HYB)-Dense Matrix multiplication
2159 *
2160 * Implementation of the convenience expression result = prod(mat, d_mat);
2161 *
2162 * @param mat    The sparse matrix (HYB format)
2163 * @param d_mat  The dense matrix
2164 * @param result The result dense matrix
2165 */
2166 template<typename NumericT, unsigned int AlignmentV>
2167 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2168  const viennacl::matrix_base<NumericT> & d_mat,
2169  viennacl::matrix_base<NumericT> & result)
2170 {
2171  if (d_mat.row_major() && result.row_major())
2172  {
2173  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2174  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2175  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2176  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2177  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2178  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2179  static_cast<unsigned int>(mat.size1()),
2180  static_cast<unsigned int>(mat.internal_size1()),
2181  static_cast<unsigned int>(mat.ell_nnz()),
2182  static_cast<unsigned int>(mat.internal_ellnnz()),
2183 
2184  detail::cuda_arg<NumericT>(d_mat),
2185  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2186  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2187  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2188  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2189 
2190  detail::cuda_arg<NumericT>(result),
2191  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2192  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2193  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2194  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2195  );
2196  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2197  }
2198  else if (d_mat.row_major() && !result.row_major())
2199  {
2200  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2201  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2202  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2203  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2204  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2205  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2206  static_cast<unsigned int>(mat.size1()),
2207  static_cast<unsigned int>(mat.internal_size1()),
2208  static_cast<unsigned int>(mat.ell_nnz()),
2209  static_cast<unsigned int>(mat.internal_ellnnz()),
2210 
2211  detail::cuda_arg<NumericT>(d_mat),
2212  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2213  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2214  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2215  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2216 
2217  detail::cuda_arg<NumericT>(result),
2218  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2219  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2220  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2221  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2222  );
2223  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2224  }
2225  else if (!d_mat.row_major() && result.row_major())
2226  {
2227  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2228  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2229  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2230  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2231  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2232  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2233  static_cast<unsigned int>(mat.size1()),
2234  static_cast<unsigned int>(mat.internal_size1()),
2235  static_cast<unsigned int>(mat.ell_nnz()),
2236  static_cast<unsigned int>(mat.internal_ellnnz()),
2237 
2238  detail::cuda_arg<NumericT>(d_mat),
2239  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2240  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2241  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2242  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2243 
2244  detail::cuda_arg<NumericT>(result),
2245  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2246  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2247  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2248  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2249  );
2250  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2251  }
2252  else
2253  {
2254  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2255  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2256  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2257  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2258  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2259  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2260  static_cast<unsigned int>(mat.size1()),
2261  static_cast<unsigned int>(mat.internal_size1()),
2262  static_cast<unsigned int>(mat.ell_nnz()),
2263  static_cast<unsigned int>(mat.internal_ellnnz()),
2264 
2265  detail::cuda_arg<NumericT>(d_mat),
2266  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2267  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2268  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2269  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2270 
2271  detail::cuda_arg<NumericT>(result),
2272  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2273  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2274  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2275  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2276  );
2277  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2278  }
2279 }
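// Usage sketch (illustrative; operand sizes and fill-in are assumptions):
//
//   viennacl::hyb_matrix<float> A;            // populated via viennacl::copy()
//   viennacl::matrix<float> B(A.size2(), 64);
//   viennacl::matrix<float> C(A.size1(), 64);
//   C = viennacl::linalg::prod(A, B);         // reaches the routine above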
2280 
2281 
2282 
2283 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2284 __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
2285  const NumericT * ell_elements,
2286  const unsigned int * csr_rows,
2287  const unsigned int * csr_cols,
2288  const NumericT * csr_elements,
2289  unsigned int row_num,
2290  unsigned int internal_row_num,
2291  unsigned int items_per_row,
2292  unsigned int aligned_items_per_row,
2293  const NumericT * d_mat,
2294  unsigned int d_mat_row_start,
2295  unsigned int d_mat_col_start,
2296  unsigned int d_mat_row_inc,
2297  unsigned int d_mat_col_inc,
2298  unsigned int d_mat_row_size,
2299  unsigned int d_mat_col_size,
2300  unsigned int d_mat_internal_rows,
2301  unsigned int d_mat_internal_cols,
2302  NumericT * result,
2303  unsigned int result_row_start,
2304  unsigned int result_col_start,
2305  unsigned int result_row_inc,
2306  unsigned int result_col_inc,
2307  unsigned int result_row_size,
2308  unsigned int result_col_size,
2309  unsigned int result_internal_rows,
2310  unsigned int result_internal_cols)
2311 {
2312  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2313  unsigned int glb_sz = gridDim.x * blockDim.x;
2314 
2315  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2316  {
2317  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2318  {
2319  NumericT sum = 0;
2320 
2321  unsigned int offset = row_id;
2322  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2323  {
2324  NumericT val = ell_elements[offset];
2325 
2326  if (val != NumericT(0))
2327  {
2328  sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
2329  d_mat_row_start, d_mat_row_inc,
2330  d_mat_col_start, d_mat_col_inc,
2331  d_mat_internal_rows, d_mat_internal_cols)] * val;
2332  }
2333  }
2334 
2335  unsigned int col_begin = csr_rows[row_id];
2336  unsigned int col_end = csr_rows[row_id + 1];
2337 
2338  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2339  {
2340  sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
2341  d_mat_row_start, d_mat_row_inc,
2342  d_mat_col_start, d_mat_col_inc,
2343  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2344  }
2345 
2346  result[ResultIndexT::apply(row_id, result_col,
2347  result_row_start, result_row_inc,
2348  result_col_start, result_col_inc,
2349  result_internal_rows, result_internal_cols)] = sum;
2350  }
2351  }
2352 }
2353 
2354 
2355 
2356 /** @brief Carries out Sparse Matrix(HYB)-Dense Transposed Matrix multiplication
2357 *
2358 * Implementation of the convenience expression result = prod(mat, trans(d_mat));
2359 *
2360 * @param mat    The sparse matrix (HYB format)
2361 * @param d_mat  The transposed dense matrix
2362 * @param result The result dense matrix
2363 */
2364 template<typename NumericT, unsigned int AlignmentV>
2365 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2366  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
2367  const viennacl::matrix_base<NumericT>,
2368  viennacl::op_trans > & d_mat,
2369  viennacl::matrix_base<NumericT> & result)
2370 {
2371  if (d_mat.lhs().row_major() && result.row_major())
2372  {
2373  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2374  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2375  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2376  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2377  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2378  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2379  static_cast<unsigned int>(mat.size1()),
2380  static_cast<unsigned int>(mat.internal_size1()),
2381  static_cast<unsigned int>(mat.ell_nnz()),
2382  static_cast<unsigned int>(mat.internal_ellnnz()),
2383 
2384  detail::cuda_arg<NumericT>(d_mat.lhs()),
2385  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2386  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2387  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2388  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2389 
2390  detail::cuda_arg<NumericT>(result),
2391  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2392  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2393  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2394  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2395  );
2396  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2397  }
2398  else if (d_mat.lhs().row_major() && !result.row_major())
2399  {
2400  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2401  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2402  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2403  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2404  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2405  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2406  static_cast<unsigned int>(mat.size1()),
2407  static_cast<unsigned int>(mat.internal_size1()),
2408  static_cast<unsigned int>(mat.ell_nnz()),
2409  static_cast<unsigned int>(mat.internal_ellnnz()),
2410 
2411  detail::cuda_arg<NumericT>(d_mat.lhs()),
2412  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2413  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2414  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2415  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2416 
2417  detail::cuda_arg<NumericT>(result),
2418  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2419  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2420  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2421  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2422  );
2423  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2424  }
2425  else if (!d_mat.lhs().row_major() && result.row_major())
2426  {
2427  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2428  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2429  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2430  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2431  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2432  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2433  static_cast<unsigned int>(mat.size1()),
2434  static_cast<unsigned int>(mat.internal_size1()),
2435  static_cast<unsigned int>(mat.ell_nnz()),
2436  static_cast<unsigned int>(mat.internal_ellnnz()),
2437 
2438  detail::cuda_arg<NumericT>(d_mat.lhs()),
2439  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2440  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2441  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2442  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2443 
2444  detail::cuda_arg<NumericT>(result),
2445  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2446  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2447  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2448  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2449  );
2450  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2451  }
2452  else
2453  {
2454  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2455  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2456  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2457  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2458  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2459  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2460  static_cast<unsigned int>(mat.size1()),
2461  static_cast<unsigned int>(mat.internal_size1()),
2462  static_cast<unsigned int>(mat.ell_nnz()),
2463  static_cast<unsigned int>(mat.internal_ellnnz()),
2464 
2465  detail::cuda_arg<NumericT>(d_mat.lhs()),
2466  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2467  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2468  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2469  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2470 
2471  detail::cuda_arg<NumericT>(result),
2472  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2473  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2474  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2475  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2476  );
2477  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2478  }
2479 }
2480 
2481 
2482 } // namespace cuda
2483 } //namespace linalg
2484 } //namespace viennacl
2485 
2486 
2487 #endif
vcl_size_t internal_ellnnz() const
Definition: hyb_matrix.hpp:101
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:405
Simple enable-if variant that uses the SFINAE pattern.
Definition: enable_if.hpp:30
vcl_size_t size1() const
Definition: ell_matrix.hpp:91
handle_type & handle2()
Definition: ell_matrix.hpp:103
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
const handle_type & handle3() const
Definition: hyb_matrix.hpp:107
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Various little tools used here and there in ViennaCL.
const handle_type & handle() const
Definition: hyb_matrix.hpp:105
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
A tag class representing a lower triangular matrix.
Definition: forwards.h:809
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t internal_size1() const
Definition: hyb_matrix.hpp:95
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:340
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
This file provides the forward declarations for the main types used within ViennaCL.
vcl_size_t size2() const
Definition: ell_matrix.hpp:92
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
const handle_type & handle4() const
Definition: hyb_matrix.hpp:108
vcl_size_t rows_per_block() const
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
statement sum(scalar< NumericT > const *s, vector_base< NumericT > const *x)
Definition: preset.hpp:246
vcl_size_t size1() const
Returns the size of the result vector.
Definition: matrix.hpp:72
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Helper struct for accessing an element of a row- or column-major matrix.
vcl_size_t internal_size1() const
Definition: ell_matrix.hpp:88
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const
Returns the stride within the buffer (in multiples of sizeof(NumericT))
Definition: vector_def.hpp:124
const handle_type & handle2() const
Definition: hyb_matrix.hpp:106
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
A tag class representing an upper triangular matrix.
Definition: forwards.h:814
void inplace_solve(const matrix_base< NumericT > &A, bool trans_A, matrix_base< NumericT > &B, bool trans_B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation).
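A minimal usage sketch for this multiple-right-hand-side overload; the triangular-structure tag is supplied by the caller, and A is assumed upper triangular here:

#include "viennacl/matrix.hpp"
#include "viennacl/linalg/direct_solve.hpp"

void triangular_solve_example(viennacl::matrix<float> & A,   // n x n, upper triangular
                              viennacl::matrix<float> & B)   // n x k right-hand sides
{
  // Overwrites B with the solution X of A * X = B:
  viennacl::linalg::inplace_solve(A, B, viennacl::linalg::upper_tag());
}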
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
Sparse matrix class using the sliced ELLPACK format with parameters C, σ.
Definition: forwards.h:402
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const
Returns the OpenCL handle to the row index array.
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
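The essence of this kernel is the classic one-thread-per-row CSR product in a grid-stride loop. A stripped-down sketch (kernel name is illustrative; the real signature additionally honors the start_x/inc_x and start_result/inc_result strides):

template<typename NumericT>
__global__ void csr_spmv_sketch(const unsigned int * row_indices,
                                const unsigned int * column_indices,
                                const NumericT * elements,
                                const NumericT * x,
                                NumericT * result,
                                unsigned int size_result)
{
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
       row < size_result;
       row += gridDim.x * blockDim.x)
  {
    NumericT dot = 0;
    // Row 'row' owns the nonzeros in [row_indices[row], row_indices[row+1]):
    for (unsigned int i = row_indices[row]; i < row_indices[row + 1]; ++i)
      dot += elements[i] * x[column_indices[i]];
    result[row] = dot;
  }
}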
A sparse square matrix in compressed sparse rows format, optimized for the case that only a few rows carry nonzero entries.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
std::size_t vcl_size_t
Definition: forwards.h:74
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
Definition: matrix.hpp:838
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
vcl_size_t maxnnz() const
Definition: ell_matrix.hpp:95
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:853
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
Implementations of direct triangular solvers for sparse matrices using CUDA.
handle_type & handle()
Definition: ell_matrix.hpp:100
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
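A hedged sketch of the corresponding traversal (kernel name is illustrative), assuming the interleaved padding layout sketched earlier: entry k of row r sits at index k * internal_row_num + r, and padded slots hold zeros.

template<typename NumericT>
__global__ void ell_spmv_sketch(const unsigned int * coords,
                                const NumericT * elements,
                                const NumericT * x,
                                NumericT * result,
                                unsigned int row_num,
                                unsigned int internal_row_num,
                                unsigned int items_per_row)
{
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
       row < row_num;
       row += gridDim.x * blockDim.x)
  {
    NumericT sum = 0;
    for (unsigned int item = 0; item < items_per_row; ++item)
    {
      NumericT val = elements[item * internal_row_num + row];
      if (val != NumericT(0))  // skip zero padding entries
        sum += val * x[coords[item * internal_row_num + row]];
    }
    result[row] = sum;
  }
}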
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result)
void clear()
Resets all entries to zero. Does not change the size of the vector.
Definition: vector.hpp:861
Common routines for CUDA execution.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations such as norms and inner products are located in linalg/vector_operations.hpp.
bool row_major() const
Definition: matrix_def.hpp:239
NumericT max(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:47
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
vcl_size_t ell_nnz() const
Definition: hyb_matrix.hpp:102
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:819
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
Definition: mem_handle.hpp:89
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
A tag class representing transposed matrices.
Definition: forwards.h:219
A sparse square matrix in compressed sparse rows format.
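The format behind this class is plain CSR, matching the row_indices/column_indices/elements triples taken by the kernels above; a hypothetical 3x3 example:

#include <vector>

int main()
{
  // Hypothetical matrix:  [10  0 20]
  //                       [ 0 30  0]
  //                       [40  0 50]
  // Row i owns the half-open range [row_indices[i], row_indices[i+1]):
  std::vector<unsigned int> row_indices    = { 0, 2, 3, 5 };
  std::vector<unsigned int> column_indices = { 0, 2,  1,  0, 2 };
  std::vector<float>        elements       = { 10.f, 20.f, 30.f, 40.f, 50.f };
  (void)row_indices; (void)column_indices; (void)elements;
  return 0;
}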
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:27
A tag for column-major storage of a dense matrix.
Definition: forwards.h:320
const handle_type & handle5() const
Definition: hyb_matrix.hpp:109
vcl_size_t size1() const
Definition: hyb_matrix.hpp:98
LHS & lhs() const
Get left hand side operand.
Definition: matrix.hpp:66
size_type start() const
Returns the offset within the buffer.
Definition: vector_def.hpp:122
vcl_size_t internal_maxnnz() const
Definition: ell_matrix.hpp:94
Implementation of the ViennaCL scalar class.
const handle_type & handle() const
Returns the memory handle.
Definition: vector_def.hpp:128
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:824
A sparse square matrix, where entries are stored as triplets (i, j, val), where i and j are the row and column indices, respectively.
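For contrast with the CSR example above, a hedged triplet view of the same hypothetical matrix; ViennaCL's coordinate_matrix additionally partitions triplets into work groups, cf. the group_boundaries arguments of the coordinate kernels on this page. The Triplet type below is illustrative, not part of ViennaCL.

#include <vector>

struct Triplet { unsigned int i, j; float val; };

int main()
{
  // Same hypothetical matrix as in the CSR example, as (i, j, val) triplets sorted by row:
  std::vector<Triplet> coo = { {0,0,10.f}, {0,2,20.f}, {1,1,30.f}, {2,0,40.f}, {2,2,50.f} };
  (void)coo;
  return 0;
}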
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)