ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
sparse_matrix_operations.hpp
#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
    @brief Implementations of operations on sparse matrices using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
namespace viennacl
{
namespace linalg
{
namespace cuda
{
//
// Compressed matrix
//

namespace detail
{

  template<typename NumericT>
  __global__ void csr_row_info_extractor_kernel(const unsigned int * row_indices,
                                                const unsigned int * column_indices,
                                                const NumericT * elements,
                                                NumericT * result,
                                                unsigned int size,
                                                unsigned int option)
  {
    for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                      row  < size;
                      row += gridDim.x * blockDim.x)
    {
      NumericT value = 0;
      unsigned int row_end = row_indices[row+1];

      switch (option)
      {
        case 0: //inf-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value = max(value, fabs(elements[i]));
          break;

        case 1: //1-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value += fabs(elements[i]);
          break;

        case 2: //2-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value += elements[i] * elements[i];
          value = sqrt(value);
          break;

        case 3: //diagonal entry
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
          {
            if (column_indices[i] == row)
            {
              value = elements[i];
              break;
            }
          }
          break;

        default:
          break;
      }
      result[row] = value;
    }
  }


  template<typename NumericT, unsigned int AlignmentV>
  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(mat.size1()),
                                                static_cast<unsigned int>(info_selector)
                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
  }

} //namespace detail


template<typename NumericT>
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int * row_indices,
                                                 const unsigned int * column_indices,
                                                 const NumericT * elements,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result,
                                                 unsigned int size_result)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size_result;
                    row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
    result[row * inc_result + start_result] = dot_prod;
  }
}
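
// Note: the kernel above is the plain one-thread-per-row CSR SpMV kernel, kept
// alongside the adaptive kernel below. In scalar form, each thread computes
// for its row r:
//
//   for (i = row_indices[r]; i < row_indices[r+1]; ++i)
//     result[r] += elements[i] * x[column_indices[i]];
//
// start_x/inc_x and start_result/inc_result merely add the usual ViennaCL
// vector offset and stride handling on top of this.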



template<typename NumericT>
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int * row_indices,
                                                          const unsigned int * column_indices,
                                                          const unsigned int * row_blocks,
                                                          const NumericT * elements,
                                                          unsigned int num_blocks,
                                                          const NumericT * x,
                                                          unsigned int start_x,
                                                          unsigned int inc_x,
                                                          NumericT * result,
                                                          unsigned int start_result,
                                                          unsigned int inc_result,
                                                          unsigned int size_result)
{
  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start       = row_blocks[block_id];
    unsigned int row_stop        = row_blocks[block_id + 1];
    unsigned int element_start   = row_indices[row_start];
    unsigned int element_stop    = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream with one thread per row
    {
      // load to shared buffer:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];

      __syncthreads();

      // use one thread per row to sum:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        result[row * inc_result + start_result] = dot_prod;
      }
    }
    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
    else  // CSR vector for a single row
    {
      // load and sum to shared buffer:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];

      // reduction to obtain final result
      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
      }

      if (threadIdx.x == 0)
        result[row_start * inc_result + start_result] = shared_elements[0];
    }

    __syncthreads();  // avoid race conditions
  }
}
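
// The row_blocks array partitions the rows such that each block covers at most
// 1024 nonzeros, matching the shared_elements buffer above: a block spanning
// several rows takes the CSR-stream branch (one thread per row), while a block
// holding a single heavy row takes the CSR-vector branch, in which all threads
// of the CUDA block cooperate on that row followed by a tree reduction.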



/** @brief Carries out matrix-vector multiplication with a compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<class NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  compressed_matrix_vec_mul_adaptive_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                          detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                          static_cast<unsigned int>(mat.blocks1()),
                                                          detail::cuda_arg<NumericT>(vec),
                                                          static_cast<unsigned int>(vec.start()),
                                                          static_cast<unsigned int>(vec.stride()),
                                                          detail::cuda_arg<NumericT>(result),
                                                          static_cast<unsigned int>(result.start()),
                                                          static_cast<unsigned int>(result.stride()),
                                                          static_cast<unsigned int>(result.size())
                                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
}
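
// Usage sketch (not part of this header): user code does not call prod_impl()
// directly; it is reached through the viennacl::linalg::prod() convenience
// wrapper, e.g.
//
//   viennacl::compressed_matrix<float> A(m, n);   // assembled elsewhere
//   viennacl::vector<float> x(n), y(m);
//   y = viennacl::linalg::prod(A, x);             // dispatches to the kernel above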

/** @brief Helper struct for accessing an element of a row- or column-major matrix.
*
* @param LayoutT   The layout tag: either viennacl::row_major or viennacl::column_major
*/
template<typename LayoutT>
struct mat_mult_matrix_index
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
  }
};

template<>
struct mat_mult_matrix_index<viennacl::column_major>
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
  }
};
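
// For an unstrided matrix (start = 0, inc = 1) these mappings reduce to the
// familiar linearizations: row-major apply(i, j, ...) == i * internal_cols + j,
// and column-major apply(i, j, ...) == i + j * internal_rows.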
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_row_indices,
                                                   const unsigned int * sp_mat_col_indices,
                                                   const NumericT * sp_mat_elements,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = r;
    }
  }
}


/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a compressed_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat  The sparse matrix (CSR format)
* @param d_mat   The dense matrix
* @param result  The dense result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),          static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),           static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),          static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),           static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),          static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),           static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),          static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),           static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
}
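
// Usage sketch: as with the matrix-vector case, this overload is reached via
// the viennacl::linalg::prod() convenience wrapper, e.g.
//
//   viennacl::compressed_matrix<double> A(m, k);
//   viennacl::matrix<double> B(k, n), C(m, n);
//   C = viennacl::linalg::prod(A, B);   // sparse times dense
//
// The four branches above only select the correct index mapping for the
// row-major/column-major combinations of B and C.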


template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_row_indices,
                                                      const unsigned int * sp_mat_col_indices,
                                                      const NumericT * sp_mat_elements,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,
                                                      unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,
                                                      unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,
                                                      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows,
                                                      unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,
                                                      unsigned int result_col_start,
                                                      unsigned int result_row_inc,
                                                      unsigned int result_col_inc,
                                                      unsigned int result_row_size,
                                                      unsigned int result_col_size,
                                                      unsigned int result_internal_rows,
                                                      unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }

}


/** @brief Carries out matrix-matrix multiplication with the first matrix sparse (CSR) and the second dense and transposed
*
* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
*
* @param sp_mat  The sparse matrix (CSR format)
* @param d_mat   The transposed dense matrix proxy
* @param result  The dense result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{

  if (d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),           static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),           static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),           static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),           static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
}
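
// Usage sketch: the transposed variant is selected automatically when the
// dense factor is wrapped in viennacl::trans(), e.g.
//
//   C = viennacl::linalg::prod(A, viennacl::trans(B));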


//
// triangular solves for compressed_matrix
//

template<typename NumericT>
__global__ void compressed_matrix_diagonal_kernel(const unsigned int * row_indices,
                                                  const unsigned int * column_indices,
                                                  const NumericT * elements,
                                                  NumericT * result,
                                                  unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT diag = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}


/** @brief Carries out triangular inplace solves
*
* @param mat  The matrix
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                         detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                         detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                         detail::cuda_arg<NumericT>(vec),
                                         static_cast<unsigned int>(mat.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}


/** @brief Carries out triangular inplace solves
*
* @param mat  The matrix
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                    detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                    detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                    detail::cuda_arg<NumericT>(vec),
                                    static_cast<unsigned int>(mat.size1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}



/** @brief Carries out triangular inplace solves
*
* @param mat  The matrix
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}


/** @brief Carries out triangular inplace solves
*
* @param mat  The matrix
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                     detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                     detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                     detail::cuda_arg<NumericT>(vec),
                                     static_cast<unsigned int>(mat.size1())
                                    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}
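
// Usage sketch: the four overloads above are selected by the triangular tag,
// e.g. for a forward substitution with an implicit unit diagonal:
//
//   viennacl::linalg::inplace_solve(A, rhs, viennacl::linalg::unit_lower_tag());
//
// Note that each kernel is launched with a single block (<<<1, 128>>>), since
// the row-by-row substitution is inherently sequential across rows.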



// transposed

/** @brief Carries out triangular inplace solves with a transposed matrix
*
* @param mat  The transposed matrix proxy
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                               detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                               detail::cuda_arg<NumericT>(vec),
                                               static_cast<unsigned int>(mat.lhs().size1())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}


/** @brief Carries out triangular inplace solves with a transposed matrix
*
* @param mat  The transposed matrix proxy
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(diagonal),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(mat.lhs().size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}


/** @brief Carries out triangular inplace solves with a transposed matrix
*
* @param mat  The transposed matrix proxy
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}


/** @brief Carries out triangular inplace solves with a transposed matrix
*
* @param mat  The transposed matrix proxy
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                           detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                           detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                           detail::cuda_arg<NumericT>(diagonal),
                                           detail::cuda_arg<NumericT>(vec),
                                           static_cast<unsigned int>(mat.lhs().size1())
                                          );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}

namespace detail
{
  //
  // block solves
  //
  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_lower_tag)
  {
    csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
                                                         detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
                                                         detail::cuda_arg<NumericT>(L.lhs().handle().cuda_handle()),
                                                         detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                         detail::cuda_arg<NumericT>(vec),
                                                         static_cast<unsigned int>(L.lhs().size1())
                                                        );
  }


  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & U_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::upper_tag)
  {
    csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
                                                     detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(U.lhs().handle().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(U_diagonal.handle().cuda_handle()),
                                                     detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                     detail::cuda_arg<NumericT>(vec),
                                                     static_cast<unsigned int>(U.lhs().size1())
                                                    );
  }


}
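
// Note: one CUDA block is launched per diagonal block (num_blocks in total);
// block_indices delimits the row range each block handles, so the otherwise
// sequential substitutions can proceed independently and in parallel across
// blocks.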


//
// Compressed Compressed Matrix
//

template<typename NumericT>
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int * row_jumper,
                                                            const unsigned int * row_indices,
                                                            const unsigned int * column_indices,
                                                            const NumericT * elements,
                                                            unsigned int nonzero_rows,
                                                            const NumericT * x,
                                                            unsigned int start_x,
                                                            unsigned int inc_x,
                                                            NumericT * result,
                                                            unsigned int start_result,
                                                            unsigned int inc_result,
                                                            unsigned int size_result)
{
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < size_result;
                    i += gridDim.x * blockDim.x)
  {
    result[i * inc_result + start_result] = 0;
  }

  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
    result[row_indices[i] * inc_result + start_result] = dot_prod;
  }
}


/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                            detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            detail::cuda_arg<NumericT>(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            detail::cuda_arg<NumericT>(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size())
                                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}
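
// Note: compressed_compressed_matrix stores only the nonzero rows of a CSR
// matrix (row_jumper/row_indices in the kernel above), which is why the kernel
// first zeros the full result vector and then scatters the nnz1() row results
// back into it via row_indices.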

//
// Coordinate Matrix
//


namespace detail
{

  template<typename NumericT>
  __global__ void coo_row_info_extractor(const unsigned int * coords, //(row_index, column_index)
                                         const NumericT * elements,
                                         const unsigned int * group_boundaries,
                                         NumericT * result,
                                         unsigned int option)
  {
    __shared__ unsigned int shared_rows[128];
    __shared__ NumericT inter_results[128];

    uint2 tmp;
    NumericT val;
    unsigned int last_index  = blockDim.x - 1;
    unsigned int group_start = group_boundaries[blockIdx.x];
    unsigned int group_end   = group_boundaries[blockIdx.x + 1];
    unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

    unsigned int local_index = 0;

    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[last_index])
        {
          switch (option)
          {
            case 0: //inf-norm
            case 3: //diagonal entry
              val = max(val, fabs(inter_results[last_index]));
              break;

            case 1: //1-norm
              val = fabs(val) + inter_results[last_index];
              break;

            case 2: //2-norm
              val = sqrt(val * val + inter_results[last_index]);
              break;

            default:
              break;
          }
        }
        else
        {
          switch (option)
          {
            case 0: //inf-norm
            case 1: //1-norm
            case 3: //diagonal entry
              result[shared_rows[last_index]] = inter_results[last_index];
              break;

            case 2: //2-norm
              result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
            default:
              break;
          }
        }
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      switch (option)
      {
        case 0:
        case 3:
          inter_results[threadIdx.x] = val;
          break;
        case 1:
          inter_results[threadIdx.x] = fabs(val);
          break;
        case 2:
          inter_results[threadIdx.x] = val * val;
        default:
          break;
      }
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        switch (option)
        {
          case 0: //inf-norm
          case 3: //diagonal entry
            inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
            break;

          case 1: //1-norm
            inter_results[threadIdx.x] += left;
            break;

          case 2: //2-norm
            inter_results[threadIdx.x] += left;
            break;

          default:
            break;
        }
        __syncthreads();
      }
      //segmented parallel reduction end

      if (threadIdx.x != last_index &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
          inter_results[threadIdx.x] != 0)
      {
        result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (threadIdx.x == last_index && inter_results[last_index] != 0)
      result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index];
  }

  template<typename NumericT, unsigned int AlignmentV>
  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                        detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                        detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                        detail::cuda_arg<NumericT>(vec),
                                        static_cast<unsigned int>(info_selector)
                                       );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
  }

} //namespace detail
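
// Note: info_selector maps to the option codes handled in the kernel above
// (0: inf-norm, 1: 1-norm, 2: 2-norm, 3: diagonal entry), mirroring the
// compressed_matrix variant of row_info() earlier in this file.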


template<typename NumericT>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                 const NumericT * elements,
                                                 const unsigned int * group_boundaries,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    //segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end)
    result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
}
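
// The kernel above uses a segmented parallel reduction: each thread loads one
// (row, column, value) COO entry per chunk of the block's group, the scan
// combines partial sums that share the same row index, and thread 0 of the
// next chunk picks up the carry when a row straddles a chunk boundary.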


/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  result.clear();

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                detail::cuda_arg<NumericT>(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}
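
// Usage sketch: the calling convention is identical to the compressed_matrix
// case, e.g.
//
//   viennacl::coordinate_matrix<float> A(m, n);
//   viennacl::vector<float> x(n), y(m);
//   y = viennacl::linalg::prod(A, x);   // result is cleared internally first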



template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                   const NumericT * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  }
}


/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat  The sparse matrix (COO format)
* @param d_mat   The dense matrix
* @param result  The dense result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),          static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),           static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),          static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),           static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),          static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),           static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),          static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),           static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }

}

template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,
                                                      unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,
                                                      unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,
                                                      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows,
                                                      unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,
                                                      unsigned int result_col_start,
                                                      unsigned int result_row_inc,
                                                      unsigned int result_col_inc,
                                                      unsigned int result_row_size,
                                                      unsigned int result_col_size,
                                                      unsigned int result_internal_rows,
                                                      unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols)] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ ResultIndexT::apply(tmp.x, result_col,
                                    result_row_start, result_row_inc,
                                    result_col_start, result_col_inc,
                                    result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ ResultIndexT::apply(tmp.x, result_col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
  }
}

/** @brief Carries out matrix-matrix multiplication with the first matrix sparse (COO) and the second dense and transposed
*
* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
*
* @param sp_mat  The sparse matrix (COO format)
* @param d_mat   The transposed dense matrix proxy
* @param result  The dense result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),           static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),           static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),           static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())),  static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1545  }
1546  else
1547  {
1548  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1549  (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
1550  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1551  detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
1552 
1553  detail::cuda_arg<NumericT>(d_mat.lhs()),
1554  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1555  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1556  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1557  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1558 
1559  detail::cuda_arg<NumericT>(result),
1560  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1561  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1562  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1563  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1564  );
1565  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1566  }
1567 }
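The four branches above only select row-/column-major index helpers for the dense operands; host code reaches any of them through the generic viennacl::linalg::prod() interface. A minimal usage sketch (dimensions in the comments are illustrative):

    #include "viennacl/coordinate_matrix.hpp"
    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/prod.hpp"

    // C = A * B^T with A sparse (COO); dispatches to
    // coordinate_matrix_d_tr_mat_mul_kernel on the CUDA backend.
    void coo_times_transposed(viennacl::coordinate_matrix<float> const & A,  // m x k
                              viennacl::matrix<float> const & B,             // n x k
                              viennacl::matrix<float> & C)                   // m x n
    {
      C = viennacl::linalg::prod(A, viennacl::trans(B));
    }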
1568 
1569 
1570 //
1571 // ELL Matrix
1572 //
1573 
1574 template<typename NumericT>
1575 __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
1576  const NumericT * elements,
1577  const NumericT * x,
1578  unsigned int start_x,
1579  unsigned int inc_x,
1580  NumericT * result,
1581  unsigned int start_result,
1582  unsigned int inc_result,
1583  unsigned int row_num,
1584  unsigned int col_num,
1585  unsigned int internal_row_num,
1586  unsigned int items_per_row,
1587  unsigned int aligned_items_per_row
1588  )
1589 {
1590  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1591  unsigned int glb_sz = gridDim.x * blockDim.x;
1592 
1593  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1594  {
1595  NumericT sum = 0;
1596 
1597  unsigned int offset = row_id;
1598  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1599  {
1600  NumericT val = elements[offset];
1601 
1602  if (val != NumericT(0))
1603  {
1604  unsigned int col = coords[offset];
1605  sum += x[col * inc_x + start_x] * val;
1606  }
1607  }
1608 
1609  result[row_id * inc_result + start_result] = sum;
1610  }
1611 }
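The loop above relies on the padded column-major ELL layout: entry k of row r is stored at offset k * internal_row_num + r, with explicit zeros as padding, which is why the offset advances by internal_row_num per item and why zero elements are skipped. A CPU reference under the same layout assumptions (ell_spmv_reference is illustrative, not part of the library):

    #include <vector>

    // SpMV over the padded column-major ELL layout: entry k of row r lives at
    // k * internal_rows + r; zero elements are padding and contribute nothing.
    void ell_spmv_reference(std::vector<unsigned int> const & coords,
                            std::vector<double> const & elements,
                            std::vector<double> const & x,
                            std::vector<double> & y,
                            std::size_t rows,
                            std::size_t internal_rows,
                            std::size_t items_per_row)
    {
      for (std::size_t r = 0; r < rows; ++r)
      {
        double sum = 0;
        for (std::size_t k = 0; k < items_per_row; ++k)
        {
          double val = elements[k * internal_rows + r];
          if (val != 0)
            sum += x[coords[k * internal_rows + r]] * val;
        }
        y[r] = sum;
      }
    }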
1612 
1613 
1614 /** @brief Carries out matrix-vector multiplication with an ell_matrix
1615  *
1616  * Implementation of the convenience expression result = prod(mat, vec);
1617  *
1618  * @param mat    The matrix
1619  * @param vec    The vector
1620  * @param result The result vector
1621  */
1622 template<typename NumericT, unsigned int AlignmentV>
1623 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
1624                const viennacl::vector_base<NumericT> & vec,
1625                viennacl::vector_base<NumericT> & result)
1626 {
1627  ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
1628  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
1629  detail::cuda_arg<NumericT>(vec),
1630  static_cast<unsigned int>(vec.start()),
1631  static_cast<unsigned int>(vec.stride()),
1632  detail::cuda_arg<NumericT>(result),
1633  static_cast<unsigned int>(result.start()),
1634  static_cast<unsigned int>(result.stride()),
1635  static_cast<unsigned int>(mat.size1()),
1636  static_cast<unsigned int>(mat.size2()),
1637  static_cast<unsigned int>(mat.internal_size1()),
1638  static_cast<unsigned int>(mat.maxnnz()),
1639  static_cast<unsigned int>(mat.internal_maxnnz())
1640  );
1641  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
1642 }
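From the host, this overload is reached through the generic product interface; a minimal sketch:

    #include "viennacl/ell_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/prod.hpp"

    // y = A * x with A in ELL format; dispatches to ell_matrix_vec_mul_kernel
    // on the CUDA backend.
    void ell_spmv(viennacl::ell_matrix<float> const & A,
                  viennacl::vector<float> const & x,
                  viennacl::vector<float> & y)
    {
      y = viennacl::linalg::prod(A, x);
    }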
1643 
1644 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
1645 __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
1646  const NumericT * sp_mat_elements,
1647  unsigned int sp_mat_row_num,
1648  unsigned int sp_mat_col_num,
1649  unsigned int sp_mat_internal_row_num,
1650  unsigned int sp_mat_items_per_row,
1651  unsigned int sp_mat_aligned_items_per_row,
1652  const NumericT * d_mat,
1653  unsigned int d_mat_row_start,
1654  unsigned int d_mat_col_start,
1655  unsigned int d_mat_row_inc,
1656  unsigned int d_mat_col_inc,
1657  unsigned int d_mat_row_size,
1658  unsigned int d_mat_col_size,
1659  unsigned int d_mat_internal_rows,
1660  unsigned int d_mat_internal_cols,
1661  NumericT * result,
1662  unsigned int result_row_start,
1663  unsigned int result_col_start,
1664  unsigned int result_row_inc,
1665  unsigned int result_col_inc,
1666  unsigned int result_row_size,
1667  unsigned int result_col_size,
1668  unsigned int result_internal_rows,
1669  unsigned int result_internal_cols)
1670 {
1671  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1672  unsigned int glb_sz = gridDim.x * blockDim.x;
1673 
1674  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
1675  {
1676  unsigned int row = rc % sp_mat_row_num;
1677  unsigned int col = rc / sp_mat_row_num;
1678 
1679  unsigned int offset = row;
1680  NumericT r = (NumericT)0;
1681 
1682  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1683  {
1684  unsigned int j = sp_mat_coords[offset];
1685  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1686 
1687  if (x != (NumericT)0)
1688  {
1689  NumericT y = d_mat[ DMatIndexT::apply(j, col,
1690  d_mat_row_start, d_mat_row_inc,
1691  d_mat_col_start, d_mat_col_inc,
1692  d_mat_internal_rows, d_mat_internal_cols) ];
1693 
1694  r += x*y;
1695  }
1696  }
1697  result [ ResultIndexT::apply(row, col,
1698  result_row_start, result_row_inc,
1699  result_col_start, result_col_inc,
1700  result_internal_rows, result_internal_cols) ] = r;
1701  }
1702 
1703 }
1704 
1705 /** @brief Carries out Sparse Matrix(ELL)-Dense Matrix multiplication
1706  *
1707  * Implementation of the convenience expression result = prod(sp_mat, d_mat);
1708  * sp_mat being in ELL format
1709  *
1710  * @param sp_mat The sparse matrix (ELL)
1711  * @param d_mat  The dense matrix
1712  * @param result The result matrix
1713  */
1714 template<typename NumericT, unsigned int AlignmentV>
1715 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1716                const viennacl::matrix_base<NumericT> & d_mat,
1717                viennacl::matrix_base<NumericT> & result)
1718 {
1719  if (d_mat.row_major() && result.row_major())
1720  {
1721  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1722  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1723  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1724  static_cast<unsigned int>(sp_mat.size1()),
1725  static_cast<unsigned int>(sp_mat.size2()),
1726  static_cast<unsigned int>(sp_mat.internal_size1()),
1727  static_cast<unsigned int>(sp_mat.maxnnz()),
1728  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1729  detail::cuda_arg<NumericT>(d_mat),
1730  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1731  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1732  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1733  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1734 
1735  detail::cuda_arg<NumericT>(result),
1736  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1737  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1738  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1739  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1740  );
1741  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1742  }
1743  else if (d_mat.row_major() && !result.row_major())
1744  {
1745  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1746  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1747  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1748  static_cast<unsigned int>(sp_mat.size1()),
1749  static_cast<unsigned int>(sp_mat.size2()),
1750  static_cast<unsigned int>(sp_mat.internal_size1()),
1751  static_cast<unsigned int>(sp_mat.maxnnz()),
1752  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1753  detail::cuda_arg<NumericT>(d_mat),
1754  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1755  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1756  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1757  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1758 
1759  detail::cuda_arg<NumericT>(result),
1760  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1761  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1762  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1763  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1764  );
1765  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1766  }
1767  else if (!d_mat.row_major() && result.row_major())
1768  {
1769  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1770  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1771  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1772  static_cast<unsigned int>(sp_mat.size1()),
1773  static_cast<unsigned int>(sp_mat.size2()),
1774  static_cast<unsigned int>(sp_mat.internal_size1()),
1775  static_cast<unsigned int>(sp_mat.maxnnz()),
1776  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1777  detail::cuda_arg<NumericT>(d_mat),
1778  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1779  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1780  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1781  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1782 
1783  detail::cuda_arg<NumericT>(result),
1784  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1785  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1786  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1787  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1788  );
1789  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1790  }
1791  else
1792  {
1793  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1794  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1795  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1796  static_cast<unsigned int>(sp_mat.size1()),
1797  static_cast<unsigned int>(sp_mat.size2()),
1798  static_cast<unsigned int>(sp_mat.internal_size1()),
1799  static_cast<unsigned int>(sp_mat.maxnnz()),
1800  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1801  detail::cuda_arg<NumericT>(d_mat),
1802  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1803  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1804  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1805  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1806 
1807  detail::cuda_arg<NumericT>(result),
1808  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1809  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1810  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1811  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1812  );
1813  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1814  }
1815 }
1816 
1817 template<typename DMatIndexT, typename ResultIndexT, typename NumericT >
1818 __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
1819  const NumericT * sp_mat_elements,
1820  unsigned int sp_mat_row_num,
1821  unsigned int sp_mat_col_num,
1822  unsigned int sp_mat_internal_row_num,
1823  unsigned int sp_mat_items_per_row,
1824  unsigned int sp_mat_aligned_items_per_row,
1825  const NumericT * d_mat,
1826  unsigned int d_mat_row_start,
1827  unsigned int d_mat_col_start,
1828  unsigned int d_mat_row_inc,
1829  unsigned int d_mat_col_inc,
1830  unsigned int d_mat_row_size,
1831  unsigned int d_mat_col_size,
1832  unsigned int d_mat_internal_rows,
1833  unsigned int d_mat_internal_cols,
1834  NumericT * result,
1835  unsigned int result_row_start,
1836  unsigned int result_col_start,
1837  unsigned int result_row_inc,
1838  unsigned int result_col_inc,
1839  unsigned int result_row_size,
1840  unsigned int result_col_size,
1841  unsigned int result_internal_rows,
1842  unsigned int result_internal_cols)
1843 {
1844  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1845  unsigned int glb_sz = gridDim.x * blockDim.x;
1846 
1847  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
1848  {
1849  unsigned int row = rc % sp_mat_row_num;
1850  unsigned int col = rc / sp_mat_row_num;
1851 
1852  unsigned int offset = row;
1853  NumericT r = (NumericT)0;
1854 
1855  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1856  {
1857  unsigned int j = sp_mat_coords[offset];
1858  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1859 
1860  if (x != (NumericT)0)
1861  {
1862  NumericT y = d_mat[ DMatIndexT::apply(col, j,
1863  d_mat_row_start, d_mat_row_inc,
1864  d_mat_col_start, d_mat_col_inc,
1865  d_mat_internal_rows, d_mat_internal_cols) ];
1866 
1867  r += x*y;
1868  }
1869  }
1870  result [ ResultIndexT::apply(row, col,
1871  result_row_start, result_row_inc,
1872  result_col_start, result_col_inc,
1873  result_internal_rows, result_internal_cols) ] = r;
1874  }
1875 
1876 }
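The only difference from ell_matrix_d_mat_mul_kernel is the swapped index pair passed to DMatIndexT::apply, which follows directly from

\[ C_{rc} \;=\; \sum_k A_{rk}\,(B^{\mathsf{T}})_{kc} \;=\; \sum_k A_{rk}\,B_{ck}, \]

so the transposed dense factor is read at (col, j) instead of (j, col).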
1877 
1878 /** @brief Carries out Sparse Matrix(ELL)-Dense Transposed Matrix multiplication
1879  *
1880  * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
1881  * sp_mat being in ELL format
1882  *
1883  * @param sp_mat The sparse matrix (ELL)
1884  * @param d_mat  The transposed dense matrix
1885  * @param result The result matrix
1886  */
1887 template<typename NumericT, unsigned int AlignmentV>
1888 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1889                const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
1890                                                   const viennacl::matrix_base<NumericT>,
1891                                                   viennacl::op_trans > & d_mat,
1892                viennacl::matrix_base<NumericT> & result)
1893 {
1894  if (d_mat.lhs().row_major() && result.row_major())
1895  {
1896  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1897  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1898  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1899  static_cast<unsigned int>(sp_mat.size1()),
1900  static_cast<unsigned int>(sp_mat.size2()),
1901  static_cast<unsigned int>(sp_mat.internal_size1()),
1902  static_cast<unsigned int>(sp_mat.maxnnz()),
1903  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1904 
1905  detail::cuda_arg<NumericT>(d_mat.lhs()),
1906  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1907  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1908  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1909  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1910 
1911  detail::cuda_arg<NumericT>(result),
1912  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1913  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1914  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1915  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1916  );
1917  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1918  }
1919  else if (d_mat.lhs().row_major() && !result.row_major())
1920  {
1921  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1922  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1923  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1924  static_cast<unsigned int>(sp_mat.size1()),
1925  static_cast<unsigned int>(sp_mat.size2()),
1926  static_cast<unsigned int>(sp_mat.internal_size1()),
1927  static_cast<unsigned int>(sp_mat.maxnnz()),
1928  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1929 
1930  detail::cuda_arg<NumericT>(d_mat.lhs()),
1931  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1932  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1933  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1934  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1935 
1936  detail::cuda_arg<NumericT>(result),
1937  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1938  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1939  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1940  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1941  );
1942  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1943  }
1944  else if (!d_mat.lhs().row_major() && result.row_major())
1945  {
1946  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1947  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1948  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1949  static_cast<unsigned int>(sp_mat.size1()),
1950  static_cast<unsigned int>(sp_mat.size2()),
1951  static_cast<unsigned int>(sp_mat.internal_size1()),
1952  static_cast<unsigned int>(sp_mat.maxnnz()),
1953  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1954 
1955  detail::cuda_arg<NumericT>(d_mat.lhs()),
1956  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1957  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1958  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1959  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1960 
1961  detail::cuda_arg<NumericT>(result),
1962  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1963  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1964  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1965  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1966  );
1967  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1968  }
1969  else
1970  {
1971  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1972  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1973  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1974  static_cast<unsigned int>(sp_mat.size1()),
1975  static_cast<unsigned int>(sp_mat.size2()),
1976  static_cast<unsigned int>(sp_mat.internal_size1()),
1977  static_cast<unsigned int>(sp_mat.maxnnz()),
1978  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1979 
1980  detail::cuda_arg<NumericT>(d_mat.lhs()),
1981  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1982  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1983  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1984  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1985 
1986  detail::cuda_arg<NumericT>(result),
1987  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1988  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1989  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1990  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1991  );
1992  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1993  }
1994 }
1995 
1996 //
1997 // SELL-C-\sigma Matrix
1998 //
1999 
2000 template<typename NumericT>
2001 __global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int * columns_per_block,
2002  const unsigned int * column_indices,
2003  const unsigned int * block_start,
2004  const NumericT * elements,
2005  const NumericT * x,
2006  unsigned int start_x,
2007  unsigned int inc_x,
2008  unsigned int size_x,
2009  NumericT * result,
2010  unsigned int start_result,
2011  unsigned int inc_result,
2012  unsigned int size_result)
2013 {
2014  unsigned int local_id = threadIdx.x;
2015  unsigned int local_size = blockDim.x;
2016  unsigned int num_rows = size_result;
2017 
2018  for (unsigned int block_idx = blockIdx.x; block_idx <= num_rows / local_size; block_idx += gridDim.x)
2019  {
2020  unsigned int row = block_idx * local_size + local_id;
2021  unsigned int offset = block_start[block_idx];
2022  unsigned int num_columns = columns_per_block[block_idx];
2023 
2024  NumericT sum = 0;
2025  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
2026  {
2027  unsigned int index = offset + item_id * local_size + local_id;
2028  NumericT val = elements[index];
2029 
2030  sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
2031  }
2032 
2033  if (row < num_rows)
2034  result[row * inc_result + start_result] = sum;
2035  }
2036 }
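The indexing above reflects the sliced-ELL (SELL-C-\sigma) layout: rows are grouped into blocks of C = blockDim.x rows; block b stores columns_per_block[b] columns contiguously starting at block_start[b], interleaved across the C rows of the block. A CPU reference under these layout assumptions (sliced_ell_spmv_reference is illustrative):

    #include <vector>

    // SpMV over the sliced-ELL layout: entry `item` of row `row` lives at
    // block_start[row / C] + item * C + (row % C); zeros are padding.
    void sliced_ell_spmv_reference(std::vector<unsigned int> const & columns_per_block,
                                   std::vector<unsigned int> const & column_indices,
                                   std::vector<unsigned int> const & block_start,
                                   std::vector<double> const & elements,
                                   std::vector<double> const & x,
                                   std::vector<double> & y,
                                   std::size_t C)
    {
      for (std::size_t row = 0; row < y.size(); ++row)
      {
        std::size_t block = row / C, local = row % C;
        double sum = 0;
        for (std::size_t item = 0; item < columns_per_block[block]; ++item)
        {
          std::size_t idx = block_start[block] + item * C + local;
          if (elements[idx] != 0)
            sum += x[column_indices[idx]] * elements[idx];
        }
        y[row] = sum;
      }
    }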
2037 
2038 /** @brief Carries out matrix-vector multiplication with a sliced_ell_matrix
2039  *
2040  * Implementation of the convenience expression result = prod(mat, vec);
2041  *
2042  * @param mat    The matrix
2043  * @param vec    The vector
2044  * @param result The result vector
2045  */
2046 template<typename NumericT, typename IndexT>
2047 void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
2048                const viennacl::vector_base<NumericT> & vec,
2049                viennacl::vector_base<NumericT> & result)
2050 {
2051  sliced_ell_matrix_vec_mul_kernel<<<128, mat.rows_per_block()>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
2052  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2053  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2054  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2055  detail::cuda_arg<NumericT>(vec),
2056  static_cast<unsigned int>(vec.start()),
2057  static_cast<unsigned int>(vec.stride()),
2058  static_cast<unsigned int>(vec.size()),
2059  detail::cuda_arg<NumericT>(result),
2060  static_cast<unsigned int>(result.start()),
2061  static_cast<unsigned int>(result.stride()),
2062  static_cast<unsigned int>(result.size())
2063  );
2064  VIENNACL_CUDA_LAST_ERROR_CHECK("sliced_ell_matrix_vec_mul_kernel");
2065 }
2066 
2067 
2068 //
2069 // Hybrid Matrix
2070 //
2071 
2072 
2073 template<typename NumericT>
2074 __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
2075  const NumericT * ell_elements,
2076  const unsigned int * csr_rows,
2077  const unsigned int * csr_cols,
2078  const NumericT * csr_elements,
2079  const NumericT * x,
2080  unsigned int start_x,
2081  unsigned int inc_x,
2082  NumericT * result,
2083  unsigned int start_result,
2084  unsigned int inc_result,
2085  unsigned int row_num,
2086  unsigned int internal_row_num,
2087  unsigned int items_per_row,
2088  unsigned int aligned_items_per_row
2089  )
2090 {
2091  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2092  unsigned int glb_sz = gridDim.x * blockDim.x;
2093 
2094  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2095  {
2096  NumericT sum = 0;
2097 
2098  unsigned int offset = row_id;
2099  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2100  {
2101  NumericT val = ell_elements[offset];
2102 
2103 
2104  if (val != NumericT(0))
2105  {
2106  unsigned int col = ell_coords[offset];
2107  sum += (x[col * inc_x + start_x] * val);
2108  }
2109  }
2110 
2111  unsigned int col_begin = csr_rows[row_id];
2112  unsigned int col_end = csr_rows[row_id + 1];
2113 
2114  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2115  sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];
2116 
2117  result[row_id * inc_result + start_result] = sum;
2118  }
2119 }
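The HYB format keeps up to items_per_row entries per row in a padded ELL part and spills the remainder of each row into CSR arrays, so the kernel accumulates both contributions before writing the row's result. A CPU reference under these layout assumptions (hyb_spmv_reference is illustrative):

    #include <vector>

    // SpMV over the hybrid ELL+CSR layout: the ELL part is padded column-major
    // (entry k of row r at k * internal_rows + r); overflow entries sit in CSR.
    void hyb_spmv_reference(std::vector<unsigned int> const & ell_coords,
                            std::vector<double> const & ell_elements,
                            std::vector<unsigned int> const & csr_rows,
                            std::vector<unsigned int> const & csr_cols,
                            std::vector<double> const & csr_elements,
                            std::vector<double> const & x,
                            std::vector<double> & y,
                            std::size_t internal_rows,
                            std::size_t items_per_row)
    {
      for (std::size_t r = 0; r < y.size(); ++r)
      {
        double sum = 0;
        for (std::size_t k = 0; k < items_per_row; ++k)              // ELL part
        {
          double val = ell_elements[k * internal_rows + r];
          if (val != 0)
            sum += x[ell_coords[k * internal_rows + r]] * val;
        }
        for (unsigned int i = csr_rows[r]; i < csr_rows[r + 1]; ++i) // CSR overflow
          sum += x[csr_cols[i]] * csr_elements[i];
        y[r] = sum;
      }
    }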
2120 
2121 
2122 
2123 /** @brief Carries out matrix-vector multiplication with a hyb_matrix
2124  *
2125  * Implementation of the convenience expression result = prod(mat, vec);
2126  *
2127  * @param mat    The matrix
2128  * @param vec    The vector
2129  * @param result The result vector
2130  */
2131 template<typename NumericT, unsigned int AlignmentV>
2132 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2133                const viennacl::vector_base<NumericT> & vec,
2134                viennacl::vector_base<NumericT> & result)
2135 {
2136  hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2137  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2138  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2139  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2140  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2141  detail::cuda_arg<NumericT>(vec),
2142  static_cast<unsigned int>(vec.start()),
2143  static_cast<unsigned int>(vec.stride()),
2144  detail::cuda_arg<NumericT>(result),
2145  static_cast<unsigned int>(result.start()),
2146  static_cast<unsigned int>(result.stride()),
2147  static_cast<unsigned int>(mat.size1()),
2148  static_cast<unsigned int>(mat.internal_size1()),
2149  static_cast<unsigned int>(mat.ell_nnz()),
2150  static_cast<unsigned int>(mat.internal_ellnnz())
2151  );
2152  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
2153 }
2154 
2155 
2156 
2157 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2158 __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
2159  const NumericT * ell_elements,
2160  const unsigned int * csr_rows,
2161  const unsigned int * csr_cols,
2162  const NumericT * csr_elements,
2163  unsigned int row_num,
2164  unsigned int internal_row_num,
2165  unsigned int items_per_row,
2166  unsigned int aligned_items_per_row,
2167  const NumericT * d_mat,
2168  unsigned int d_mat_row_start,
2169  unsigned int d_mat_col_start,
2170  unsigned int d_mat_row_inc,
2171  unsigned int d_mat_col_inc,
2172  unsigned int d_mat_row_size,
2173  unsigned int d_mat_col_size,
2174  unsigned int d_mat_internal_rows,
2175  unsigned int d_mat_internal_cols,
2176  NumericT * result,
2177  unsigned int result_row_start,
2178  unsigned int result_col_start,
2179  unsigned int result_row_inc,
2180  unsigned int result_col_inc,
2181  unsigned int result_row_size,
2182  unsigned int result_col_size,
2183  unsigned int result_internal_rows,
2184  unsigned int result_internal_cols)
2185 {
2186  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2187  unsigned int glb_sz = gridDim.x * blockDim.x;
2188 
2189  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2190  {
2191  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2192  {
2193  NumericT sum = 0;
2194 
2195  unsigned int offset = row_id;
2196  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2197  {
2198  NumericT val = ell_elements[offset];
2199 
2200  if (val != NumericT(0))
2201  {
2202  sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
2203  d_mat_row_start, d_mat_row_inc,
2204  d_mat_col_start, d_mat_col_inc,
2205  d_mat_internal_rows, d_mat_internal_cols)] * val;
2206  }
2207  }
2208 
2209  unsigned int col_begin = csr_rows[row_id];
2210  unsigned int col_end = csr_rows[row_id + 1];
2211 
2212  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2213  {
2214  sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
2215  d_mat_row_start, d_mat_row_inc,
2216  d_mat_col_start, d_mat_col_inc,
2217  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2218  }
2219 
2220  result[ResultIndexT::apply(row_id, result_col,
2221  result_row_start, result_row_inc,
2222  result_col_start, result_col_inc,
2223  result_internal_rows, result_internal_cols)] = sum;
2224  }
2225  }
2226 }
2227 
2228 
2229 
2230 /** @brief Carries out Sparse Matrix(HYB)-Dense Matrix multiplication
2231  *
2232  * Implementation of the convenience expression result = prod(mat, d_mat);
2233  *
2234  * @param mat    The sparse matrix (HYB)
2235  * @param d_mat  The dense matrix
2236  * @param result The result matrix
2237  */
2238 template<typename NumericT, unsigned int AlignmentV>
2239 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2240                const viennacl::matrix_base<NumericT> & d_mat,
2241                viennacl::matrix_base<NumericT> & result)
2242 {
2243  if (d_mat.row_major() && result.row_major())
2244  {
2245  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2246  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2247  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2248  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2249  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2250  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2251  static_cast<unsigned int>(mat.size1()),
2252  static_cast<unsigned int>(mat.internal_size1()),
2253  static_cast<unsigned int>(mat.ell_nnz()),
2254  static_cast<unsigned int>(mat.internal_ellnnz()),
2255 
2256  detail::cuda_arg<NumericT>(d_mat),
2257  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2258  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2259  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2260  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2261 
2262  detail::cuda_arg<NumericT>(result),
2263  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2264  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2265  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2266  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2267  );
2268  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2269  }
2270  else if (d_mat.row_major() && !result.row_major())
2271  {
2272  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2273  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2274  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2275  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2276  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2277  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2278  static_cast<unsigned int>(mat.size1()),
2279  static_cast<unsigned int>(mat.internal_size1()),
2280  static_cast<unsigned int>(mat.ell_nnz()),
2281  static_cast<unsigned int>(mat.internal_ellnnz()),
2282 
2283  detail::cuda_arg<NumericT>(d_mat),
2284  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2285  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2286  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2287  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2288 
2289  detail::cuda_arg<NumericT>(result),
2290  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2291  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2292  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2293  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2294  );
2295  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2296  }
2297  else if (!d_mat.row_major() && result.row_major())
2298  {
2299  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2300  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2301  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2302  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2303  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2304  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2305  static_cast<unsigned int>(mat.size1()),
2306  static_cast<unsigned int>(mat.internal_size1()),
2307  static_cast<unsigned int>(mat.ell_nnz()),
2308  static_cast<unsigned int>(mat.internal_ellnnz()),
2309 
2310  detail::cuda_arg<NumericT>(d_mat),
2311  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2312  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2313  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2314  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2315 
2316  detail::cuda_arg<NumericT>(result),
2317  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2318  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2319  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2320  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2321  );
2322  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2323  }
2324  else
2325  {
2326  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2327  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2328  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2329  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2330  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2331  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2332  static_cast<unsigned int>(mat.size1()),
2333  static_cast<unsigned int>(mat.internal_size1()),
2334  static_cast<unsigned int>(mat.ell_nnz()),
2335  static_cast<unsigned int>(mat.internal_ellnnz()),
2336 
2337  detail::cuda_arg<NumericT>(d_mat),
2338  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2339  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2340  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2341  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2342 
2343  detail::cuda_arg<NumericT>(result),
2344  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2345  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2346  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2347  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2348  );
2349  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2350  }
2351 }
2352 
2353 
2354 
2355 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2356 __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
2357  const NumericT * ell_elements,
2358  const unsigned int * csr_rows,
2359  const unsigned int * csr_cols,
2360  const NumericT * csr_elements,
2361  unsigned int row_num,
2362  unsigned int internal_row_num,
2363  unsigned int items_per_row,
2364  unsigned int aligned_items_per_row,
2365  const NumericT * d_mat,
2366  unsigned int d_mat_row_start,
2367  unsigned int d_mat_col_start,
2368  unsigned int d_mat_row_inc,
2369  unsigned int d_mat_col_inc,
2370  unsigned int d_mat_row_size,
2371  unsigned int d_mat_col_size,
2372  unsigned int d_mat_internal_rows,
2373  unsigned int d_mat_internal_cols,
2374  NumericT * result,
2375  unsigned int result_row_start,
2376  unsigned int result_col_start,
2377  unsigned int result_row_inc,
2378  unsigned int result_col_inc,
2379  unsigned int result_row_size,
2380  unsigned int result_col_size,
2381  unsigned int result_internal_rows,
2382  unsigned int result_internal_cols)
2383 {
2384  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2385  unsigned int glb_sz = gridDim.x * blockDim.x;
2386 
2387  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2388  {
2389  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2390  {
2391  NumericT sum = 0;
2392 
2393  unsigned int offset = row_id;
2394  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2395  {
2396  NumericT val = ell_elements[offset];
2397 
2398  if (val != NumericT(0))
2399  {
2400  sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
2401  d_mat_row_start, d_mat_row_inc,
2402  d_mat_col_start, d_mat_col_inc,
2403  d_mat_internal_rows, d_mat_internal_cols)] * val;
2404  }
2405  }
2406 
2407  unsigned int col_begin = csr_rows[row_id];
2408  unsigned int col_end = csr_rows[row_id + 1];
2409 
2410  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2411  {
2412  sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
2413  d_mat_row_start, d_mat_row_inc,
2414  d_mat_col_start, d_mat_col_inc,
2415  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2416  }
2417 
2418  result[ResultIndexT::apply(row_id, result_col,
2419  result_row_start, result_row_inc,
2420  result_col_start, result_col_inc,
2421  result_internal_rows, result_internal_cols)] = sum;
2422  }
2423  }
2424 }
2425 
2426 
2427 
2428 /** @brief Carries out Sparse Matrix(HYB)-Dense Transposed Matrix multiplication
2429  *
2430  * Implementation of the convenience expression result = prod(mat, trans(d_mat));
2431  *
2432  * @param mat    The sparse matrix (HYB)
2433  * @param d_mat  The transposed dense matrix
2434  * @param result The result matrix
2435  */
2436 template<typename NumericT, unsigned int AlignmentV>
2437 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2438                const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
2439                                                   const viennacl::matrix_base<NumericT>,
2440                                                   viennacl::op_trans > & d_mat,
2441                viennacl::matrix_base<NumericT> & result)
2442 {
2443  if (d_mat.lhs().row_major() && result.row_major())
2444  {
2445  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2446  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2447  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2448  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2449  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2450  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2451  static_cast<unsigned int>(mat.size1()),
2452  static_cast<unsigned int>(mat.internal_size1()),
2453  static_cast<unsigned int>(mat.ell_nnz()),
2454  static_cast<unsigned int>(mat.internal_ellnnz()),
2455 
2456  detail::cuda_arg<NumericT>(d_mat.lhs()),
2457  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2458  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2459  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2460  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2461 
2462  detail::cuda_arg<NumericT>(result),
2463  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2464  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2465  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2466  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2467  );
2468  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2469  }
2470  else if (d_mat.lhs().row_major() && !result.row_major())
2471  {
2472  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2473  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2474  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2475  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2476  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2477  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2478  static_cast<unsigned int>(mat.size1()),
2479  static_cast<unsigned int>(mat.internal_size1()),
2480  static_cast<unsigned int>(mat.ell_nnz()),
2481  static_cast<unsigned int>(mat.internal_ellnnz()),
2482 
2483  detail::cuda_arg<NumericT>(d_mat.lhs()),
2484  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2485  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2486  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2487  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2488 
2489  detail::cuda_arg<NumericT>(result),
2490  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2491  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2492  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2493  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2494  );
2495  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2496  }
2497  else if (!d_mat.lhs().row_major() && result.row_major())
2498  {
2499  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2500  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2501  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2502  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2503  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2504  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2505  static_cast<unsigned int>(mat.size1()),
2506  static_cast<unsigned int>(mat.internal_size1()),
2507  static_cast<unsigned int>(mat.ell_nnz()),
2508  static_cast<unsigned int>(mat.internal_ellnnz()),
2509 
2510  detail::cuda_arg<NumericT>(d_mat.lhs()),
2511  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2512  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2513  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2514  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2515 
2516  detail::cuda_arg<NumericT>(result),
2517  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2518  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2519  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2520  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2521  );
2522  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2523  }
2524  else
2525  {
2526  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2527  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2528  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2529  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2530  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2531  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2532  static_cast<unsigned int>(mat.size1()),
2533  static_cast<unsigned int>(mat.internal_size1()),
2534  static_cast<unsigned int>(mat.ell_nnz()),
2535  static_cast<unsigned int>(mat.internal_ellnnz()),
2536 
2537  detail::cuda_arg<NumericT>(d_mat.lhs()),
2538  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2539  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2540  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2541  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2542 
2543  detail::cuda_arg<NumericT>(result),
2544  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2545  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2546  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2547  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2548  );
2549  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2550  }
2551 }
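As with the other formats, all HYB overloads are reached through the generic product interface; a sketch for the transposed case (dimensions in the comments are illustrative):

    #include "viennacl/hyb_matrix.hpp"
    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/prod.hpp"

    // C = A * B^T with A in HYB format; dispatches to
    // hyb_matrix_d_tr_mat_mul_kernel on the CUDA backend.
    void hyb_times_transposed(viennacl::hyb_matrix<float> const & A,  // m x k
                              viennacl::matrix<float> const & B,      // n x k
                              viennacl::matrix<float> & C)            // m x n
    {
      C = viennacl::linalg::prod(A, viennacl::trans(B));
    }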
2552 
2553 
2554 } // namespace cuda
2555 } //namespace linalg
2556 } //namespace viennacl
2557 
2558 
2559 #endif
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:405
Simple enable-if variant that uses the SFINAE pattern.
Definition: enable_if.hpp:30
vcl_size_t size1() const
Definition: ell_matrix.hpp:91
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
handle_type & handle2()
Definition: ell_matrix.hpp:103
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
const handle_type & handle4() const
Definition: hyb_matrix.hpp:108
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const unsigned int *row_blocks, const NumericT *elements, unsigned int num_blocks, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Various little tools used here and there in ViennaCL.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
A tag class representing a lower triangular matrix.
Definition: forwards.h:809
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
vcl_size_t internal_ellnnz() const
Definition: hyb_matrix.hpp:101
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:340
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
This file provides the forward declarations for the main types used within ViennaCL.
vcl_size_t size2() const
Definition: ell_matrix.hpp:92
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
vcl_size_t rows_per_block() const
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:245
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
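Sliced ELLPACK groups rows into blocks of C rows (cf. rows_per_block() below); each block b has its own width columns_per_block[b], and its entries start at block_start[b], stored column-major within the block. A one-thread-per-row sketch under these layout assumptions, with zero padding and the vector strides omitted (not ViennaCL's actual kernel body):

template<typename NumericT, unsigned int RowsPerBlock>
__global__ void sliced_ell_spmv_sketch(const unsigned int * columns_per_block, // width of each row block
                                       const unsigned int * column_indices,    // padded, block-local column-major
                                       const unsigned int * block_start,       // data offset of each row block
                                       const NumericT     * elements,
                                       const NumericT     * x,
                                       NumericT           * result,
                                       unsigned int size_result)
{
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
       row < size_result;
       row += gridDim.x * blockDim.x)
  {
    unsigned int block_id  = row / RowsPerBlock;
    unsigned int local_row = row % RowsPerBlock;
    unsigned int offset    = block_start[block_id] + local_row;

    NumericT sum = 0;
    for (unsigned int item = 0; item < columns_per_block[block_id]; ++item)
    {
      NumericT val = elements[offset + item * RowsPerBlock];
      if (val != NumericT(0))                    // padding entries are stored as zeros
        sum += x[column_indices[offset + item * RowsPerBlock]] * val;
    }
    result[row] = sum;
  }
}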
statement sum(scalar< NumericT > const *s, vector_base< NumericT > const *x)
Definition: preset.hpp:246
vcl_size_t size1() const
Returns the size of the result vector.
Definition: matrix.hpp:72
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Helper struct for accessing an element of a row- or column-major matrix.
vcl_size_t internal_size1() const
Definition: ell_matrix.hpp:88
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const
Returns the stride within the buffer (in multiples of sizeof(NumericT))
Definition: vector_def.hpp:124
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
vcl_size_t ell_nnz() const
Definition: hyb_matrix.hpp:102
vcl_size_t size1() const
Definition: hyb_matrix.hpp:98
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
Definition: blas3.hpp:36
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
const handle_type & handle2() const
Definition: hyb_matrix.hpp:106
A tag class representing an upper triangular matrix.
Definition: forwards.h:814
vcl_size_t internal_size1() const
Definition: hyb_matrix.hpp:95
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
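This variant exploits sparsity across rows as well: only the nonzero_rows populated rows are traversed, and each partial result is scattered to its global row index. A stride-free sketch of the idea (result is assumed to be zero-initialized beforehand, e.g. via clear()):

template<typename NumericT>
__global__ void ccs_spmv_sketch(const unsigned int * row_jumper,    // CSR-style pointers over the populated rows
                                const unsigned int * row_indices,   // global indices of the populated rows
                                const unsigned int * column_indices,
                                const NumericT     * elements,
                                unsigned int nonzero_rows,
                                const NumericT     * x,
                                NumericT           * result)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
       i < nonzero_rows;
       i += gridDim.x * blockDim.x)
  {
    NumericT sum = 0;
    for (unsigned int j = row_jumper[i]; j < row_jumper[i+1]; ++j)
      sum += elements[j] * x[column_indices[j]];
    result[row_indices[i]] = sum;   // scatter to the row's global position
  }
}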
Sparse matrix class using the sliced ELLPACK format with parameters C, σ.
Definition: forwards.h:402
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const
Returns the OpenCL handle to the row index array.
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
std::size_t vcl_size_t
Definition: forwards.h:74
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
Definition: matrix.hpp:838
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
vcl_size_t maxnnz() const
Definition: ell_matrix.hpp:95
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:853
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
Implementations of direct triangular solvers for sparse matrices using CUDA.
handle_type & handle()
Definition: ell_matrix.hpp:100
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
const handle_type & handle3() const
Returns the OpenCL handle to the row block array.
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result)
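The coordinate kernel processes nonzeros in chunks delimited by group_boundaries and merges partial row sums through a shared-memory reduction. A far simpler atomic-based variant, shown only to illustrate the COO layout itself (assuming interleaved (row, column) pairs and an explicit nnz count, which the ViennaCL kernel does not take; atomicAdd on double further requires compute capability 6.0 or newer):

template<typename NumericT>
__global__ void coo_spmv_atomic_sketch(const unsigned int * coords,   // interleaved (row, column) pairs
                                       const NumericT     * elements,
                                       unsigned int nnz,
                                       const NumericT     * x,
                                       NumericT           * result)   // must be zero-initialized
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
       i < nnz;
       i += gridDim.x * blockDim.x)
  {
    // one thread per nonzero; concurrent updates to the same row are resolved atomically
    atomicAdd(result + coords[2*i], elements[i] * x[coords[2*i + 1]]);
  }
}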
void clear()
Resets all entries to zero. Does not change the size of the vector.
Definition: vector.hpp:861
Common routines for CUDA execution.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations such as norms and inner products are implemented separately.
bool row_major() const
Definition: matrix_def.hpp:239
NumericT max(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:47
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
const handle_type & handle() const
Definition: hyb_matrix.hpp:105
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:819
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
Definition: mem_handle.hpp:89
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
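For the sparse-times-dense product C = A * B, a natural mapping assigns one CUDA block to each sparse row while its threads sweep the dense columns. A sketch assuming plain row-major, unpadded B and C (the actual kernel instead supports both storage orders and submatrix views via the start/inc/internal-size parameters above):

template<typename NumericT>
__global__ void csr_dense_mul_sketch(const unsigned int * row_indices,
                                     const unsigned int * col_indices,
                                     const NumericT     * elements,
                                     const NumericT     * B,       // dense factor, row-major
                                     NumericT           * C,       // dense result, row-major
                                     unsigned int A_rows,
                                     unsigned int B_cols)
{
  for (unsigned int row = blockIdx.x; row < A_rows; row += gridDim.x)       // one block per sparse row
    for (unsigned int col = threadIdx.x; col < B_cols; col += blockDim.x)   // threads sweep dense columns
    {
      NumericT sum = 0;
      for (unsigned int i = row_indices[row]; i < row_indices[row+1]; ++i)
        sum += elements[i] * B[col_indices[i] * B_cols + col];
      C[row * B_cols + col] = sum;
    }
}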
A tag class representing transposed matrices.
Definition: forwards.h:219
A sparse square matrix in compressed sparse rows format.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:27
A tag for column-major storage of a dense matrix.
Definition: forwards.h:320
LHS & lhs() const
Get left hand side operand.
Definition: matrix.hpp:66
size_type start() const
Returns the offset within the buffer.
Definition: vector_def.hpp:122
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
vcl_size_t internal_maxnnz() const
Definition: ell_matrix.hpp:94
Implementation of the ViennaCL scalar class.
const handle_type & handle() const
Returns the memory handle.
Definition: vector_def.hpp:128
const handle_type & handle3() const
Definition: hyb_matrix.hpp:107
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:824
const handle_type & handle5() const
Definition: hyb_matrix.hpp:109
A sparse square matrix, where entries are stored as triplets (i, j, val), where i and j are the row and column indices, respectively.
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)