ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
matrix_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/scalar.hpp"
27 #include "viennacl/vector.hpp"
29 #include "viennacl/tools/tools.hpp"
33 #include "viennacl/traits/size.hpp"
37 
39 
45 
46 namespace viennacl
47 {
48 namespace linalg
49 {
50 namespace cuda
51 {
52 //
53 // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
54 //
55 
// Host-side launcher for trans_kernel.
//
// Passes to the kernel, in this order: the device argument for proxy.lhs() followed by its start,
// internal size, size and stride (each as a pair for dimension 1 and 2); the device argument for
// temp_trans followed by its start, internal size and stride; and finally proxy.lhs().row_major()
// as a bool. The launch uses a fixed configuration of 128 blocks with 128 threads each and is
// followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// Presumably the kernel writes the transpose of proxy.lhs() into temp_trans (kernel body is not
// part of this listing -- confirm).
//
// NOTE(review): the embedded listing numbers jump from 56 to 58, so the line carrying the function
// name and the declaration of 'proxy' is missing from this copy.
56 template<typename NumericT, typename SizeT, typename DistanceT>
58  matrix_base<NumericT> & temp_trans)
59 {
60  trans_kernel<<<128,128>>>(detail::cuda_arg<NumericT>(proxy.lhs()),
61  static_cast<unsigned int>(proxy.lhs().start1()), static_cast<unsigned int>(proxy.lhs().start2()),
62  static_cast<unsigned int>(proxy.lhs().internal_size1()), static_cast<unsigned int>(proxy.lhs().internal_size2()),
63  static_cast<unsigned int>(proxy.lhs().size1()), static_cast<unsigned int>(proxy.lhs().size2()),
64  static_cast<unsigned int>(proxy.lhs().stride1()), static_cast<unsigned int>(proxy.lhs().stride2()),
65 
66  detail::cuda_arg<NumericT>(temp_trans),
67  static_cast<unsigned int>(temp_trans.start1()), static_cast<unsigned int>(temp_trans.start2()),
68  static_cast<unsigned int>(temp_trans.internal_size1()), static_cast<unsigned int>(temp_trans.internal_size2()),
69  static_cast<unsigned int>(temp_trans.stride1()), static_cast<unsigned int>(temp_trans.stride2()),
70  static_cast<bool>(proxy.lhs().row_major()));
71  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_kernel");
72 }
73 
74 
// Host-side launcher for am_row_kernel / am_col_kernel.
//
// mat1 and mat2 must share the same storage layout (asserted). Depending on mat1.row_major() the
// row- or column-major kernel is launched with: mat1 and its start/stride/size/internal size, a
// scalar argument derived from alpha together with the packed option word, and mat2 with its
// start/stride/internal size. Each launch uses 128 blocks of 128 threads and is followed by
// VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// len_alpha, reciprocal_alpha and flip_sign_alpha are only consumed by detail::make_options here;
// how the kernel interprets the resulting word is not visible in this listing.
// Presumably the operation is mat1 = alpha * mat2 -- confirm against the kernel.
//
// NOTE(review): listing numbers 76 and 86 are absent. Line 76 would carry the function name and the
// declaration of 'mat1'; line 86 sits directly before 'temporary_alpha = alpha;', which as shown
// runs unconditionally -- presumably it was guarded by a condition on ScalarT; confirm upstream.
75 template<typename NumericT, typename ScalarT>
77  matrix_base<NumericT> const & mat2, ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
78 {
79  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
80 
81  typedef NumericT value_type;
82 
  // Pack the three alpha modifiers into the single option word the kernels expect.
83  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
84 
  // Local copy of alpha in the matrix value type; handed to detail::arg_reference below.
85  value_type temporary_alpha = 0;
87  temporary_alpha = alpha;
88 
89  if (mat1.row_major())
90  {
91  am_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
92  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
93  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
94  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
95  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
96 
97  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
98  options_alpha,
99  detail::cuda_arg<value_type>(mat2),
100  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
101  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
102  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
103  );
104  VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
105  }
106  else
107  {
108  am_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
109  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
110  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
111  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
112  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
113 
114  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
115  options_alpha,
116  detail::cuda_arg<value_type>(mat2),
117  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
118  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
119  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
120  );
121  VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
122  }
123 }
124 
125 
// Host-side launcher for ambm_row_kernel / ambm_col_kernel.
//
// mat1, mat2 and mat3 must share the same storage layout (asserted). Depending on mat1.row_major()
// the row- or column-major kernel is launched with: mat1 and its start/stride/size/internal size;
// the alpha scalar argument, its option word, and mat2 with start/stride/internal size; the beta
// scalar argument, its option word, and mat3 with start/stride/internal size. Each launch uses
// 128 blocks of 128 threads and is followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// Presumably the operation is mat1 = alpha * mat2 + beta * mat3 -- confirm against the kernel,
// which is not part of this listing.
//
// NOTE(review): listing numbers 127, 138 and 145 are absent. Line 127 would carry the function name
// and the declaration of 'mat1'; lines 138 and 145 sit directly before the assignments to
// temporary_alpha / temporary_beta, which as shown run unconditionally -- presumably each was
// guarded by a condition on the scalar type; confirm upstream.
126 template<typename NumericT, typename ScalarT1, typename ScalarT2>
128  matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
129  matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
130 {
131  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
132 
133  typedef NumericT value_type;
134 
  // Option word and local value-type copy for alpha.
135  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
136 
137  value_type temporary_alpha = 0;
139  temporary_alpha = alpha;
140 
141 
  // Option word and local value-type copy for beta.
142  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
143 
144  value_type temporary_beta = 0;
146  temporary_beta = beta;
147 
148 
149  if (mat1.row_major())
150  {
151  ambm_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
152  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
153  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
154  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
155  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
156 
157  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
158  options_alpha,
159  detail::cuda_arg<value_type>(mat2),
160  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
161  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
162  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
163 
164  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
165  options_beta,
166  detail::cuda_arg<value_type>(mat3),
167  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
168  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
169  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
170  );
171  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
172  }
173  else
174  {
175  ambm_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
176  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
177  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
178  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
179  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
180 
181  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
182  options_alpha,
183  detail::cuda_arg<value_type>(mat2),
184  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
185  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
186  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
187 
188  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
189  options_beta,
190  detail::cuda_arg<value_type>(mat3),
191  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
192  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
193  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
194  );
195  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
196  }
197 
198 }
199 
200 
// Host-side launcher for ambm_m_row_kernel / ambm_m_col_kernel.
//
// The argument list is identical to the one built for the ambm_* kernels: mat1 with
// start/stride/size/internal size, then (alpha, options_alpha, mat2 with start/stride/internal
// size), then (beta, options_beta, mat3 with start/stride/internal size). Only the kernel that is
// launched differs. All three matrices must share the same storage layout (asserted). Each launch
// uses 128 blocks of 128 threads and is followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// Presumably the operation is mat1 += alpha * mat2 + beta * mat3 -- confirm against the kernel,
// which is not part of this listing.
//
// NOTE(review): listing numbers 202, 213 and 220 are absent. Line 202 would carry the function name
// and the declaration of 'mat1'; lines 213 and 220 sit directly before the assignments to
// temporary_alpha / temporary_beta, which as shown run unconditionally -- presumably each was
// guarded by a condition on the scalar type; confirm upstream.
201 template<typename NumericT, typename ScalarT1, typename ScalarT2>
203  matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
204  matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
205 {
206  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
207 
208  typedef NumericT value_type;
209 
  // Option word and local value-type copy for alpha.
210  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
211 
212  value_type temporary_alpha = 0;
214  temporary_alpha = alpha;
215 
216 
  // Option word and local value-type copy for beta.
217  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
218 
219  value_type temporary_beta = 0;
221  temporary_beta = beta;
222 
223 
224  if (mat1.row_major())
225  {
226  ambm_m_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
227  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
228  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
229  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
230  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
231 
232  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
233  options_alpha,
234  detail::cuda_arg<value_type>(mat2),
235  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
236  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
237  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
238 
239  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
240  options_beta,
241  detail::cuda_arg<value_type>(mat3),
242  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
243  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
244  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
245  );
246  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
247  }
248  else
249  {
250  ambm_m_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
251  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
252  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
253  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
254  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
255 
256  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
257  options_alpha,
258  detail::cuda_arg<value_type>(mat2),
259  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
260  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
261  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
262 
263  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
264  options_beta,
265  detail::cuda_arg<value_type>(mat3),
266  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
267  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
268  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
269  );
270  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
271  }
272 
273 }
274 
275 
276 
277 
278 template<typename NumericT>
279 void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
280 {
281  typedef NumericT value_type;
282  value_type alpha = s;
283 
286 
287  if (mat.row_major())
288  {
289 
290  matrix_row_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
291  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
292  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
293  s1, s2,
294  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
295  alpha);
296  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
297  }
298  else
299  {
300  matrix_col_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
301  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
302  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
303  s1, s2,
304  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
305  alpha);
306  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
307  }
308 }
309 
// Host-side launcher for matrix_row_diagonal_assign_kernel / matrix_col_diagonal_assign_kernel.
//
// Copies 's' into a local of the matrix value type and launches the kernel matching
// mat.row_major(), passing mat with its start, stride, size and internal size followed by that
// value. Each launch uses 128 blocks of 128 threads and is followed by
// VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// Presumably the kernel writes the value onto the diagonal entries of mat (kernel body is not part
// of this listing -- confirm).
//
// NOTE(review): listing number 311 is absent, so the line carrying the function name and the
// declarations of 'mat' and 's' is missing from this copy.
310 template<typename NumericT>
312 {
313  typedef NumericT value_type;
314  value_type alpha = s;
315 
316  if (mat.row_major())
317  {
318  matrix_row_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
319  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
320  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
321  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
322  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
323  alpha);
324  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
325  }
326  else
327  {
328  matrix_col_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
329  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
330  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
331  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
332  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
333  alpha);
334  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
335  }
336 }
337 
338 
// Fills 'mat' from 'vec' in two steps: first the whole matrix is set to zero via matrix_assign,
// then av_kernel is launched with 'mat' as destination (linear start mat_start, linear step
// mat_stride, length viennacl::traits::size(vec)), the factor NumericT(1) with an option word of 0,
// and 'vec' with its start and stride as source.
//
// The start offset depends on k: for k < 0 the starting row is shifted by -k, otherwise the
// starting column is shifted by k. In the row-major branch the linear offset is
// (start1 + row * stride1) * internal_size2 + start2 + col * stride2.
//
// NOTE(review): listing numbers 340, 363, 374 and 375 are absent. Line 340 would carry the function
// name and the declarations of 'vec', 'k' and 'mat'. As shown, mat_stride is never assigned (it
// stays 0) and the column-major mat_start expression at listing line 373 has no continuation or
// terminator -- presumably the dropped lines hold the rest of that expression and the mat_stride
// assignments; confirm upstream.
339 template<typename NumericT>
341 {
342  typedef NumericT value_type;
343 
344  // Step 1: assign zero matrix:
345  matrix_assign(mat, NumericT(0));
346 
347  // Step 2: Assign diagonal:
348  unsigned int options_alpha = 0;
349 
350  vcl_size_t mat_start = 0;
351  vcl_size_t mat_stride = 0;
352  vcl_size_t mat_size = viennacl::traits::size(vec);
353  if (mat.row_major())
354  {
355  vcl_size_t first_row_index = 0;
356  vcl_size_t first_col_index = 0;
357  if (k < 0)
358  first_row_index = vcl_size_t(-k);
359  else
360  first_col_index = vcl_size_t(k);
361  mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
362  + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
364  }
365  else
366  {
367  vcl_size_t first_row_index = 0;
368  vcl_size_t first_col_index = 0;
369  if (k < 0)
370  first_row_index = vcl_size_t(-k);
371  else
372  first_col_index = vcl_size_t(k);
373  mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
376  }
377 
378  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
379  static_cast<unsigned int>(mat_start),
380  static_cast<unsigned int>(mat_stride),
381  static_cast<unsigned int>(mat_size),
382 
383  detail::cuda_arg<value_type>(NumericT(1)),
384  options_alpha,
385  detail::cuda_arg<value_type>(vec),
386  static_cast<unsigned int>(viennacl::traits::start(vec)),
387  static_cast<unsigned int>(viennacl::traits::stride(vec)) );
388  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
389 }
390 
// Launches av_kernel with 'vec' as destination (its start, stride and size) and 'mat' as source,
// read from linear offset mat_start with linear step mat_stride; the factor is NumericT(1) with an
// option word of 0.
//
// The start offset depends on k: for k < 0 the starting row is shifted by -k, otherwise the
// starting column is shifted by k. In the row-major branch the linear offset is
// (start1 + row * stride1) * internal_size2 + start2 + col * stride2.
//
// NOTE(review): listing numbers 392, 410, 421 and 422 are absent. Line 392 would carry the function
// name and the declarations of 'mat', 'k' and 'vec'. As shown, mat_stride is never assigned (it
// stays 0) and the column-major mat_start expression at listing line 420 has no continuation or
// terminator -- presumably the dropped lines hold the rest of that expression and the mat_stride
// assignments; confirm upstream.
391 template<typename NumericT>
393 {
394  typedef NumericT value_type;
395 
396  unsigned int options_alpha = 0;
397 
398  vcl_size_t mat_start = 0;
399  vcl_size_t mat_stride = 0;
400  if (mat.row_major())
401  {
402  vcl_size_t first_row_index = 0;
403  vcl_size_t first_col_index = 0;
404  if (k < 0)
405  first_row_index = vcl_size_t(-k);
406  else
407  first_col_index = vcl_size_t(k);
408  mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
409  + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
411  }
412  else
413  {
414  vcl_size_t first_row_index = 0;
415  vcl_size_t first_col_index = 0;
416  if (k < 0)
417  first_row_index = vcl_size_t(-k);
418  else
419  first_col_index = vcl_size_t(k);
420  mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
423  }
424 
425  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
426  static_cast<unsigned int>(viennacl::traits::start(vec)),
427  static_cast<unsigned int>(viennacl::traits::stride(vec)),
428  static_cast<unsigned int>(viennacl::traits::size(vec)),
429 
430  detail::cuda_arg<value_type>(NumericT(1)),
431  options_alpha,
432  detail::cuda_arg<value_type>(mat),
433  static_cast<unsigned int>(mat_start),
434  static_cast<unsigned int>(mat_stride));
435  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
436 }
437 
438 template<typename NumericT>
439 void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
440 {
441  typedef NumericT value_type;
442 
443  unsigned int options_alpha = 0;
444 
445  vcl_size_t mat_start = 0;
446  vcl_size_t mat_stride = 0;
447  if (mat.row_major())
448  {
450  mat_stride = viennacl::traits::stride2(mat);
451  }
452  else
453  {
456  }
457 
458  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
459  static_cast<unsigned int>(viennacl::traits::start(vec)),
460  static_cast<unsigned int>(viennacl::traits::stride(vec)),
461  static_cast<unsigned int>(viennacl::traits::size(vec)),
462 
463  detail::cuda_arg<value_type>(NumericT(1)),
464  options_alpha,
465  detail::cuda_arg<value_type>(mat),
466  static_cast<unsigned int>(mat_start),
467  static_cast<unsigned int>(mat_stride));
468  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
469 }
470 
471 template<typename NumericT>
472 void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
473 {
474  typedef NumericT value_type;
475 
476  unsigned int options_alpha = 0;
477 
478  vcl_size_t mat_start = 0;
479  vcl_size_t mat_stride = 0;
480  if (mat.row_major())
481  {
484  }
485  else
486  {
488  mat_stride = viennacl::traits::stride2(mat);
489  }
490 
491  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
492  static_cast<unsigned int>(viennacl::traits::start(vec)),
493  static_cast<unsigned int>(viennacl::traits::stride(vec)),
494  static_cast<unsigned int>(viennacl::traits::size(vec)),
495 
496  detail::cuda_arg<value_type>(NumericT(1)),
497  options_alpha,
498  detail::cuda_arg<value_type>(mat),
499  static_cast<unsigned int>(mat_start),
500  static_cast<unsigned int>(mat_stride));
501  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
502 }
503 
504 
505 //
507 //
508 
509 
// Host-side launcher for element_op_int_row_kernel / element_op_int_col_kernel (generic NumericT).
//
// A, proxy.lhs() and proxy.rhs() must share the same storage layout (asserted). Depending on
// A.row_major() the row- or column-major kernel is launched with: A and its
// start/stride/size/internal size; proxy.lhs() and proxy.rhs(), each with start/stride/internal
// size; and op_type, which encodes the operation (0: product, 1: division, 2: power). Each launch
// uses 128 blocks of 128 threads and is followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// NOTE(review): listing numbers 511, 512, 519 and 521 are absent. Lines 511-512 would carry the
// function name and the declarations of 'A' and 'proxy'. Lines 519 and 521 sit directly before the
// two op_type assignments, which as shown run unconditionally (leaving op_type at 0) -- presumably
// each was guarded by a condition on OpT; confirm upstream.
// NOTE(review): the labels passed to the error check ("element_op_row_kernel" /
// "element_op_col_kernel") do not match the names of the *_int_* kernels actually launched.
510 template<typename NumericT, typename SizeT, typename OpT>
513 {
514  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
515 
516  typedef NumericT value_type;
517 
518  unsigned int op_type = 2; //0: product, 1: division, 2: power
520  op_type = 1;
522  op_type = 0;
523 
524  if (A.row_major())
525  {
526  element_op_int_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
527  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
528  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
529  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
530  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
531 
532  detail::cuda_arg<value_type>(proxy.lhs()),
533  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
534  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
535  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
536 
537  detail::cuda_arg<value_type>(proxy.rhs()),
538  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
539  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
540  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
541 
542  op_type
543  );
544  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
545  }
546  else
547  {
548  element_op_int_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
549  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
550  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
551  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
552  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
553 
554  detail::cuda_arg<value_type>(proxy.lhs()),
555  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
556  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
557  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
558 
559  detail::cuda_arg<value_type>(proxy.rhs()),
560  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
561  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
562  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
563 
564  op_type
565  );
566  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
567  }
568 }
569 
// Host-side launcher for element_op_row_kernel / element_op_col_kernel with value_type fixed to
// float.
//
// A, proxy.lhs() and proxy.rhs() must share the same storage layout (asserted). Depending on
// A.row_major() the row- or column-major kernel is launched with: A and its
// start/stride/size/internal size; proxy.lhs() and proxy.rhs(), each with start/stride/internal
// size; and op_type, which encodes the operation (0: product, 1: division, 2: power). Each launch
// uses 128 blocks of 128 threads and is followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// NOTE(review): listing numbers 571, 572, 579 and 581 are absent. Lines 571-572 would carry the
// function name and the declarations of 'A' and 'proxy'. Lines 579 and 581 sit directly before the
// two op_type assignments, which as shown run unconditionally (leaving op_type at 0) -- presumably
// each was guarded by a condition on OpT; confirm upstream.
570 template<typename SizeT, typename OpT>
573 {
574  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
575 
576  typedef float value_type;
577 
578  unsigned int op_type = 2; //0: product, 1: division, 2: power
580  op_type = 1;
582  op_type = 0;
583 
584  if (A.row_major())
585  {
586  element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
587  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
588  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
589  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
590  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
591 
592  detail::cuda_arg<value_type>(proxy.lhs()),
593  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
594  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
595  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
596 
597  detail::cuda_arg<value_type>(proxy.rhs()),
598  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
599  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
600  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
601 
602  op_type
603  );
604  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
605  }
606  else
607  {
608  element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
609  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
610  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
611  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
612  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
613 
614  detail::cuda_arg<value_type>(proxy.lhs()),
615  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
616  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
617  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
618 
619  detail::cuda_arg<value_type>(proxy.rhs()),
620  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
621  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
622  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
623 
624  op_type
625  );
626  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
627  }
628 }
629 
// Host-side launcher for element_op_row_kernel / element_op_col_kernel with value_type fixed to
// double.
//
// A, proxy.lhs() and proxy.rhs() must share the same storage layout (asserted). Depending on
// A.row_major() the row- or column-major kernel is launched with: A and its
// start/stride/size/internal size; proxy.lhs() and proxy.rhs(), each with start/stride/internal
// size; and op_type, which encodes the operation (0: product, 1: division, 2: power). Each launch
// uses 128 blocks of 128 threads and is followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// NOTE(review): listing numbers 631, 632, 639 and 641 are absent. Lines 631-632 would carry the
// function name and the declarations of 'A' and 'proxy'. Lines 639 and 641 sit directly before the
// two op_type assignments, which as shown run unconditionally (leaving op_type at 0) -- presumably
// each was guarded by a condition on OpT; confirm upstream.
630 template<typename SizeT, typename OpT>
633 {
634  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
635 
636  typedef double value_type;
637 
638  unsigned int op_type = 2; //0: product, 1: division, 2: power
640  op_type = 1;
642  op_type = 0;
643 
644  if (A.row_major())
645  {
646  element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
647  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
648  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
649  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
650  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
651 
652  detail::cuda_arg<value_type>(proxy.lhs()),
653  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
654  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
655  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
656 
657  detail::cuda_arg<value_type>(proxy.rhs()),
658  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
659  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
660  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
661 
662  op_type
663  );
664  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
665  }
666  else
667  {
668  element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
669  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
670  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
671  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
672  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
673 
674  detail::cuda_arg<value_type>(proxy.lhs()),
675  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
676  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
677  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
678 
679  detail::cuda_arg<value_type>(proxy.rhs()),
680  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
681  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
682  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
683 
684  op_type
685  );
686  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
687  }
688 }
689 
690 //
692 //
693 
694 // Note: Due to CUDA vs C-preprocessor interference (concatenation seems to be broken in at least CUDA 4.2),
695 // we could not find a more 'automatic' way of generating the overloads below...
696 
697 // abs
// Host-side launcher for the element-wise 'abs' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 698 to 701); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_abs_kernel or matrix_col_element_abs_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes abs(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
698 template<typename NumericT>
701 {
 // Mixed row-/column-major operands are rejected up front.
702  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
703 
704  typedef NumericT value_type;
705 
 // Pick the kernel variant that matches the layout of A.
706  if (A.row_major())
707  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
708  matrix_row_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
709  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
710  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
711  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
712  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
713 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
714  detail::cuda_arg<value_type>(proxy.lhs()),
715  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
716  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
717  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
718  );
719  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
720  }
721  else
722  {
 // Column-major variant: same argument order as the row-major launch above.
723  matrix_col_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
724  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
725  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
726  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
727  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
728 
729  detail::cuda_arg<value_type>(proxy.lhs()),
730  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
731  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
732  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
733  );
734  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
735  }
736 }
737 
738 
739 // acos
// Host-side launcher for the element-wise 'acos' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 740 to 743); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_acos_kernel or matrix_col_element_acos_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes acos(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
740 template<typename NumericT>
743 {
 // Mixed row-/column-major operands are rejected up front.
744  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
745 
746  typedef NumericT value_type;
747 
 // Pick the kernel variant that matches the layout of A.
748  if (A.row_major())
749  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
750  matrix_row_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
751  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
752  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
753  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
754  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
755 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
756  detail::cuda_arg<value_type>(proxy.lhs()),
757  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
758  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
759  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
760  );
761  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
762  }
763  else
764  {
 // Column-major variant: same argument order as the row-major launch above.
765  matrix_col_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
766  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
767  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
768  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
769  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
770 
771  detail::cuda_arg<value_type>(proxy.lhs()),
772  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
773  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
774  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
775  );
776  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
777  }
778 }
779 
780 
781 // asin
// Host-side launcher for the element-wise 'asin' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 782 to 785); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_asin_kernel or matrix_col_element_asin_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel, so a
//    reported failure points at the kernel that was actually started.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes asin(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
782 template<typename NumericT>
785 {
 // Mixed row-/column-major operands are rejected up front.
786  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
787 
788  typedef NumericT value_type;
789 
 // Pick the kernel variant that matches the layout of A.
790  if (A.row_major())
791  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
792  matrix_row_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
793  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
794  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
795  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
796  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
797 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
798  detail::cuda_arg<value_type>(proxy.lhs()),
799  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
800  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
801  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
802  );
803  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
804  }
805  else
806  {
 // Column-major variant: same argument order as the row-major launch above.
807  matrix_col_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
808  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
809  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
810  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
811  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
812 
813  detail::cuda_arg<value_type>(proxy.lhs()),
814  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
815  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
816  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
817  );
 // The label must name the asin kernel launched above, not the sin kernel.
818  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_asin_kernel");
819  }
820 }
821 
822 
823 // atan
// Host-side launcher for the element-wise 'atan' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 824 to 827); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_atan_kernel or matrix_col_element_atan_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes atan(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
824 template<typename NumericT>
827 {
 // Mixed row-/column-major operands are rejected up front.
828  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
829 
830  typedef NumericT value_type;
831 
 // Pick the kernel variant that matches the layout of A.
832  if (A.row_major())
833  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
834  matrix_row_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
835  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
836  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
837  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
838  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
839 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
840  detail::cuda_arg<value_type>(proxy.lhs()),
841  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
842  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
843  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
844  );
845  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
846  }
847  else
848  {
 // Column-major variant: same argument order as the row-major launch above.
849  matrix_col_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
850  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
851  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
852  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
853  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
854 
855  detail::cuda_arg<value_type>(proxy.lhs()),
856  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
857  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
858  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
859  );
860  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
861  }
862 }
863 
864 
865 // ceil
// Host-side launcher for the element-wise 'ceil' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 866 to 869); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_ceil_kernel or matrix_col_element_ceil_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes ceil(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
866 template<typename NumericT>
869 {
 // Mixed row-/column-major operands are rejected up front.
870  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
871 
872  typedef NumericT value_type;
873 
 // Pick the kernel variant that matches the layout of A.
874  if (A.row_major())
875  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
876  matrix_row_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
877  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
878  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
879  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
880  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
881 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
882  detail::cuda_arg<value_type>(proxy.lhs()),
883  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
884  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
885  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
886  );
887  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
888  }
889  else
890  {
 // Column-major variant: same argument order as the row-major launch above.
891  matrix_col_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
892  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
893  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
894  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
895  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
896 
897  detail::cuda_arg<value_type>(proxy.lhs()),
898  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
899  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
900  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
901  );
902  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
903  }
904 }
905 
906 
907 // cos
// Host-side launcher for the element-wise 'cos' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 908 to 911); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_cos_kernel or matrix_col_element_cos_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes cos(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
908 template<typename NumericT>
911 {
 // Mixed row-/column-major operands are rejected up front.
912  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
913 
914  typedef NumericT value_type;
915 
 // Pick the kernel variant that matches the layout of A.
916  if (A.row_major())
917  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
918  matrix_row_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
919  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
920  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
921  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
922  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
923 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
924  detail::cuda_arg<value_type>(proxy.lhs()),
925  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
926  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
927  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
928  );
929  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
930  }
931  else
932  {
 // Column-major variant: same argument order as the row-major launch above.
933  matrix_col_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
934  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
935  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
936  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
937  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
938 
939  detail::cuda_arg<value_type>(proxy.lhs()),
940  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
941  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
942  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
943  );
944  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
945  }
946 }
947 
948 
949 // cosh
// Host-side launcher for the element-wise 'cosh' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 950 to 953); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_cosh_kernel or matrix_col_element_cosh_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes cosh(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
950 template<typename NumericT>
953 {
 // Mixed row-/column-major operands are rejected up front.
954  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
955 
956  typedef NumericT value_type;
957 
 // Pick the kernel variant that matches the layout of A.
958  if (A.row_major())
959  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
960  matrix_row_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
961  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
962  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
963  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
964  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
965 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
966  detail::cuda_arg<value_type>(proxy.lhs()),
967  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
968  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
969  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
970  );
971  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
972  }
973  else
974  {
 // Column-major variant: same argument order as the row-major launch above.
975  matrix_col_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
976  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
977  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
978  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
979  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
980 
981  detail::cuda_arg<value_type>(proxy.lhs()),
982  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
983  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
984  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
985  );
986  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
987  }
988 }
989 
990 
991 // exp
// Host-side launcher for the element-wise 'exp' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 992 to 995); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_exp_kernel or matrix_col_element_exp_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes exp(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
992 template<typename NumericT>
995 {
 // Mixed row-/column-major operands are rejected up front.
996  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
997 
998  typedef NumericT value_type;
999 
 // Pick the kernel variant that matches the layout of A.
1000  if (A.row_major())
1001  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
1002  matrix_row_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1003  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1004  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1005  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1006  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1007 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
1008  detail::cuda_arg<value_type>(proxy.lhs()),
1009  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1010  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1011  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1012  );
1013  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
1014  }
1015  else
1016  {
 // Column-major variant: same argument order as the row-major launch above.
1017  matrix_col_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1018  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1019  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1020  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1021  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1022 
1023  detail::cuda_arg<value_type>(proxy.lhs()),
1024  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1025  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1026  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1027  );
1028  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
1029  }
1030 }
1031 
1032 
1033 // fabs
// Host-side launcher for the element-wise 'fabs' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 1034 to 1037); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_fabs_kernel or matrix_col_element_fabs_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes fabs(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
1034 template<typename NumericT>
1037 {
 // Mixed row-/column-major operands are rejected up front.
1038  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1039 
1040  typedef NumericT value_type;
1041 
 // Pick the kernel variant that matches the layout of A.
1042  if (A.row_major())
1043  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
1044  matrix_row_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1045  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1046  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1047  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1048  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1049 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
1050  detail::cuda_arg<value_type>(proxy.lhs()),
1051  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1052  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1053  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1054  );
1055  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
1056  }
1057  else
1058  {
 // Column-major variant: same argument order as the row-major launch above.
1059  matrix_col_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1060  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1061  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1062  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1063  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1064 
1065  detail::cuda_arg<value_type>(proxy.lhs()),
1066  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1067  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1068  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1069  );
1070  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
1071  }
1072 }
1073 
1074 
1075 // floor
// Host-side launcher for the element-wise 'floor' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 1076 to 1079); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_floor_kernel or matrix_col_element_floor_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes floor(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
1076 template<typename NumericT>
1079 {
 // Mixed row-/column-major operands are rejected up front.
1080  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1081 
1082  typedef NumericT value_type;
1083 
 // Pick the kernel variant that matches the layout of A.
1084  if (A.row_major())
1085  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
1086  matrix_row_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1087  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1088  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1089  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1090  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1091 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
1092  detail::cuda_arg<value_type>(proxy.lhs()),
1093  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1094  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1095  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1096  );
1097  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
1098  }
1099  else
1100  {
 // Column-major variant: same argument order as the row-major launch above.
1101  matrix_col_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1102  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1103  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1104  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1105  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1106 
1107  detail::cuda_arg<value_type>(proxy.lhs()),
1108  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1109  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1110  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1111  );
1112  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
1113  }
1114 }
1115 
1116 
1117 // log
// Host-side launcher for the element-wise 'log' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 1118 to 1121); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_log_kernel or matrix_col_element_log_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes log(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
1118 template<typename NumericT>
1121 {
 // Mixed row-/column-major operands are rejected up front.
1122  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1123 
1124  typedef NumericT value_type;
1125 
 // Pick the kernel variant that matches the layout of A.
1126  if (A.row_major())
1127  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
1128  matrix_row_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1129  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1130  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1131  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1132  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1133 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
1134  detail::cuda_arg<value_type>(proxy.lhs()),
1135  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1136  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1137  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1138  );
1139  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
1140  }
1141  else
1142  {
 // Column-major variant: same argument order as the row-major launch above.
1143  matrix_col_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1144  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1145  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1146  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1147  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1148 
1149  detail::cuda_arg<value_type>(proxy.lhs()),
1150  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1151  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1152  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1153  );
1154  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
1155  }
1156 }
1157 
1158 
1159 // log10
// Host-side launcher for the element-wise 'log10' operation on a dense matrix.
// NOTE(review): the signature line(s) between the template header and the opening brace
// are missing from this listing (embedded numbering jumps from 1160 to 1163); the body
// only shows two names in use, `A` and `proxy` -- restore the signature from the original header.
//
// Visible behaviour:
//  - asserts that A, proxy.lhs() and proxy.rhs() all report the same row_major() layout;
//  - launches matrix_row_element_log10_kernel or matrix_col_element_log10_kernel, selected by
//    A.row_major(), with a fixed <<<128, 128>>> launch configuration;
//  - runs VIENNACL_CUDA_LAST_ERROR_CHECK tagged with the name of the launched kernel.
// Only proxy.lhs() is passed to the kernel; proxy.rhs() appears solely in the layout assertion.
// Presumably the kernel writes log10(x) of every proxy.lhs() entry x into A (kernel body is
// outside this listing) -- confirm.
1160 template<typename NumericT>
1163 {
 // Mixed row-/column-major operands are rejected up front.
1164  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1165 
1166  typedef NumericT value_type;
1167 
 // Pick the kernel variant that matches the layout of A.
1168  if (A.row_major())
1169  {
 // Destination A: cuda_arg handle followed by its start, stride, size and internal_size pairs.
1170  matrix_row_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1171  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1172  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1173  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1174  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1175 
 // Source proxy.lhs(): cuda_arg handle plus start, stride and internal_size pairs (no size pair is passed).
1176  detail::cuda_arg<value_type>(proxy.lhs()),
1177  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1178  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1179  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1180  );
1181  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
1182  }
1183  else
1184  {
 // Column-major variant: same argument order as the row-major launch above.
1185  matrix_col_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1186  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1187  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1188  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1189  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1190 
1191  detail::cuda_arg<value_type>(proxy.lhs()),
1192  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1193  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1194  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1195  );
1196  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
1197  }
1198 }
1199 
1200 
1201 // sin
 // Host-side launcher for the element-wise sin kernels.
 // NOTE(review): the function declarator (name and the parameter list introducing
 // `A` and `proxy`) is not present in this listing between the template header and
 // the opening brace -- confirm against the original header.
 // Presumably computes A = sin(proxy.lhs()) entry by entry; the kernel bodies are
 // defined elsewhere, so the exact semantics cannot be confirmed from here.
1202 template<typename NumericT>
1205 {
 // assert-only check: both operands of the expression must use the same storage layout as A.
1206  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1207 
1208  typedef NumericT value_type;
1209 
 // Pick the kernel by the layout of A. Both launches use a fixed configuration of
 // 128 blocks with 128 threads each, independent of the matrix dimensions.
1210  if (A.row_major())
1211  {
 // Arguments for A: buffer, start offsets, strides, logical sizes, internal sizes.
1212  matrix_row_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1213  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1214  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1215  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1216  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1217 
 // Arguments for proxy.lhs(): buffer, start offsets, strides, internal sizes (no logical sizes are passed).
1218  detail::cuda_arg<value_type>(proxy.lhs()),
1219  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1220  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1221  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1222  );
 // Macro defined elsewhere; the string identifies the kernel that was just launched.
1223  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
1224  }
1225  else
1226  {
 // Same argument list as the branch above, handed to the matrix_col_* kernel.
1227  matrix_col_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1228  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1229  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1230  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1231  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1232 
1233  detail::cuda_arg<value_type>(proxy.lhs()),
1234  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1235  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1236  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1237  );
1238  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
1239  }
1240 }
1241 
1242 
1243 // sinh
 // Host-side launcher for the element-wise sinh kernels.
 // NOTE(review): the function declarator (name and the parameter list introducing
 // `A` and `proxy`) is not present in this listing between the template header and
 // the opening brace -- confirm against the original header.
 // Presumably computes A = sinh(proxy.lhs()) entry by entry; the kernel bodies are
 // defined elsewhere, so the exact semantics cannot be confirmed from here.
1244 template<typename NumericT>
1247 {
 // assert-only check: both operands of the expression must use the same storage layout as A.
1248  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1249 
1250  typedef NumericT value_type;
1251 
 // Pick the kernel by the layout of A. Both launches use a fixed configuration of
 // 128 blocks with 128 threads each, independent of the matrix dimensions.
1252  if (A.row_major())
1253  {
 // Arguments for A: buffer, start offsets, strides, logical sizes, internal sizes.
1254  matrix_row_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1255  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1256  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1257  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1258  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1259 
 // Arguments for proxy.lhs(): buffer, start offsets, strides, internal sizes (no logical sizes are passed).
1260  detail::cuda_arg<value_type>(proxy.lhs()),
1261  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1262  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1263  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1264  );
 // Macro defined elsewhere; the string identifies the kernel that was just launched.
1265  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
1266  }
1267  else
1268  {
 // Same argument list as the branch above, handed to the matrix_col_* kernel.
1269  matrix_col_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1270  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1271  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1272  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1273  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1274 
1275  detail::cuda_arg<value_type>(proxy.lhs()),
1276  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1277  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1278  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1279  );
1280  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
1281  }
1282 }
1283 
1284 
1285 // sqrt
 // Host-side launcher for the element-wise sqrt kernels.
 // NOTE(review): the function declarator (name and the parameter list introducing
 // `A` and `proxy`) is not present in this listing between the template header and
 // the opening brace -- confirm against the original header.
 // Presumably computes A = sqrt(proxy.lhs()) entry by entry; the kernel bodies are
 // defined elsewhere, so the exact semantics cannot be confirmed from here.
1286 template<typename NumericT>
1289 {
 // assert-only check: both operands of the expression must use the same storage layout as A.
1290  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1291 
1292  typedef NumericT value_type;
1293 
 // Pick the kernel by the layout of A. Both launches use a fixed configuration of
 // 128 blocks with 128 threads each, independent of the matrix dimensions.
1294  if (A.row_major())
1295  {
 // Arguments for A: buffer, start offsets, strides, logical sizes, internal sizes.
1296  matrix_row_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1297  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1298  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1299  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1300  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1301 
 // Arguments for proxy.lhs(): buffer, start offsets, strides, internal sizes (no logical sizes are passed).
1302  detail::cuda_arg<value_type>(proxy.lhs()),
1303  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1304  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1305  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1306  );
 // Macro defined elsewhere; the string identifies the kernel that was just launched.
1307  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sqrt_kernel");
1308  }
1309  else
1310  {
 // Same argument list as the branch above, handed to the matrix_col_* kernel.
1311  matrix_col_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1312  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1313  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1314  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1315  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1316 
1317  detail::cuda_arg<value_type>(proxy.lhs()),
1318  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1319  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1320  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1321  );
1322  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sqrt_kernel");
1323  }
1324 }
1325 
1326 
1327 // tan
 // Host-side launcher for the element-wise tan kernels.
 // NOTE(review): the function declarator (name and the parameter list introducing
 // `A` and `proxy`) is not present in this listing between the template header and
 // the opening brace -- confirm against the original header.
 // Presumably computes A = tan(proxy.lhs()) entry by entry; the kernel bodies are
 // defined elsewhere, so the exact semantics cannot be confirmed from here.
1328 template<typename NumericT>
1331 {
 // assert-only check: both operands of the expression must use the same storage layout as A.
1332  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1333 
1334  typedef NumericT value_type;
1335 
 // Pick the kernel by the layout of A. Both launches use a fixed configuration of
 // 128 blocks with 128 threads each, independent of the matrix dimensions.
1336  if (A.row_major())
1337  {
 // Arguments for A: buffer, start offsets, strides, logical sizes, internal sizes.
1338  matrix_row_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1339  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1340  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1341  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1342  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1343 
 // Arguments for proxy.lhs(): buffer, start offsets, strides, internal sizes (no logical sizes are passed).
1344  detail::cuda_arg<value_type>(proxy.lhs()),
1345  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1346  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1347  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1348  );
 // Macro defined elsewhere; the string identifies the kernel that was just launched.
1349  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tan_kernel");
1350  }
1351  else
1352  {
 // Same argument list as the branch above, handed to the matrix_col_* kernel.
1353  matrix_col_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1354  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1355  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1356  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1357  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1358 
1359  detail::cuda_arg<value_type>(proxy.lhs()),
1360  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1361  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1362  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1363  );
1364  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tan_kernel");
1365  }
1366 }
1367 
1368 
1369 // tanh
 // Host-side launcher for the element-wise tanh kernels.
 // NOTE(review): the function declarator (name and the parameter list introducing
 // `A` and `proxy`) is not present in this listing between the template header and
 // the opening brace -- confirm against the original header.
 // Presumably computes A = tanh(proxy.lhs()) entry by entry; the kernel bodies are
 // defined elsewhere, so the exact semantics cannot be confirmed from here.
1370 template<typename NumericT>
1373 {
 // assert-only check: both operands of the expression must use the same storage layout as A.
1374  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1375 
1376  typedef NumericT value_type;
1377 
 // Pick the kernel by the layout of A. Both launches use a fixed configuration of
 // 128 blocks with 128 threads each, independent of the matrix dimensions.
1378  if (A.row_major())
1379  {
 // Arguments for A: buffer, start offsets, strides, logical sizes, internal sizes.
1380  matrix_row_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1381  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1382  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1383  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1384  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1385 
 // Arguments for proxy.lhs(): buffer, start offsets, strides, internal sizes (no logical sizes are passed).
1386  detail::cuda_arg<value_type>(proxy.lhs()),
1387  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1388  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1389  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1390  );
 // Macro defined elsewhere; the string identifies the kernel that was just launched.
1391  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tanh_kernel");
1392  }
1393  else
1394  {
 // Same argument list as the branch above, handed to the matrix_col_* kernel.
1395  matrix_col_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1396  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1397  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1398  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1399  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1400 
1401  detail::cuda_arg<value_type>(proxy.lhs()),
1402  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1403  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1404  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1405  );
1406  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tanh_kernel");
1407  }
1408 }
1409 
1410 
1411 //
1413 //
1414 
1415 // A * x
1416 
/** @brief Launches the CUDA matrix-vector product kernel that matches the storage layout of mat and the requested transposition.
*
* Kernel selection:
*  - mat.row_major() and !mat_transpose : vec_mul_row_kernel
*  - mat.row_major() and  mat_transpose : trans_vec_mul_row_kernel
*  - otherwise       and !mat_transpose : vec_mul_col_kernel
*  - otherwise       and  mat_transpose : trans_vec_mul_col_kernel
*
* Each launch uses a fixed configuration of 128 blocks with 128 threads per block and is
* followed by VIENNACL_CUDA_LAST_ERROR_CHECK carrying the name of the launched kernel.
*
* @param mat            Matrix operand
* @param mat_transpose  Selects the trans_* kernel variants when true (presumably: multiply with the transpose of mat)
* @param vec            Vector operand; its memory handle must differ from the one of result (asserted)
* @param result         Vector handed to the kernel as the last buffer (presumably receives the product)
*/
template<typename NumericT>
void prod_impl(const matrix_base<NumericT> & mat, bool mat_transpose,
               const vector_base<NumericT> & vec,
                     vector_base<NumericT> & result)
{
  assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!"));

  // Matrix geometry, narrowed once to the unsigned int type handed to the kernels.
  const unsigned int mat_start1    = static_cast<unsigned int>(viennacl::traits::start1(mat));
  const unsigned int mat_start2    = static_cast<unsigned int>(viennacl::traits::start2(mat));
  const unsigned int mat_stride1   = static_cast<unsigned int>(viennacl::traits::stride1(mat));
  const unsigned int mat_stride2   = static_cast<unsigned int>(viennacl::traits::stride2(mat));
  const unsigned int mat_size1     = static_cast<unsigned int>(viennacl::traits::size1(mat));
  const unsigned int mat_size2     = static_cast<unsigned int>(viennacl::traits::size2(mat));
  const unsigned int mat_internal1 = static_cast<unsigned int>(viennacl::traits::internal_size1(mat));
  const unsigned int mat_internal2 = static_cast<unsigned int>(viennacl::traits::internal_size2(mat));

  // Geometry of the vector operand.
  const unsigned int vec_start  = static_cast<unsigned int>(viennacl::traits::start(vec));
  const unsigned int vec_stride = static_cast<unsigned int>(viennacl::traits::stride(vec));
  const unsigned int vec_size   = static_cast<unsigned int>(viennacl::traits::size(vec));

  // Geometry of the result vector.
  const unsigned int res_start  = static_cast<unsigned int>(viennacl::traits::start(result));
  const unsigned int res_stride = static_cast<unsigned int>(viennacl::traits::stride(result));
  const unsigned int res_size   = static_cast<unsigned int>(viennacl::traits::size(result));

  const bool is_row_major = mat.row_major();

  if (is_row_major && !mat_transpose)
  {
    vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(mat),
                                     mat_start1, mat_start2,
                                     mat_stride1, mat_stride2,
                                     mat_size1, mat_size2,
                                     mat_internal1, mat_internal2,

                                     detail::cuda_arg<NumericT>(vec),
                                     vec_start, vec_stride, vec_size,

                                     detail::cuda_arg<NumericT>(result),
                                     res_start, res_stride, res_size);
    VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_row_kernel");
  }
  else if (is_row_major)   // row-major storage, transposed product
  {
    trans_vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(mat),
                                           mat_start1, mat_start2,
                                           mat_stride1, mat_stride2,
                                           mat_size1, mat_size2,
                                           mat_internal1, mat_internal2,

                                           detail::cuda_arg<NumericT>(vec),
                                           vec_start, vec_stride, vec_size,

                                           detail::cuda_arg<NumericT>(result),
                                           res_start, res_stride, res_size);
    VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_row_kernel");
  }
  else if (!mat_transpose) // non-row-major storage, plain product
  {
    vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(mat),
                                     mat_start1, mat_start2,
                                     mat_stride1, mat_stride2,
                                     mat_size1, mat_size2,
                                     mat_internal1, mat_internal2,

                                     detail::cuda_arg<NumericT>(vec),
                                     vec_start, vec_stride, vec_size,

                                     detail::cuda_arg<NumericT>(result),
                                     res_start, res_stride, res_size);
    VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_col_kernel");
  }
  else                     // non-row-major storage, transposed product
  {
    trans_vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(mat),
                                           mat_start1, mat_start2,
                                           mat_stride1, mat_stride2,
                                           mat_size1, mat_size2,
                                           mat_internal1, mat_internal2,

                                           detail::cuda_arg<NumericT>(vec),
                                           vec_start, vec_stride, vec_size,

                                           detail::cuda_arg<NumericT>(result),
                                           res_start, res_stride, res_size);
    VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_col_kernel");
  }
}
1522 
1523 
1524 //
1526 //
1527 
1528 namespace detail
1529 {
1530  // C = A * B and possibly transposed variants
1531  template<typename MatrixT1, typename MatrixT2, typename MatrixT3, typename ScalarT>
1532  void prod_slow_kernel(const MatrixT1 & A, bool transposed_A,
1533  const MatrixT2 & B, bool transposed_B,
1534  MatrixT3 & C,
1535  ScalarT alpha,
1536  ScalarT beta)
1537  {
1539 
1540  cpu_value_type converted_alpha = static_cast<cpu_value_type>(alpha);
1541  cpu_value_type converted_beta = static_cast<cpu_value_type>(beta);
1542 
1543  dim3 threads(16, 16);
1544  dim3 grid( (viennacl::traits::size1(C) - 1) / 16 + 1,
1545  (viennacl::traits::size2(C) - 1) / 16 + 1);
1546 
1547  bool row_major_A = A.row_major();
1548  bool row_major_B = B.row_major();
1549  bool row_major_C = C.row_major();
1550 
1551 
1552  if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1553  {
1554  matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>
1555  (converted_alpha,
1556  detail::cuda_arg<cpu_value_type>(A),
1557  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1558  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1559  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1560  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1561 
1562  detail::cuda_arg<cpu_value_type>(B),
1563  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1564  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1565  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1566  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1567 
1568  converted_beta,
1569  detail::cuda_arg<cpu_value_type>(C),
1570  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1571  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1572  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1573  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1574  }
1575  else if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1576  {
1577  matrix_matrix_col_col_col_prod_AT_kernel<<<grid, threads>>>
1578  (converted_alpha,
1579  detail::cuda_arg<cpu_value_type>(A),
1580  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1581  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1582  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1583  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1584 
1585  detail::cuda_arg<cpu_value_type>(B),
1586  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1587  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1588  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1589  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1590 
1591  converted_beta,
1592  detail::cuda_arg<cpu_value_type>(C),
1593  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1594  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1595  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1596  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1597  }
1598  else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1599  {
1600  matrix_matrix_col_col_col_prod_TA_kernel<<<grid, threads>>>
1601  (converted_alpha,
1602  detail::cuda_arg<cpu_value_type>(A),
1603  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1604  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1605  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1606  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1607 
1608  detail::cuda_arg<cpu_value_type>(B),
1609  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1610  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1611  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1612  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1613 
1614  converted_beta,
1615  detail::cuda_arg<cpu_value_type>(C),
1616  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1617  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1618  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1619  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1620  }
1621  else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1622  {
1623  matrix_matrix_col_col_col_prod_TT_kernel<<<grid, threads>>>
1624  (converted_alpha,
1625  detail::cuda_arg<cpu_value_type>(A),
1626  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1627  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1628  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1629  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1630 
1631  detail::cuda_arg<cpu_value_type>(B),
1632  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1633  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1634  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1635  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1636 
1637  converted_beta,
1638  detail::cuda_arg<cpu_value_type>(C),
1639  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1640  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1641  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1642  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1643  }
1645 
1646  else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
1647  {
1648  matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>>
1649  (converted_alpha,
1650  detail::cuda_arg<cpu_value_type>(A),
1651  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1652  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1653  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1654  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1655 
1656  detail::cuda_arg<cpu_value_type>(B),
1657  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1658  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1659  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1660  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1661 
1662  converted_beta,
1663  detail::cuda_arg<cpu_value_type>(C),
1664  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1665  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1666  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1667  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1668  }
1669  else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
1670  {
1671  matrix_matrix_col_col_row_prod_AT_kernel<<<grid, threads>>>
1672  (converted_alpha,
1673  detail::cuda_arg<cpu_value_type>(A),
1674  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1675  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1676  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1677  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1678 
1679  detail::cuda_arg<cpu_value_type>(B),
1680  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1681  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1682  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1683  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1684 
1685  converted_beta,
1686  detail::cuda_arg<cpu_value_type>(C),
1687  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1688  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1689  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1690  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1691  }
1692  else if (!row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
1693  {
1694  matrix_matrix_col_col_row_prod_TA_kernel<<<grid, threads>>>
1695  (converted_alpha,
1696  detail::cuda_arg<cpu_value_type>(A),
1697  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1698  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1699  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1700  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1701 
1702  detail::cuda_arg<cpu_value_type>(B),
1703  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1704  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1705  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1706  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1707 
1708  converted_beta,
1709  detail::cuda_arg<cpu_value_type>(C),
1710  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1711  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1712  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1713  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1714  }
1715  else if (!row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
1716  {
1717  matrix_matrix_col_col_row_prod_TT_kernel<<<grid, threads>>>
1718  (converted_alpha,
1719  detail::cuda_arg<cpu_value_type>(A),
1720  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1721  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1722  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1723  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1724 
1725  detail::cuda_arg<cpu_value_type>(B),
1726  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1727  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1728  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1729  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1730 
1731  converted_beta,
1732  detail::cuda_arg<cpu_value_type>(C),
1733  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1734  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1735  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1736  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1737  }
1739 
1740  else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
1741  {
1742  matrix_matrix_col_row_col_prod_AA_kernel<<<grid, threads>>>
1743  (converted_alpha,
1744  detail::cuda_arg<cpu_value_type>(A),
1745  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1746  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1747  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1748  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1749 
1750  detail::cuda_arg<cpu_value_type>(B),
1751  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1752  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1753  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1754  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1755 
1756  converted_beta,
1757  detail::cuda_arg<cpu_value_type>(C),
1758  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1759  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1760  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1761  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1762  }
1763  else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
1764  {
1765  matrix_matrix_col_row_col_prod_AT_kernel<<<grid, threads>>>
1766  (converted_alpha,
1767  detail::cuda_arg<cpu_value_type>(A),
1768  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1769  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1770  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1771  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1772 
1773  detail::cuda_arg<cpu_value_type>(B),
1774  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1775  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1776  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1777  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1778 
1779  converted_beta,
1780  detail::cuda_arg<cpu_value_type>(C),
1781  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1782  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1783  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1784  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1785  }
1786  else if (!row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
1787  {
1788  matrix_matrix_col_row_col_prod_TA_kernel<<<grid, threads>>>
1789  (converted_alpha,
1790  detail::cuda_arg<cpu_value_type>(A),
1791  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1792  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1793  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1794  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1795 
1796  detail::cuda_arg<cpu_value_type>(B),
1797  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1798  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1799  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1800  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1801 
1802  converted_beta,
1803  detail::cuda_arg<cpu_value_type>(C),
1804  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1805  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1806  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1807  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1808  }
1809  else if (!row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
1810  {
1811  matrix_matrix_col_row_col_prod_TT_kernel<<<grid, threads>>>
1812  (converted_alpha,
1813  detail::cuda_arg<cpu_value_type>(A),
1814  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1815  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1816  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1817  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1818 
1819  detail::cuda_arg<cpu_value_type>(B),
1820  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1821  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1822  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1823  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1824 
1825  converted_beta,
1826  detail::cuda_arg<cpu_value_type>(C),
1827  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1828  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1829  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1830  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1831  }
1833 
1834  else if (!row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
1835  {
1836  matrix_matrix_col_row_row_prod_AA_kernel<<<grid, threads>>>
1837  (converted_alpha,
1838  detail::cuda_arg<cpu_value_type>(A),
1839  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1840  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1841  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1842  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1843 
1844  detail::cuda_arg<cpu_value_type>(B),
1845  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1846  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1847  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1848  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1849 
1850  converted_beta,
1851  detail::cuda_arg<cpu_value_type>(C),
1852  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1853  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1854  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1855  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1856  }
1857  else if (!row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
1858  {
1859  matrix_matrix_col_row_row_prod_AT_kernel<<<grid, threads>>>
1860  (converted_alpha,
1861  detail::cuda_arg<cpu_value_type>(A),
1862  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1863  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1864  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1865  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1866 
1867  detail::cuda_arg<cpu_value_type>(B),
1868  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1869  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1870  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1871  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1872 
1873  converted_beta,
1874  detail::cuda_arg<cpu_value_type>(C),
1875  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1876  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1877  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1878  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1879  }
1880  else if (!row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
1881  {
1882  matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>>
1883  (converted_alpha,
1884  detail::cuda_arg<cpu_value_type>(A),
1885  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1886  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1887  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1888  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1889 
1890  detail::cuda_arg<cpu_value_type>(B),
1891  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1892  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1893  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1894  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1895 
1896  converted_beta,
1897  detail::cuda_arg<cpu_value_type>(C),
1898  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1899  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1900  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1901  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1902  }
1903  else if (!row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
1904  {
1905  matrix_matrix_col_row_row_prod_TT_kernel<<<grid, threads>>>
1906  (converted_alpha,
1907  detail::cuda_arg<cpu_value_type>(A),
1908  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1909  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1910  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1911  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1912 
1913  detail::cuda_arg<cpu_value_type>(B),
1914  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1915  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1916  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1917  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1918 
1919  converted_beta,
1920  detail::cuda_arg<cpu_value_type>(C),
1921  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1922  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1923  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1924  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1925  }
1927 
1928  else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1929  {
1930  matrix_matrix_row_col_col_prod_AA_kernel<<<grid, threads>>>
1931  (converted_alpha,
1932  detail::cuda_arg<cpu_value_type>(A),
1933  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1934  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1935  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1936  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1937 
1938  detail::cuda_arg<cpu_value_type>(B),
1939  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1940  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1941  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1942  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1943 
1944  converted_beta,
1945  detail::cuda_arg<cpu_value_type>(C),
1946  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1947  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1948  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1949  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1950  }
1951  else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1952  {
1953  matrix_matrix_row_col_col_prod_AT_kernel<<<grid, threads>>>
1954  (converted_alpha,
1955  detail::cuda_arg<cpu_value_type>(A),
1956  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1957  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1958  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1959  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1960 
1961  detail::cuda_arg<cpu_value_type>(B),
1962  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1963  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1964  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1965  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1966 
1967  converted_beta,
1968  detail::cuda_arg<cpu_value_type>(C),
1969  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1970  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1971  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1972  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1973  }
1974  else if (row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1975  {
1976  matrix_matrix_row_col_col_prod_TA_kernel<<<grid, threads>>>
1977  (converted_alpha,
1978  detail::cuda_arg<cpu_value_type>(A),
1979  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1980  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1981  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1982  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1983 
1984  detail::cuda_arg<cpu_value_type>(B),
1985  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1986  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1987  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1988  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1989 
1990  converted_beta,
1991  detail::cuda_arg<cpu_value_type>(C),
1992  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1993  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1994  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1995  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1996  }
1997  else if (row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1998  {
1999  matrix_matrix_row_col_col_prod_TT_kernel<<<grid, threads>>>
2000  (converted_alpha,
2001  detail::cuda_arg<cpu_value_type>(A),
2002  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2003  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2004  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2005  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2006 
2007  detail::cuda_arg<cpu_value_type>(B),
2008  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2009  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2010  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2011  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2012 
2013  converted_beta,
2014  detail::cuda_arg<cpu_value_type>(C),
2015  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2016  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2017  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2018  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2019  }
2021 
2022  else if (row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
2023  {
2024  matrix_matrix_row_col_row_prod_AA_kernel<<<grid, threads>>>
2025  (converted_alpha,
2026  detail::cuda_arg<cpu_value_type>(A),
2027  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2028  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2029  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2030  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2031 
2032  detail::cuda_arg<cpu_value_type>(B),
2033  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2034  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2035  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2036  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2037 
2038  converted_beta,
2039  detail::cuda_arg<cpu_value_type>(C),
2040  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2041  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2042  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2043  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2044  }
2045  else if (row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
2046  {
2047  matrix_matrix_row_col_row_prod_AT_kernel<<<grid, threads>>>
2048  (converted_alpha,
2049  detail::cuda_arg<cpu_value_type>(A),
2050  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2051  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2052  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2053  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2054 
2055  detail::cuda_arg<cpu_value_type>(B),
2056  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2057  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2058  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2059  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2060 
2061  converted_beta,
2062  detail::cuda_arg<cpu_value_type>(C),
2063  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2064  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2065  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2066  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2067  }
2068  else if (row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
2069  {
2070  matrix_matrix_row_col_row_prod_TA_kernel<<<grid, threads>>>
2071  (converted_alpha,
2072  detail::cuda_arg<cpu_value_type>(A),
2073  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2074  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2075  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2076  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2077 
2078  detail::cuda_arg<cpu_value_type>(B),
2079  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2080  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2081  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2082  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2083 
2084  converted_beta,
2085  detail::cuda_arg<cpu_value_type>(C),
2086  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2087  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2088  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2089  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2090  }
2091  else if (row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
2092  {
2093  matrix_matrix_row_col_row_prod_TT_kernel<<<grid, threads>>>
2094  (converted_alpha,
2095  detail::cuda_arg<cpu_value_type>(A),
2096  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2097  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2098  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2099  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2100 
2101  detail::cuda_arg<cpu_value_type>(B),
2102  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2103  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2104  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2105  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2106 
2107  converted_beta,
2108  detail::cuda_arg<cpu_value_type>(C),
2109  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2110  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2111  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2112  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2113  }
2115 
2116  else if (row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
2117  {
2118  matrix_matrix_row_row_col_prod_AA_kernel<<<grid, threads>>>
2119  (converted_alpha,
2120  detail::cuda_arg<cpu_value_type>(A),
2121  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2122  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2123  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2124  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2125 
2126  detail::cuda_arg<cpu_value_type>(B),
2127  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2128  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2129  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2130  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2131 
2132  converted_beta,
2133  detail::cuda_arg<cpu_value_type>(C),
2134  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2135  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2136  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2137  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2138  }
2139  else if (row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
2140  {
2141  matrix_matrix_row_row_col_prod_AT_kernel<<<grid, threads>>>
2142  (converted_alpha,
2143  detail::cuda_arg<cpu_value_type>(A),
2144  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2145  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2146  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2147  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2148 
2149  detail::cuda_arg<cpu_value_type>(B),
2150  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2151  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2152  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2153  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2154 
2155  converted_beta,
2156  detail::cuda_arg<cpu_value_type>(C),
2157  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2158  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2159  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2160  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2161  }
2162  else if (row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
2163  {
2164  matrix_matrix_row_row_col_prod_TA_kernel<<<grid, threads>>>
2165  (converted_alpha,
2166  detail::cuda_arg<cpu_value_type>(A),
2167  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2168  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2169  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2170  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2171 
2172  detail::cuda_arg<cpu_value_type>(B),
2173  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2174  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2175  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2176  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2177 
2178  converted_beta,
2179  detail::cuda_arg<cpu_value_type>(C),
2180  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2181  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2182  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2183  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2184  }
2185  else if (row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
2186  {
2187  matrix_matrix_row_row_col_prod_TT_kernel<<<grid, threads>>>
2188  (converted_alpha,
2189  detail::cuda_arg<cpu_value_type>(A),
2190  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2191  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2192  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2193  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2194 
2195  detail::cuda_arg<cpu_value_type>(B),
2196  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2197  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2198  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2199  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2200 
2201  converted_beta,
2202  detail::cuda_arg<cpu_value_type>(C),
2203  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2204  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2205  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2206  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2207  }
2208 
2209 
2211 
2212  else if (row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
2213  {
2214  matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>>
2215  (converted_alpha,
2216  detail::cuda_arg<cpu_value_type>(A),
2217  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2218  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2219  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2220  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2221 
2222  detail::cuda_arg<cpu_value_type>(B),
2223  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2224  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2225  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2226  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2227 
2228  converted_beta,
2229  detail::cuda_arg<cpu_value_type>(C),
2230  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2231  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2232  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2233  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2234  }
2235  else if (row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
2236  {
2237  matrix_matrix_row_row_row_prod_AT_kernel<<<grid, threads>>>
2238  (converted_alpha,
2239  detail::cuda_arg<cpu_value_type>(A),
2240  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2241  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2242  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2243  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2244 
2245  detail::cuda_arg<cpu_value_type>(B),
2246  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2247  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2248  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2249  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2250 
2251  converted_beta,
2252  detail::cuda_arg<cpu_value_type>(C),
2253  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2254  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2255  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2256  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2257  }
2258  else if (row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
2259  {
2260  matrix_matrix_row_row_row_prod_TA_kernel<<<grid, threads>>>
2261  (converted_alpha,
2262  detail::cuda_arg<cpu_value_type>(A),
2263  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2264  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2265  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2266  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2267 
2268  detail::cuda_arg<cpu_value_type>(B),
2269  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2270  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2271  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2272  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2273 
2274  converted_beta,
2275  detail::cuda_arg<cpu_value_type>(C),
2276  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2277  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2278  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2279  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2280  }
2281  else if (row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
2282  {
2283  matrix_matrix_row_row_row_prod_TT_kernel<<<grid, threads>>>
2284  (converted_alpha,
2285  detail::cuda_arg<cpu_value_type>(A),
2286  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2287  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2288  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2289  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2290 
2291  detail::cuda_arg<cpu_value_type>(B),
2292  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2293  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2294  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2295  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2296 
2297  converted_beta,
2298  detail::cuda_arg<cpu_value_type>(C),
2299  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2300  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2301  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2302  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2303  }
2304 
2305  }
2306 
2307  // C = A * B, using fast kernel
// Placeholder for a fast matrix-matrix product path. As written, the function
// converts alpha and beta to cpu_value_type and then unconditionally throws the
// string literal "not implemented yet" (a const char*, not a std::exception).
//
// A, B, C     : operand and result matrices; only referenced in the disabled code below.
// alpha, beta : scaling factors, converted to cpu_value_type.
// kernel_name : not used by any live statement in this function.
//
// NOTE(review): cpu_value_type is not declared in the visible lines; the listing
// jumps from 2315 to 2317, so the typedef presumably sat on the dropped line - confirm.
2308  template<typename MatrixT1, typename MatrixT2, typename MatrixT3, typename ScalarT>
2309  void prod_fast_kernel(const MatrixT1 & A,
2310  const MatrixT2 & B,
2311  MatrixT3 & C,
2312  ScalarT alpha,
2313  ScalarT beta,
2314  std::string kernel_name)
2315  {
2317 
// cl_alpha / cl_beta are only consumed by the commented-out OpenCL enqueue call below.
2318  cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
2319  cpu_value_type cl_beta = static_cast<cpu_value_type>(beta);
2320 
// Disabled OpenCL-style launch, kept for reference; it has no effect on the CUDA build.
2321  /*viennacl::ocl::enqueue(k(cl_alpha,
2322  viennacl::traits::opencl_handle(A),
2323  cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
2324  cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
2325  cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
2326  cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
2327 
2328  viennacl::traits::opencl_handle(B),
2329  cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)),
2330  cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)),
2331  cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)),
2332  cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B)),
2333 
2334  cl_beta,
2335  viennacl::traits::opencl_handle(C),
2336  cl_uint(viennacl::traits::start1(C)), cl_uint(viennacl::traits::start2(C)),
2337  cl_uint(viennacl::traits::stride1(C)), cl_uint(viennacl::traits::stride2(C)),
2338  cl_uint(viennacl::traits::size1(C)), cl_uint(viennacl::traits::size2(C)),
2339  cl_uint(viennacl::traits::internal_size1(C)), cl_uint(viennacl::traits::internal_size2(C))
2340  )
2341  );*/
2342 
// Always reached: callers must not rely on this function until it is implemented.
2343  throw "not implemented yet";
2344  }
2345 
/** @brief Selects the kernel family for a dense matrix-matrix product and runs it.
 *
 * All operands, the two transposition flags and both scaling factors are handed
 * unchanged to prod_slow_kernel().
 *
 * Historically this routine distinguished three cases by operand size:
 *   - any of size1(A), size2(A), size1(B) below 64: generic kernel (launch overhead dominates),
 *   - all three sizes multiples of 64: dedicated fast kernel (disabled, never implemented),
 *   - everything else: generic kernel (TODO: use four kernels).
 * Since every live case ends in the same call, the dispatch reduces to that single call.
 *
 * @param A             Left factor
 * @param transposed_A  Whether A enters the product transposed
 * @param B             Right factor
 * @param transposed_B  Whether B enters the product transposed
 * @param C             Result matrix
 * @param alpha         Scaling factor, forwarded as-is
 * @param beta          Scaling factor, forwarded as-is
 */
template<typename MatrixT1, typename MatrixT2, typename MatrixT3, typename ScalarT>
void prod(const MatrixT1 & A, bool transposed_A,
          const MatrixT2 & B, bool transposed_B,
          MatrixT3 & C,
          ScalarT alpha,
          ScalarT beta)
{
  // Single code path for all problem sizes; see the note above regarding the
  // disabled prod_fast_kernel() branch.
  prod_slow_kernel(A, transposed_A,
                   B, transposed_B,
                   C, alpha, beta);
}
2376 } // namespace detail
2377 
2378 
// Public entry point for the dense matrix-matrix product on this backend: forwards
// all arguments unchanged to detail::prod().
//
// A, trans_A : left factor and its transposition flag
// B, trans_B : right factor and its transposition flag
// alpha, beta: scaling factors, forwarded as-is
//
// NOTE(review): C is forwarded below but its parameter declaration is not among the
// visible lines (the listing jumps from 2386 to 2388) - presumably a non-const
// matrix reference on the dropped line; confirm against the original header.
2384 template<typename NumericT, typename ScalarT>
2385 void prod_impl(const matrix_base<NumericT> & A, bool trans_A,
2386  const matrix_base<NumericT> & B, bool trans_B,
2388  ScalarT alpha,
2389  ScalarT beta)
2390 {
2391  detail::prod(A, trans_A,
2392  B, trans_B,
2393  C, alpha, beta);
2394 }
2395 
2396 
2397 
2398 
2399 //
2401 //
2402 
2403 
// Scaled rank-1 update of mat1 using vec1 and vec2 (the assert messages name the
// operation "scaled_rank_1_update"). Chooses the row-major or column-major kernel
// from mat1.row_major() and checks each launch with VIENNACL_CUDA_LAST_ERROR_CHECK.
//
// alpha                              : scaling factor
// len_alpha, reciprocal_alpha,
// flip_sign_alpha                    : packed into one options word via detail::make_options()
// vec1                               : must have size1(mat1) entries (asserted)
// vec2                               : must have size2(mat1) entries (asserted)
//
// NOTE(review): two listing lines are missing from this block - 2417 (presumably the
// function name and the mat1 parameter) and 2430 (immediately before the assignment
// to temporary_alpha, presumably a condition guarding it). Confirm against the
// original header before relying on the text below.
2416 template<typename NumericT, typename ScalarT>
2418  ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
2419  const vector_base<NumericT> & vec1,
2420  const vector_base<NumericT> & vec2)
2421 {
// Dimension checks are active in debug builds only (assert).
2422  assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
2423  assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
2424 
2425  typedef NumericT value_type;
2426 
2427  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
2428 
// temporary_alpha is offered to detail::arg_reference() together with alpha below.
2429  value_type temporary_alpha = 0;
2431  temporary_alpha = alpha;
2432 
2433  if (mat1.row_major())
2434  {
// Fixed launch configuration: 128 blocks of 128 threads.
2435  scaled_rank1_update_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
2436  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
2437  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
2438  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
2439  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
2440 
2441  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
2442  options_alpha,
2443 
2444  detail::cuda_arg<value_type>(vec1),
2445  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2446  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2447  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2448 
2449  detail::cuda_arg<value_type>(vec2),
2450  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2451  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2452  static_cast<unsigned int>(viennacl::traits::size(vec2))
2453  );
2454  VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_row_kernel");
2455  }
2456  else
2457  {
// Column-major layout: same argument list, different kernel.
2458  scaled_rank1_update_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
2459  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
2460  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
2461  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
2462  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
2463 
2464  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
2465  options_alpha,
2466 
2467  detail::cuda_arg<value_type>(vec1),
2468  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2469  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2470  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2471 
2472  detail::cuda_arg<value_type>(vec2),
2473  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2474  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2475  static_cast<unsigned int>(viennacl::traits::size(vec2))
2476  );
2477  VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_col_kernel");
2478  }
2479 }
2480 
2481 
// Host-side dispatcher for the bidiag_pack_*_kernel pair: picks the row-major or
// column-major kernel from A.row_major() and launches it with 128 blocks of 128 threads.
//
// dh, sh : vector arguments handed to the kernel as 2nd and 3rd argument
//          (presumably the diagonal and super-diagonal buffers - TODO confirm against the kernels).
//
// NOTE(review): the line carrying the function name and the declaration of A is
// missing from this listing (it jumps from 2489 to 2491).
// NOTE(review): the launches are not followed by VIENNACL_CUDA_LAST_ERROR_CHECK,
// so a failed launch goes unnoticed at this point.
2489 template <typename NumericT, typename VectorType>
2491  VectorType & dh,
2492  VectorType & sh
2493  )
2494 {
2495  if (A.row_major())
2496  {
// Row-major: the last argument is internal_size2(A).
2497  viennacl::linalg::cuda::bidiag_pack_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2498  viennacl::linalg::cuda::detail::cuda_arg<NumericT>(dh),
2499  viennacl::linalg::cuda::detail::cuda_arg<NumericT>(sh),
2500  static_cast<unsigned int>(viennacl::traits::size1(A)),
2501  static_cast<unsigned int>(viennacl::traits::size2(A)),
2502  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2503  }
2504  else
2505  {
// Column-major: the last argument is internal_size1(A) instead.
2506  viennacl::linalg::cuda::bidiag_pack_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2507  viennacl::linalg::cuda::detail::cuda_arg<NumericT>(dh),
2508  viennacl::linalg::cuda::detail::cuda_arg<NumericT>(sh),
2509  static_cast<unsigned int>(viennacl::traits::size1(A)),
2510  static_cast<unsigned int>(viennacl::traits::size2(A)),
2511  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2512  }
2513 }
2514 
2515 
2516 
// Host-side dispatcher for the copy_col_* / copy_row_* kernels operating on a
// matrix A and a vector V. The kernel is chosen from copy_col and A.row_major();
// every launch uses 128 blocks of 128 threads.
//
// row_start, col_start : offsets forwarded to the kernel (narrowed to unsigned int)
// copy_col             : true selects the copy_col_* kernels, false the copy_row_* kernels
//
// NOTE(review): the lines carrying the function name and the declarations of A and V
// are missing from this listing (it jumps from 2526 to 2529). Whether data flows from
// A to V or from V to A is decided inside the kernels and cannot be read off here.
// NOTE(review): none of the launches is followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
2526 template <typename NumericT>
2529  vcl_size_t row_start,
2530  vcl_size_t col_start,
2531  bool copy_col
2532 )
2533 {
2534  if(copy_col)
2535  {
// Column variant: the kernels receive size1(A) as length argument.
2536  if (A.row_major())
2537  {
2538  copy_col_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2539  detail::cuda_arg<NumericT>(V),
2540  static_cast<unsigned int>(row_start),
2541  static_cast<unsigned int>(col_start),
2542  static_cast<unsigned int>(viennacl::traits::size1(A)),
2543  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2544  }
2545  else
2546  {
2547  copy_col_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2548  detail::cuda_arg<NumericT>(V),
2549  static_cast<unsigned int>(row_start),
2550  static_cast<unsigned int>(col_start),
2551  static_cast<unsigned int>(viennacl::traits::size1(A)),
2552  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2553  }
2554 
2555 
2556  }
2557  else
2558  {
// Row variant: the kernels receive size2(A) as length argument.
2559  if (A.row_major())
2560  {
2561  copy_row_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2562  detail::cuda_arg<NumericT>(V),
2563  static_cast<unsigned int>(row_start),
2564  static_cast<unsigned int>(col_start),
2565  static_cast<unsigned int>(viennacl::traits::size2(A)),
2566  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2567  }
2568  else
2569  {
2570  copy_row_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2571  detail::cuda_arg<NumericT>(V),
2572  static_cast<unsigned int>(row_start),
2573  static_cast<unsigned int>(col_start),
2574  static_cast<unsigned int>(viennacl::traits::size2(A)),
2575  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2576  }
2577  }
2578 }
2579 
2580 
// Host-side dispatcher for the house_update_A_left_*_kernel pair: picks the
// row-major or column-major kernel from A.row_major() and launches it with
// 128 blocks of 128 threads.
//
// start : offset; the kernels receive (start + 1) as 3rd and start as 4th argument.
//
// NOTE(review): the lines carrying the function name and the declarations of A and D
// are missing from this listing (it jumps from 2587 to 2590).
// NOTE(review): the launches are not followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
2587 template <typename NumericT>
2590  vcl_size_t start)
2591 {
2592  if (A.row_major())
2593  {
// Row-major: the last argument is internal_size2(A).
2594  house_update_A_left_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2595  detail::cuda_arg<NumericT>(D),
2596  static_cast<unsigned int>(start + 1),
2597  static_cast<unsigned int>(start),
2598  static_cast<unsigned int>(viennacl::traits::size1(A)),
2599  static_cast<unsigned int>(viennacl::traits::size2(A)),
2600  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2601 
2602 
2603  }
2604  else
2605  {
// Column-major: the last argument is internal_size1(A) instead.
2606  house_update_A_left_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2607  detail::cuda_arg<NumericT>(D),
2608  static_cast<unsigned int>(start + 1),
2609  static_cast<unsigned int>(start),
2610  static_cast<unsigned int>(viennacl::traits::size1(A)),
2611  static_cast<unsigned int>(viennacl::traits::size2(A)),
2612  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2613 
2614 
2615  }
2616 
2617 }
2618 
2619 
2626 template <typename NumericT>
2629 {
2630  if (A.row_major())
2631  {
2632  house_update_A_right_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2633  detail::cuda_arg<NumericT>(D),
2634  static_cast<unsigned int>(0),
2635  static_cast<unsigned int>(0),
2636  static_cast<unsigned int>(viennacl::traits::size1(A)),
2637  static_cast<unsigned int>(viennacl::traits::size2(A)),
2638  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2639 
2640 
2641  }
2642  else
2643  {
2644  house_update_A_right_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2645  detail::cuda_arg<NumericT>(D),
2646  static_cast<unsigned int>(0),
2647  static_cast<unsigned int>(0),
2648  static_cast<unsigned int>(viennacl::traits::size1(A)),
2649  static_cast<unsigned int>(viennacl::traits::size2(A)),
2650  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2651 
2652  }
2653 
2654 }
2655 
2656 
2663 template <typename NumericT>
2666  vcl_size_t A_size1)
2667 
2668 {
2669  if (Q.row_major())
2670  {
2671  house_update_QL_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(Q),
2672  detail::cuda_arg<NumericT>(D),
2673  static_cast<unsigned int>(A_size1),
2674  static_cast<unsigned int>(viennacl::traits::internal_size2(Q)));
2675  }
2676  else
2677  {
2678  house_update_QL_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(Q),
2679  detail::cuda_arg<NumericT>(D),
2680  static_cast<unsigned int>(A_size1),
2681  static_cast<unsigned int>(viennacl::traits::internal_size1(Q)));
2682  }
2683 }
2684 
// Host-side dispatcher for the givens_next_*_kernel pair: picks the row-major or
// column-major kernel from Q.row_major() and launches it with 128 blocks of 128 threads.
//
// tmp1, tmp2 : vectors handed to the kernel as 2nd and 3rd argument
// l, m       : integer bounds; the kernels receive l and (m - 1), both cast to unsigned int.
//              A negative l, or m < 1, wraps around to a large unsigned value in that cast.
//
// NOTE(review): the line carrying the function name and the declaration of Q is
// missing from this listing (it jumps from 2694 to 2696).
// NOTE(review): the launches are not followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
2694 template<typename NumericT>
2696  vector_base<NumericT>& tmp1,
2697  vector_base<NumericT>& tmp2,
2698  int l,
2699  int m)
2700  {
// Row-major: the leading dimension passed is internal_size2(Q).
2701  if (Q.row_major())
2702  givens_next_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(Q),
2703  detail::cuda_arg<NumericT>(tmp1),
2704  detail::cuda_arg<NumericT>(tmp2),
2705  static_cast<unsigned int>(viennacl::traits::size1(Q)),
2706  static_cast<unsigned int>(viennacl::traits::internal_size2(Q)),
2707  static_cast<unsigned int>(l),
2708  static_cast<unsigned int>(m - 1));
2709 
// Column-major: the leading dimension passed is internal_size1(Q).
2710  else
2711  givens_next_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(Q),
2712  detail::cuda_arg<NumericT>(tmp1),
2713  detail::cuda_arg<NumericT>(tmp2),
2714  static_cast<unsigned int>(viennacl::traits::size1(Q)),
2715  static_cast<unsigned int>(viennacl::traits::internal_size1(Q)),
2716  static_cast<unsigned int>(l),
2717  static_cast<unsigned int>(m - 1));
2718  }
2719 
2720 
2721 #define VIENNACL_SECTION_SIZE 512
2722 
// Multi-pass scan from vec1 into vec2, processed in sections of
// VIENNACL_SECTION_SIZE (512) elements using two auxiliary vectors S and S_ref
// with one entry per section. Visible passes: scan_kernel_2 and scan_kernel_3
// operate on S / S_ref, scan_kernel_4 operates on S and vec2.
//
// NOTE(review): three listing lines are missing from this block - 2730 (function
// name and the vec1 parameter), 2734 (presumably the declaration of S) and 2737
// (the name and launch configuration of the first kernel, whose argument list
// starts at listing line 2738). Confirm against the original header.
// NOTE(review): none of the launches is followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
// For an empty vec1, N is 0; if S is sized by N as well, the grid dimensions below
// become 0, which CUDA rejects as an invalid launch configuration.
2729 template<typename NumericT>
2731  vector_base<NumericT>& vec2)
2732 {
// N = number of sections, i.e. ceil(size(vec1) / VIENNACL_SECTION_SIZE).
2733  vcl_size_t N = static_cast<vcl_size_t>(std::ceil(vec1.size() / static_cast<double>(VIENNACL_SECTION_SIZE)));
2735  viennacl::vector<NumericT> S_ref(N);
2736 
// Arguments of the first pass (launch line missing, see note above): vec1, vec2 and S.
2738  detail::cuda_arg<NumericT>(vec1),
2739  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2740  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2741  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2742 
2743  detail::cuda_arg<NumericT>(vec2),
2744  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2745  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2746 
2747  detail::cuda_arg<NumericT>(S),
2748  static_cast<unsigned int>(viennacl::traits::start(S)),
2749  static_cast<unsigned int>(viennacl::traits::stride(S)));
2750 
// The grid size is a floating-point value from std::ceil that is converted implicitly.
2751  scan_kernel_2<<<std::ceil(S.size()/static_cast<double>(VIENNACL_SECTION_SIZE)), VIENNACL_SECTION_SIZE>>>(
2752  detail::cuda_arg<NumericT>(S_ref),
2753  static_cast<unsigned int>(viennacl::traits::start(S_ref)),
2754  static_cast<unsigned int>(viennacl::traits::stride(S_ref)),
2755 
2756  detail::cuda_arg<NumericT>(S),
2757  static_cast<unsigned int>(viennacl::traits::start(S)),
2758  static_cast<unsigned int>(viennacl::traits::stride(S)),
2759  static_cast<unsigned int>(viennacl::traits::size(S)));
2760 
2761  scan_kernel_3<<<std::ceil(S.size()/static_cast<double>(VIENNACL_SECTION_SIZE)), VIENNACL_SECTION_SIZE>>>(
2762  detail::cuda_arg<NumericT>(S_ref),
2763  static_cast<unsigned int>(viennacl::traits::start(S_ref)),
2764  static_cast<unsigned int>(viennacl::traits::stride(S_ref)),
2765 
2766  detail::cuda_arg<NumericT>(S),
2767  static_cast<unsigned int>(viennacl::traits::start(S)),
2768  static_cast<unsigned int>(viennacl::traits::stride(S)));
2769 
// Final pass: one block per entry of S, writing into vec2.
2770  scan_kernel_4<<<S.size(), VIENNACL_SECTION_SIZE>>>(
2771  detail::cuda_arg<NumericT>(S),
2772  static_cast<unsigned int>(viennacl::traits::start(S)),
2773  static_cast<unsigned int>(viennacl::traits::stride(S)),
2774 
2775  detail::cuda_arg<NumericT>(vec2),
2776  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2777  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2778  static_cast<unsigned int>(viennacl::traits::size(vec2)));
2779 }
2780 
// Multi-pass scan from vec1 into vec2, processed in sections of
// VIENNACL_SECTION_SIZE (512) elements using two auxiliary vectors S and S_ref
// with one entry per section. Visible passes: scan_kernel_2 and scan_kernel_3
// operate on S / S_ref, scan_kernel_4 operates on S and vec2.
//
// NOTE(review): listing lines 2788-2789 (function name and parameter list) and 2794
// (the name and launch configuration of the first kernel, whose argument list starts
// at listing line 2795) are missing from this block. Confirm against the original header.
// NOTE(review): none of the launches is followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
// For an empty vec1 both S and S_ref have size 0, so the grid dimensions below become
// 0, which CUDA rejects as an invalid launch configuration.
2787 template<typename NumericT, typename F>
2790 {
// One entry per section: ceil(size(vec1) / VIENNACL_SECTION_SIZE).
2791  viennacl::vector<NumericT> S (std::ceil(vec1.size() / static_cast<double>(VIENNACL_SECTION_SIZE)));
2792  viennacl::vector<NumericT> S_ref(std::ceil(vec1.size() / static_cast<double>(VIENNACL_SECTION_SIZE)));
2793 
// Arguments of the first pass (launch line missing, see note above): vec1, vec2 and S.
2795  detail::cuda_arg<NumericT>(vec1),
2796  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2797  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2798  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2799 
2800  detail::cuda_arg<NumericT>(vec2),
2801  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2802  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2803 
2804  detail::cuda_arg<NumericT>(S),
2805  static_cast<unsigned int>(viennacl::traits::start(S)),
2806  static_cast<unsigned int>(viennacl::traits::stride(S)));
2807 
// The grid size is computed in single precision here (float), unlike the
// double-precision computation used for sizing S and S_ref above.
2808  scan_kernel_2<<<std::ceil(S.size()/static_cast<float>(VIENNACL_SECTION_SIZE)), VIENNACL_SECTION_SIZE>>>(
2809  detail::cuda_arg<NumericT>(S_ref),
2810  static_cast<unsigned int>(viennacl::traits::start(S_ref)),
2811  static_cast<unsigned int>(viennacl::traits::stride(S_ref)),
2812 
2813  detail::cuda_arg<NumericT>(S),
2814  static_cast<unsigned int>(viennacl::traits::start(S)),
2815  static_cast<unsigned int>(viennacl::traits::stride(S)),
2816  static_cast<unsigned int>(viennacl::traits::size(S)));
2817 
2818  scan_kernel_3<<<std::ceil(S.size()/static_cast<float>(VIENNACL_SECTION_SIZE)), VIENNACL_SECTION_SIZE>>>(
2819  detail::cuda_arg<NumericT>(S_ref),
2820  static_cast<unsigned int>(viennacl::traits::start(S_ref)),
2821  static_cast<unsigned int>(viennacl::traits::stride(S_ref)),
2822 
2823  detail::cuda_arg<NumericT>(S),
2824  static_cast<unsigned int>(viennacl::traits::start(S)),
2825  static_cast<unsigned int>(viennacl::traits::stride(S)));
2826 
// Final pass: one block per entry of S, writing into vec2.
2827  scan_kernel_4<<<S.size(), VIENNACL_SECTION_SIZE>>>(
2828  detail::cuda_arg<NumericT>(S),
2829  static_cast<unsigned int>(viennacl::traits::start(S)),
2830  static_cast<unsigned int>(viennacl::traits::stride(S)),
2831 
2832  detail::cuda_arg<NumericT>(vec2),
2833  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2834  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2835  static_cast<unsigned int>(viennacl::traits::size(vec2)));
2836 }
2837 
2838 #undef VIENNACL_SECTION_SIZE
2839 
2840 } // namespace cuda
2841 } //namespace linalg
2842 } //namespace viennacl
2843 
2844 
2845 #endif
void house_update_QL(matrix_base< NumericT > &Q, vector_base< NumericT > &D, vcl_size_t A_size1)
This function updates the matrix Q, which is needed for the computation of the eigenvectors.
unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
Definition: common.hpp:38
void house_update_A_right(matrix_base< NumericT > &A, vector_base< NumericT > &D)
This function applies a Householder transformation to a matrix: A <- A * P with a Householder reflection P.
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
Generic size and resize functionality for different vector and matrix types.
void trans(matrix_expression< const matrix_base< NumericT, SizeT, DistanceT >, const matrix_base< NumericT, SizeT, DistanceT >, op_trans > const &proxy, matrix_base< NumericT > &temp_trans)
__global__ void scan_kernel_2(T *S_ref, unsigned int startS_ref, unsigned int incS_ref, T *S, unsigned int startS, unsigned int incS, unsigned int InputSize)
Extracts the start index from a vector, a matrix, an expression, etc.
Various little tools used here and there in ViennaCL.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:340
Implementations of row-major dense matrix related operations, including matrix-vector products...
size_type stride2() const
Returns the stride (index increment) in the second (column) dimension.
Definition: matrix_def.hpp:225
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
void clear(VectorType &vec)
Generic routine for setting all entries of a vector to zero. This is the version for non-ViennaCL objects.
Definition: clear.hpp:57
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
void ambm(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT > const &mat3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
Determines row and column increments for matrices and matrix proxies.
Implementations of column-major dense matrix related operations, including matrix-vector products...
viennacl::scalar< int > s2
viennacl::scalar< float > s1
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:245
__global__ void scan_kernel_4(T *S, unsigned int startS, unsigned int incS, T *Y, unsigned int startY, unsigned int incY, unsigned int OutputSize)
void prod_slow_kernel(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
__global__ void givens_next_row_major_kernel(T *matr, T *cs, T *ss, uint size, uint stride, uint start_i, uint end_i)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
#define VIENNACL_SECTION_SIZE
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
Helper struct for checking whether a type is a host scalar type (e.g. float, double) ...
Definition: forwards.h:447
Definition: blas3.hpp:36
void prod_fast_kernel(const MatrixT1 &A, const MatrixT2 &B, MatrixT3 &C, ScalarT alpha, ScalarT beta, std::string kernel_name)
void am(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
void matrix_diag_to_vector(matrix_base< NumericT > const &mat, int k, vector_base< NumericT > &vec)
void house_update_A_left(matrix_base< NumericT > &A, vector_base< NumericT > &D, vcl_size_t start)
This function applies a Householder transformation to a matrix: A <- P * A with a Householder reflection P.
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
void inclusive_scan(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
This function implements an inclusive scan.
void scaled_rank_1_update(matrix_base< NumericT > &mat1, ScalarT const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, const vector_base< NumericT > &vec1, const vector_base< NumericT > &vec2)
The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update...
void ambm_m(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT > const &mat3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
size_type stride1() const
Returns the stride (index increment) in the first (row) dimension.
Definition: matrix_def.hpp:223
void matrix_diag_from_vector(const vector_base< NumericT > &vec, int k, matrix_base< NumericT > &mat)
void matrix_diagonal_assign(matrix_base< NumericT > &mat, NumericT s)
std::size_t vcl_size_t
Definition: forwards.h:74
Dense matrix-matrix product CUDA kernels reside here.
__global__ void inclusive_scan_kernel_1(T *X, unsigned int startX, unsigned int incX, unsigned int InputSize, T *Y, unsigned int startY, unsigned int incY, T *S, unsigned int startS, unsigned int incS)
void prod(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
Helper metafunction for checking whether the provided type is viennacl::op_div (for division) ...
Definition: predicate.hpp:466
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
Definition: result_of.hpp:238
Proxy classes for vectors.
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
__global__ void scan_kernel_3(T *S_ref, unsigned int startS_ref, unsigned int incS_ref, T *S, unsigned int startS, unsigned int incS)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void matrix_column(const matrix_base< NumericT > &mat, unsigned int j, vector_base< NumericT > &vec)
void element_op(matrix_base< NumericT, SizeT > &A, matrix_expression< const matrix_base< NumericT, SizeT >, const matrix_base< NumericT, SizeT >, op_element_binary< OpT > > const &proxy)
Common routines for CUDA execution.
void matrix_row(matrix_base< NumericT > const &mat, unsigned int i, vector_base< NumericT > &vec)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
bool row_major() const
Definition: matrix_def.hpp:239
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
void bidiag_pack(matrix_base< NumericT > &A, VectorType &dh, VectorType &sh)
This function stores the diagonal and the superdiagonal of a matrix in two vectors.
A tag class representing transposed matrices.
Definition: forwards.h:219
size_type start2() const
Returns the start index (offset) in the second (column) dimension.
Definition: matrix_def.hpp:221
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:27
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
Definition: forwards.h:129
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:231
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:229
void givens_next(matrix_base< NumericT > &Q, vector_base< NumericT > &tmp1, vector_base< NumericT > &tmp2, int l, int m)
This function updates the matrix Q. It is part of the tql2 algorithm.
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
viennacl::backend::mem_handle & handle(T &obj)
Returns the generic memory handle of an object. Non-const version.
Definition: handle.hpp:41
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multiplication).
Definition: predicate.hpp:436
A tag class representing element-wise unary operations (like sin()) on vectors or matrices...
Definition: forwards.h:133
Implementation of the ViennaCL scalar class.
Implementations of NMF operations using CUDA.
A collection of compile time type deductions.
__global__ void exclusive_scan_kernel_1(T *X, unsigned int startX, unsigned int incX, unsigned int InputSize, T *Y, unsigned int startY, unsigned int incY, T *S, unsigned int startS, unsigned int incS)
void matrix_assign(matrix_base< NumericT > &mat, NumericT s, bool clear=false)
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< NumericT > &s, OtherT)
Definition: common.hpp:137
void copy_vec(matrix_base< NumericT > &A, vector_base< NumericT > &V, vcl_size_t row_start, vcl_size_t col_start, bool copy_col)
This function copies a row or a column from a matrix to a vector.
Simple enable-if variant that uses the SFINAE pattern.
size_type start1() const
Returns the start index (offset) in the first (row) dimension.
Definition: matrix_def.hpp:219
void exclusive_scan(vector_base< NumericT, F > &vec1, vector_base< NumericT, F > &vec2)
This function implements an exclusive scan.