ViennaCL - The Vienna Computing Library  1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
matrix_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/scalar.hpp"
27 #include "viennacl/vector.hpp"
29 #include "viennacl/tools/tools.hpp"
33 #include "viennacl/traits/size.hpp"
39 
40 namespace viennacl
41 {
42 namespace linalg
43 {
44 namespace host_based
45 {
46 
47 //
48 // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
49 //
50 
51 template<typename NumericT,
52  typename SizeT, typename DistanceT>
// NOTE(review): the function signature (extraction lines 53-54) is missing from
// this copy; presumably this is host_based::trans(proxy, temp_trans), which
// writes the transpose of proxy.lhs() into temp_trans -- confirm against the
// upstream header before editing.
55 {
56  const NumericT * temp_proxy = detail::extract_raw_pointer<NumericT>(proxy.lhs());
57  NumericT * temp = detail::extract_raw_pointer<NumericT>(temp_trans);
58 
// Cache the padded (internal) dimensions: the flat loop below iterates over the
// full padded buffer of the source matrix, not just its logical extents.
59  vcl_size_t proxy_int_size1=proxy.lhs().internal_size1();
60  vcl_size_t proxy_int_size2=proxy.lhs().internal_size2();
61  vcl_size_t temp_int_size1=temp_trans.internal_size1();
62  vcl_size_t temp_int_size2=temp_trans.internal_size2();
63 
64 #ifdef VIENNACL_WITH_OPENMP
// NOTE(review): an unsigned (vcl_size_t) induction variable requires OpenMP 3.0
// or later; the other parallel loops in this file use 'long' -- verify the
// intended minimum OpenMP level.
65  #pragma omp parallel for shared(proxy_int_size1,proxy_int_size2,temp_int_size1,temp_int_size2)
66 #endif
// The 2D iteration space is flattened into a single index so one parallel loop
// covers the whole matrix; (row, col) are only an enumeration of index pairs --
// actual addressing for either layout goes through mem_index() below.
67  for (vcl_size_t i = 0; i < proxy_int_size1*proxy_int_size2;++i)
68  {
69  vcl_size_t row = i / proxy_int_size2;
70  vcl_size_t col = i % proxy_int_size2;
71 
// Skip padding elements that lie outside the logical matrix extents.
72  if (row < proxy.lhs().size1() && col < proxy.lhs().size2())
73  {
74  if (proxy.lhs().row_major())
75  {
76  vcl_size_t pos = row_major::mem_index(proxy.lhs().start1() + proxy.lhs().stride1() * row,
77  proxy.lhs().start2() + proxy.lhs().stride2() * col,
78  proxy_int_size1, proxy_int_size2);
// Transposition: destination index is computed with row/col arguments swapped.
79  vcl_size_t new_pos = row_major::mem_index(temp_trans.start2() + temp_trans.stride2() * col,
80  temp_trans.start1() + temp_trans.stride1() * row, temp_int_size1,
81  temp_int_size2);
82  temp[new_pos] = temp_proxy[pos];
83  }
84  else
85  {
86  vcl_size_t pos = column_major::mem_index(proxy.lhs().start1() + proxy.lhs().stride1() * row,
87  proxy.lhs().start2() + proxy.lhs().stride2() * col, proxy_int_size1,
88  proxy_int_size2);
// Same swap as in the row-major branch, using column-major addressing.
89  vcl_size_t new_pos = column_major::mem_index(temp_trans.start2() + temp_trans.stride2() * col,
90  temp_trans.start1() + temp_trans.stride1() * row, temp_int_size1,
91  temp_int_size2);
92  temp[new_pos] = temp_proxy[pos];
93  }
94  }
95  }
96 }
97 
98 template<typename NumericT, typename ScalarT1>
// NOTE(review): the first signature line (extraction line 99) is missing; this
// is presumably host_based::am(mat1, mat2, alpha, ...), computing
// mat1 = mat2 * alpha (or mat2 / alpha when reciprocal_alpha is set) -- confirm
// against the upstream header.
 100  matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
101 {
102  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
103 
104  typedef NumericT value_type;
105 
106  value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
107  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
108 
// Fold the sign flip into the local scalar copy up front; the reciprocal case
// is handled by a branch below (dividing per element rather than multiplying
// by a precomputed 1/alpha, which could change results for non-float types).
109  value_type data_alpha = alpha;
110  if (flip_sign_alpha)
111  data_alpha = -data_alpha;
112 
// Gather submatrix geometry (offsets, strides, logical and padded sizes) once.
113  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
114  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
115  vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
116  vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
117  vcl_size_t A_size1 = viennacl::traits::size1(mat1);
118  vcl_size_t A_size2 = viennacl::traits::size2(mat1);
119  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
120  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
121 
122  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
123  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
124  vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
125  vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
126  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
127  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
128 
// Layout dispatch: the outer (parallelized) loop runs over the slow-varying
// index of the respective layout so the inner loop touches contiguous memory.
129  if (mat1.row_major())
130  {
131  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
132  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
133 
// The reciprocal branch is hoisted out of the loops to keep the inner loop free
// of per-element branching.
134  if (reciprocal_alpha)
135  {
136 #ifdef VIENNACL_WITH_OPENMP
137  #pragma omp parallel for
138 #endif
139  for (long row = 0; row < static_cast<long>(A_size1); ++row)
140  for (vcl_size_t col = 0; col < A_size2; ++col)
141  wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
142  }
143  else
144  {
145 #ifdef VIENNACL_WITH_OPENMP
146  #pragma omp parallel for
147 #endif
148  for (long row = 0; row < static_cast<long>(A_size1); ++row)
149  for (vcl_size_t col = 0; col < A_size2; ++col)
150  wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
151  }
152  }
153  else
154  {
155  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
156  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
157 
158  if (reciprocal_alpha)
159  {
160 #ifdef VIENNACL_WITH_OPENMP
161  #pragma omp parallel for
162 #endif
163  for (long col = 0; col < static_cast<long>(A_size2); ++col)
164  for (vcl_size_t row = 0; row < A_size1; ++row)
165  wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
166  }
167  else
168  {
169 #ifdef VIENNACL_WITH_OPENMP
170  #pragma omp parallel for
171 #endif
172  for (long col = 0; col < static_cast<long>(A_size2); ++col)
173  for (vcl_size_t row = 0; row < A_size1; ++row)
174  wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
175  }
176  }
177 }
178 
179 
180 template<typename NumericT,
181  typename ScalarT1, typename ScalarT2>
// NOTE(review): the first signature line (extraction line 182) is missing; this
// is presumably host_based::ambm(mat1, mat2, alpha, ..., mat3, beta, ...),
// computing mat1 = mat2 * alpha + mat3 * beta, with each scalar optionally
// negated (flip_sign_*) and/or applied as a divisor (reciprocal_*) -- confirm
// against the upstream header.
 183  matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
184  matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
185 {
186  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
187 
188  typedef NumericT value_type;
189 
190  value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
191  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
192  value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
193 
// Fold the sign flips into local scalar copies; the reciprocal flags are
// dispatched once below (four combinations) to keep the inner loops branch-free.
194  value_type data_alpha = alpha;
195  if (flip_sign_alpha)
196  data_alpha = -data_alpha;
197 
198  value_type data_beta = beta;
199  if (flip_sign_beta)
200  data_beta = -data_beta;
201 
// Gather submatrix geometry (offsets, strides, sizes) for all three operands.
202  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
203  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
204  vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
205  vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
206  vcl_size_t A_size1 = viennacl::traits::size1(mat1);
207  vcl_size_t A_size2 = viennacl::traits::size2(mat1);
208  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
209  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
210 
211  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
212  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
213  vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
214  vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
215  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
216  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
217 
218  vcl_size_t C_start1 = viennacl::traits::start1(mat3);
219  vcl_size_t C_start2 = viennacl::traits::start2(mat3);
220  vcl_size_t C_inc1 = viennacl::traits::stride1(mat3);
221  vcl_size_t C_inc2 = viennacl::traits::stride2(mat3);
222  vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(mat3);
223  vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(mat3);
224 
// Layout dispatch: parallelize over rows for row-major, over columns for
// column-major, so the inner loop walks contiguous memory.
225  if (mat1.row_major())
226  {
227  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
228  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
229  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
230 
// Four explicit loop nests, one per (reciprocal_alpha, reciprocal_beta) combo.
231  if (reciprocal_alpha && reciprocal_beta)
232  {
233 #ifdef VIENNACL_WITH_OPENMP
234  #pragma omp parallel for
235 #endif
236  for (long row = 0; row < static_cast<long>(A_size1); ++row)
237  for (vcl_size_t col = 0; col < A_size2; ++col)
238  wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
239  }
240  else if (reciprocal_alpha && !reciprocal_beta)
241  {
242 #ifdef VIENNACL_WITH_OPENMP
243  #pragma omp parallel for
244 #endif
245  for (long row = 0; row < static_cast<long>(A_size1); ++row)
246  for (vcl_size_t col = 0; col < A_size2; ++col)
247  wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
248  }
249  else if (!reciprocal_alpha && reciprocal_beta)
250  {
251 #ifdef VIENNACL_WITH_OPENMP
252  #pragma omp parallel for
253 #endif
254  for (long row = 0; row < static_cast<long>(A_size1); ++row)
255  for (vcl_size_t col = 0; col < A_size2; ++col)
256  wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
257  }
258  else if (!reciprocal_alpha && !reciprocal_beta)
259  {
260 #ifdef VIENNACL_WITH_OPENMP
261  #pragma omp parallel for
262 #endif
263  for (long row = 0; row < static_cast<long>(A_size1); ++row)
264  for (vcl_size_t col = 0; col < A_size2; ++col)
265  wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
266  }
267  }
268  else
269  {
270  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
271  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
272  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
273 
274  if (reciprocal_alpha && reciprocal_beta)
275  {
276 #ifdef VIENNACL_WITH_OPENMP
277  #pragma omp parallel for
278 #endif
279  for (long col = 0; col < static_cast<long>(A_size2); ++col)
280  for (vcl_size_t row = 0; row < A_size1; ++row)
281  wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
282  }
283  else if (reciprocal_alpha && !reciprocal_beta)
284  {
285 #ifdef VIENNACL_WITH_OPENMP
286  #pragma omp parallel for
287 #endif
288  for (long col = 0; col < static_cast<long>(A_size2); ++col)
289  for (vcl_size_t row = 0; row < A_size1; ++row)
290  wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
291  }
292  else if (!reciprocal_alpha && reciprocal_beta)
293  {
294 #ifdef VIENNACL_WITH_OPENMP
295  #pragma omp parallel for
296 #endif
297  for (long col = 0; col < static_cast<long>(A_size2); ++col)
298  for (vcl_size_t row = 0; row < A_size1; ++row)
299  wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
300  }
301  else if (!reciprocal_alpha && !reciprocal_beta)
302  {
303 #ifdef VIENNACL_WITH_OPENMP
304  #pragma omp parallel for
305 #endif
306  for (long col = 0; col < static_cast<long>(A_size2); ++col)
307  for (vcl_size_t row = 0; row < A_size1; ++row)
308  wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
309  }
310  }
311 
312 }
313 
314 
315 template<typename NumericT,
316  typename ScalarT1, typename ScalarT2>
// NOTE(review): the first signature line (extraction line 317) is missing; this
// is presumably host_based::ambm_m(mat1, mat2, alpha, ..., mat3, beta, ...).
// Unlike ambm() above, every assignment here uses '+=', i.e. the result is
// accumulated: mat1 += mat2 * alpha + mat3 * beta -- confirm against upstream.
 318  matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
319  matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
320 {
321  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
322 
323  typedef NumericT value_type;
324 
325  value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
326  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
327  value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
328 
// Fold sign flips into local scalar copies; reciprocal flags select among the
// four branch-free loop nests below.
329  value_type data_alpha = alpha;
330  if (flip_sign_alpha)
331  data_alpha = -data_alpha;
332 
333  value_type data_beta = beta;
334  if (flip_sign_beta)
335  data_beta = -data_beta;
336 
// Gather submatrix geometry for all three operands.
337  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
338  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
339  vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
340  vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
341  vcl_size_t A_size1 = viennacl::traits::size1(mat1);
342  vcl_size_t A_size2 = viennacl::traits::size2(mat1);
343  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
344  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
345 
346  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
347  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
348  vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
349  vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
350  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
351  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
352 
353  vcl_size_t C_start1 = viennacl::traits::start1(mat3);
354  vcl_size_t C_start2 = viennacl::traits::start2(mat3);
355  vcl_size_t C_inc1 = viennacl::traits::stride1(mat3);
356  vcl_size_t C_inc2 = viennacl::traits::stride2(mat3);
357  vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(mat3);
358  vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(mat3);
359 
// Layout dispatch: parallel loop runs over the slow-varying index per layout.
360  if (mat1.row_major())
361  {
362  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
363  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
364  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
365 
366  if (reciprocal_alpha && reciprocal_beta)
367  {
368 #ifdef VIENNACL_WITH_OPENMP
369  #pragma omp parallel for
370 #endif
371  for (long row = 0; row < static_cast<long>(A_size1); ++row)
372  for (vcl_size_t col = 0; col < A_size2; ++col)
373  wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
374  }
375  else if (reciprocal_alpha && !reciprocal_beta)
376  {
377 #ifdef VIENNACL_WITH_OPENMP
378  #pragma omp parallel for
379 #endif
380  for (long row = 0; row < static_cast<long>(A_size1); ++row)
381  for (vcl_size_t col = 0; col < A_size2; ++col)
382  wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
383  }
384  else if (!reciprocal_alpha && reciprocal_beta)
385  {
386 #ifdef VIENNACL_WITH_OPENMP
387  #pragma omp parallel for
388 #endif
389  for (long row = 0; row < static_cast<long>(A_size1); ++row)
390  for (vcl_size_t col = 0; col < A_size2; ++col)
391  wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
392  }
393  else if (!reciprocal_alpha && !reciprocal_beta)
394  {
395 #ifdef VIENNACL_WITH_OPENMP
396  #pragma omp parallel for
397 #endif
398  for (long row = 0; row < static_cast<long>(A_size1); ++row)
399  for (vcl_size_t col = 0; col < A_size2; ++col)
400  wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
401  }
402  }
403  else
404  {
405  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
406  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
407  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
408 
409  if (reciprocal_alpha && reciprocal_beta)
410  {
411 #ifdef VIENNACL_WITH_OPENMP
412  #pragma omp parallel for
413 #endif
414  for (long col = 0; col < static_cast<long>(A_size2); ++col)
415  for (vcl_size_t row = 0; row < A_size1; ++row)
416  wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
417  }
418  else if (reciprocal_alpha && !reciprocal_beta)
419  {
420 #ifdef VIENNACL_WITH_OPENMP
421  #pragma omp parallel for
422 #endif
423  for (long col = 0; col < static_cast<long>(A_size2); ++col)
424  for (vcl_size_t row = 0; row < A_size1; ++row)
425  wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
426  }
427  else if (!reciprocal_alpha && reciprocal_beta)
428  {
429 #ifdef VIENNACL_WITH_OPENMP
430  #pragma omp parallel for
431 #endif
432  for (long col = 0; col < static_cast<long>(A_size2); ++col)
433  for (vcl_size_t row = 0; row < A_size1; ++row)
434  wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
435  }
436  else if (!reciprocal_alpha && !reciprocal_beta)
437  {
438 #ifdef VIENNACL_WITH_OPENMP
439  #pragma omp parallel for
440 #endif
441  for (long col = 0; col < static_cast<long>(A_size2); ++col)
442  for (vcl_size_t row = 0; row < A_size1; ++row)
443  wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
444  }
445  }
446 
447 }
448 
449 
450 
451 
452 template<typename NumericT>
// Assigns the scalar s to every entry of mat.
453 void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
454 {
455  typedef NumericT value_type;
456 
457  value_type * data_A = detail::extract_raw_pointer<value_type>(mat);
458  value_type alpha = static_cast<value_type>(s);
459 
460  vcl_size_t A_start1 = viennacl::traits::start1(mat);
461  vcl_size_t A_start2 = viennacl::traits::start2(mat);
// NOTE(review): extraction lines 462-465 (declarations of A_inc1, A_inc2,
// A_size1, A_size2) are missing from this copy; the loops below reference them.
// In upstream ViennaCL the A_size* values depend on the 'clear' flag (padded
// internal sizes when clear == true, logical sizes otherwise) -- 'clear' is
// otherwise unused in the visible lines; confirm against the upstream header.
466  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
467  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
468 
// Layout dispatch: parallelize over rows (row-major) or columns (column-major).
469  if (mat.row_major())
470  {
471  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
472 
473 #ifdef VIENNACL_WITH_OPENMP
474  #pragma omp parallel for
475 #endif
476  for (long row = 0; row < static_cast<long>(A_size1); ++row)
477  for (vcl_size_t col = 0; col < A_size2; ++col)
478  wrapper_A(static_cast<vcl_size_t>(row), col) = alpha;
479  //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
480  // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
481  }
482  else
483  {
484  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
485 
486 #ifdef VIENNACL_WITH_OPENMP
487  #pragma omp parallel for
488 #endif
489  for (long col = 0; col < static_cast<long>(A_size2); ++col)
490  for (vcl_size_t row = 0; row < A_size1; ++row)
491  wrapper_A(row, static_cast<vcl_size_t>(col)) = alpha;
492  //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
493  // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
494  }
495 }
496 
497 
498 
499 template<typename NumericT>
// NOTE(review): the signature (extraction line 500) is missing; presumably
// host_based::matrix_diagonal_assign(mat, s), writing the scalar s to every
// diagonal entry mat(i, i) -- confirm against the upstream header.
501 {
502  typedef NumericT value_type;
503 
504  value_type * data_A = detail::extract_raw_pointer<value_type>(mat);
505  value_type alpha = static_cast<value_type>(s);
506 
507  vcl_size_t A_start1 = viennacl::traits::start1(mat);
508  vcl_size_t A_start2 = viennacl::traits::start2(mat);
// NOTE(review): extraction lines 509-510 (declarations of A_inc1, A_inc2) are
// missing; they are referenced by the wrapper constructors below.
511  vcl_size_t A_size1 = viennacl::traits::size1(mat);
512  //vcl_size_t A_size2 = viennacl::traits::size2(mat);
513  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
514  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
515 
// Both layout branches walk only the diagonal (row, row); only the addressing
// through the wrapper differs.
516  if (mat.row_major())
517  {
518  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
519 
520 #ifdef VIENNACL_WITH_OPENMP
521  #pragma omp parallel for
522 #endif
523  for (long row = 0; row < static_cast<long>(A_size1); ++row)
524  wrapper_A(row, row) = alpha;
525  }
526  else
527  {
528  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
529 
530 #ifdef VIENNACL_WITH_OPENMP
531  #pragma omp parallel for
532 #endif
533  for (long row = 0; row < static_cast<long>(A_size1); ++row)
534  wrapper_A(row, row) = alpha;
535  }
536 }
537 
538 template<typename NumericT>
// NOTE(review): the signature (extraction line 539) is missing; presumably
// host_based::matrix_diag_from_vector(vec, k, mat), zeroing mat and writing vec
// onto its k-th (super/sub) diagonal -- confirm against the upstream header.
540 {
541  typedef NumericT value_type;
542 
543  value_type *data_A = detail::extract_raw_pointer<value_type>(mat);
544  value_type const *data_vec = detail::extract_raw_pointer<value_type>(vec);
545 
546  vcl_size_t A_start1 = viennacl::traits::start1(mat);
547  vcl_size_t A_start2 = viennacl::traits::start2(mat);
// NOTE(review): extraction lines 548-549 (declarations of A_inc1, A_inc2) are
// missing; they are referenced by the wrapper constructors below.
550  //vcl_size_t A_size1 = viennacl::traits::size1(mat);
551  //vcl_size_t A_size2 = viennacl::traits::size2(mat);
552  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
553  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
554 
555  vcl_size_t v_start = viennacl::traits::start(vec);
// NOTE(review): extraction line 556 (declaration of v_inc, the vector stride)
// is missing; it is used in the indexing expressions below.
557  vcl_size_t v_size = viennacl::traits::size(vec);
558 
// k >= 0 selects a superdiagonal (shift the column start), k < 0 a subdiagonal
// (shift the row start).
559  vcl_size_t row_start = 0;
560  vcl_size_t col_start = 0;
561 
562  if (k >= 0)
563  col_start = static_cast<vcl_size_t>(k);
564  else
565  row_start = static_cast<vcl_size_t>(-k);
566 
// Zero the whole matrix first; only the selected diagonal is written below.
567  matrix_assign(mat, NumericT(0));
568 
569  if (mat.row_major())
570  {
571  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
572 
573  for (vcl_size_t i = 0; i < v_size; ++i)
574  wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
575  }
576  else
577  {
578  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
579 
580  for (vcl_size_t i = 0; i < v_size; ++i)
581  wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
582  }
583 }
584 
585 template<typename NumericT>
// NOTE(review): the signature (extraction line 586) is missing; presumably
// host_based::matrix_diag_to_vector(mat, k, vec), the inverse of
// matrix_diag_from_vector above: copies the k-th diagonal of mat into vec --
// confirm against the upstream header.
587 {
588  typedef NumericT value_type;
589 
590  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
591  value_type * data_vec = detail::extract_raw_pointer<value_type>(vec);
592 
593  vcl_size_t A_start1 = viennacl::traits::start1(mat);
594  vcl_size_t A_start2 = viennacl::traits::start2(mat);
// NOTE(review): extraction lines 595-596 (declarations of A_inc1, A_inc2) are
// missing; they are referenced by the wrapper constructors below.
597  //vcl_size_t A_size1 = viennacl::traits::size1(mat);
598  //vcl_size_t A_size2 = viennacl::traits::size2(mat);
599  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
600  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
601 
602  vcl_size_t v_start = viennacl::traits::start(vec);
// NOTE(review): extraction line 603 (declaration of v_inc) is missing; it is
// used in the indexing expressions below.
604  vcl_size_t v_size = viennacl::traits::size(vec);
605 
// k >= 0 selects a superdiagonal, k < 0 a subdiagonal (same convention as
// matrix_diag_from_vector).
606  vcl_size_t row_start = 0;
607  vcl_size_t col_start = 0;
608 
609  if (k >= 0)
610  col_start = static_cast<vcl_size_t>(k);
611  else
612  row_start = static_cast<vcl_size_t>(-k);
613 
614  if (mat.row_major())
615  {
616  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
617 
618  for (vcl_size_t i = 0; i < v_size; ++i)
619  data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
620  }
621  else
622  {
623  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
624 
625  for (vcl_size_t i = 0; i < v_size; ++i)
626  data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
627  }
628 }
629 
630 template<typename NumericT>
// Copies row i of mat into vec (vec's size determines how many entries are read).
631 void matrix_row(const matrix_base<NumericT> & mat, unsigned int i, vector_base<NumericT> & vec)
632 {
633  typedef NumericT value_type;
634 
635  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
636  value_type * data_vec = detail::extract_raw_pointer<value_type>(vec);
637 
638  vcl_size_t A_start1 = viennacl::traits::start1(mat);
639  vcl_size_t A_start2 = viennacl::traits::start2(mat);
// NOTE(review): extraction lines 640-641 (declarations of A_inc1, A_inc2) are
// missing; they are referenced by the wrapper constructors below.
642  //vcl_size_t A_size1 = viennacl::traits::size1(mat);
643  //vcl_size_t A_size2 = viennacl::traits::size2(mat);
644  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
645  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
646 
647  vcl_size_t v_start = viennacl::traits::start(vec);
// NOTE(review): extraction line 648 (declaration of v_inc) is missing; it is
// used in the indexing expressions below.
649  vcl_size_t v_size = viennacl::traits::size(vec);
650 
651  if (mat.row_major())
652  {
653  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
654 
655  for (vcl_size_t j = 0; j < v_size; ++j)
656  data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j);
657  }
658  else
659  {
660  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
661 
662  for (vcl_size_t j = 0; j < v_size; ++j)
663  data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j);
664  }
665 }
666 
667 template<typename NumericT>
// Copies column j of mat into vec (vec's size determines how many entries are read).
668 void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
669 {
670  typedef NumericT value_type;
671 
672  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
673  value_type * data_vec = detail::extract_raw_pointer<value_type>(vec);
674 
675  vcl_size_t A_start1 = viennacl::traits::start1(mat);
676  vcl_size_t A_start2 = viennacl::traits::start2(mat);
// NOTE(review): extraction lines 677-678 (declarations of A_inc1, A_inc2) are
// missing; they are referenced by the wrapper constructors below.
679  //vcl_size_t A_size1 = viennacl::traits::size1(mat);
680  //vcl_size_t A_size2 = viennacl::traits::size2(mat);
681  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
682  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
683 
684  vcl_size_t v_start = viennacl::traits::start(vec);
// NOTE(review): extraction line 685 (declaration of v_inc) is missing; it is
// used in the indexing expressions below.
686  vcl_size_t v_size = viennacl::traits::size(vec);
687 
688  if (mat.row_major())
689  {
690  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
691 
692  for (vcl_size_t i = 0; i < v_size; ++i)
693  data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j));
694  }
695  else
696  {
697  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
698 
699  for (vcl_size_t i = 0; i < v_size; ++i)
700  data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j));
701  }
702 }
703 
704 //
706 //
707 
708 // Binary operations A = B .* C and A = B ./ C
709 
715 template<typename NumericT, typename OpT>
// NOTE(review): the signature (extraction lines 716-717) is missing; per the
// section comment above, this is the binary element_op(A, proxy) computing
// A = B .op C element-wise (e.g. A = B .* C, A = B ./ C) -- confirm upstream.
718 {
719  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
720 
721  typedef NumericT value_type;
// NOTE(review): extraction line 722 is missing; it presumably declared the
// OpFunctor typedef (the op-applier for OpT) used in the loops below.
723 
724  value_type * data_A = detail::extract_raw_pointer<value_type>(A);
725  value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
726  value_type const * data_C = detail::extract_raw_pointer<value_type>(proxy.rhs());
727 
728  vcl_size_t A_start1 = viennacl::traits::start1(A);
729  vcl_size_t A_start2 = viennacl::traits::start2(A);
// NOTE(review): extraction lines 730-731 (declarations of A_inc1, A_inc2) are
// missing; they are referenced by the wrapper constructors below.
732  vcl_size_t A_size1 = viennacl::traits::size1(A);
733  vcl_size_t A_size2 = viennacl::traits::size2(A);
734  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
735  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
736 
737  vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
738  vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
739  vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs());
740  vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs());
741  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
742  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
743 
744  vcl_size_t C_start1 = viennacl::traits::start1(proxy.rhs());
745  vcl_size_t C_start2 = viennacl::traits::start2(proxy.rhs());
746  vcl_size_t C_inc1 = viennacl::traits::stride1(proxy.rhs());
747  vcl_size_t C_inc2 = viennacl::traits::stride2(proxy.rhs());
748  vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(proxy.rhs());
749  vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(proxy.rhs());
750 
// Layout dispatch: parallelize over rows (row-major) or columns (column-major)
// so the inner loop walks contiguous memory; the element operation itself is
// applied via OpFunctor::apply(dst, lhs, rhs).
751  if (A.row_major())
752  {
753  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
754  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
755  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
756 
757 #ifdef VIENNACL_WITH_OPENMP
758  #pragma omp parallel for
759 #endif
760  for (long row = 0; row < static_cast<long>(A_size1); ++row)
761  for (vcl_size_t col = 0; col < A_size2; ++col)
762  OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
763  //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
764  // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
765  // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
766  }
767  else
768  {
769  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
770  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
771  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
772 
773 #ifdef VIENNACL_WITH_OPENMP
774  #pragma omp parallel for
775 #endif
776  for (long col = 0; col < static_cast<long>(A_size2); ++col)
777  for (vcl_size_t row = 0; row < A_size1; ++row)
778  OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
779 
780  //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
781  // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
782  // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
783  }
784 }
785 
786 // Unary operations
787 
788 // A = op(B)
789 template<typename NumericT, typename OpT>
792 {
793  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
794 
795  typedef NumericT value_type;
797 
798  value_type * data_A = detail::extract_raw_pointer<value_type>(A);
799  value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
800 
801  vcl_size_t A_start1 = viennacl::traits::start1(A);
802  vcl_size_t A_start2 = viennacl::traits::start2(A);
805  vcl_size_t A_size1 = viennacl::traits::size1(A);
806  vcl_size_t A_size2 = viennacl::traits::size2(A);
807  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
808  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
809 
810  vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
811  vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
812  vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs());
813  vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs());
814  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
815  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
816 
817  if (A.row_major())
818  {
819  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
820  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
821 
822 #ifdef VIENNACL_WITH_OPENMP
823  #pragma omp parallel for
824 #endif
825  for (long row = 0; row < static_cast<long>(A_size1); ++row)
826  for (vcl_size_t col = 0; col < A_size2; ++col)
827  OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
828  }
829  else
830  {
831  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
832  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
833 
834 #ifdef VIENNACL_WITH_OPENMP
835  #pragma omp parallel for
836 #endif
837  for (long col = 0; col < static_cast<long>(A_size2); ++col)
838  for (vcl_size_t row = 0; row < A_size1; ++row)
839  OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
840  }
841 }
842 
843 
844 
//
/////////////////// Matrix-Vector product /////////////////////////////////
//
848 
849 // A * x
850 
860 template<typename NumericT>
861 void prod_impl(const matrix_base<NumericT> & mat, bool trans,
862  const vector_base<NumericT> & vec,
863  vector_base<NumericT> & result)
864 {
865  typedef NumericT value_type;
866 
867  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
868  value_type const * data_x = detail::extract_raw_pointer<value_type>(vec);
869  value_type * data_result = detail::extract_raw_pointer<value_type>(result);
870 
871  vcl_size_t A_start1 = viennacl::traits::start1(mat);
872  vcl_size_t A_start2 = viennacl::traits::start2(mat);
875  vcl_size_t A_size1 = viennacl::traits::size1(mat);
876  vcl_size_t A_size2 = viennacl::traits::size2(mat);
877  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
878  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
879 
882 
884  vcl_size_t inc2 = viennacl::traits::stride(result);
885 
886  if (mat.row_major())
887  {
888  if (trans)
889  {
890  {
891  value_type temp = data_x[start1];
892  for (vcl_size_t row = 0; row < A_size2; ++row)
893  data_result[row * inc2 + start2] = data_A[viennacl::row_major::mem_index(A_start1, row * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * temp;
894  }
895 
896  for (vcl_size_t col = 1; col < A_size1; ++col) //run through matrix sequentially
897  {
898  value_type temp = data_x[col * inc1 + start1];
899  for (vcl_size_t row = 0; row < A_size2; ++row)
900  {
901  data_result[row * inc2 + start2] += data_A[viennacl::row_major::mem_index(col * A_inc1 + A_start1, row * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * temp;
902  }
903  }
904  }
905  else
906  {
907 #ifdef VIENNACL_WITH_OPENMP
908  #pragma omp parallel for
909 #endif
910  for (long row = 0; row < static_cast<long>(A_size1); ++row)
911  {
912  value_type temp = 0;
913  for (vcl_size_t col = 0; col < A_size2; ++col)
914  temp += data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
915 
916  data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
917  }
918  }
919  }
920  else
921  {
922  if (!trans)
923  {
924  {
925  value_type temp = data_x[start1];
926  for (vcl_size_t row = 0; row < A_size1; ++row)
927  data_result[row * inc2 + start2] = data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, A_start2, A_internal_size1, A_internal_size2)] * temp;
928  }
929  for (vcl_size_t col = 1; col < A_size2; ++col) //run through matrix sequentially
930  {
931  value_type temp = data_x[col * inc1 + start1];
932  for (vcl_size_t row = 0; row < A_size1; ++row)
933  data_result[row * inc2 + start2] += data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * temp;
934  }
935  }
936  else
937  {
938 #ifdef VIENNACL_WITH_OPENMP
939  #pragma omp parallel for
940 #endif
941  for (long row = 0; row < static_cast<long>(A_size2); ++row)
942  {
943  value_type temp = 0;
944  for (vcl_size_t col = 0; col < A_size1; ++col)
945  temp += data_A[viennacl::column_major::mem_index(col * A_inc1 + A_start1, static_cast<vcl_size_t>(row) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
946 
947  data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
948  }
949  }
950  }
951 }
952 
953 
954 
//
/////////////////// Matrix-Matrix product /////////////////////////////////
//
958 
959 namespace detail
960 {
961  template<typename MatrixAccT1, typename MatrixAccT2, typename MatrixAccT3, typename NumericT>
962  void prod(MatrixAccT1 & A, MatrixAccT2 & B, MatrixAccT3 & C,
963  vcl_size_t C_size1, vcl_size_t C_size2, vcl_size_t A_size2,
964  NumericT alpha, NumericT beta)
965  {
966  if (C_size1 == 0 || C_size2 == 0 || A_size2 == 0)
967  return;
968 
969  static const vcl_size_t blocksize = 64;
970 
971  vcl_size_t num_blocks_C1 = (C_size1 - 1) / blocksize + 1;
972  vcl_size_t num_blocks_C2 = (C_size2 - 1) / blocksize + 1;
973  vcl_size_t num_blocks_A2 = (A_size2 - 1) / blocksize + 1;
974 
975  //
976  // outer loop pair: Run over all blocks with indices (block_idx_i, block_idx_j) of the result matrix C:
977  //
978 #ifdef VIENNACL_WITH_OPENMP
979  #pragma omp parallel for
980 #endif
981  for (long block_idx_i2=0; block_idx_i2<static_cast<long>(num_blocks_C1); ++block_idx_i2)
982  {
983  // thread-local auxiliary buffers
984  std::vector<NumericT> buffer_A(blocksize * blocksize); // row-major
985  std::vector<NumericT> buffer_B(blocksize * blocksize); // column-major
986  std::vector<NumericT> buffer_C(blocksize * blocksize); // row-major
987 
988  vcl_size_t block_idx_i = static_cast<vcl_size_t>(block_idx_i2);
989  for (vcl_size_t block_idx_j=0; block_idx_j<num_blocks_C2; ++block_idx_j)
990  {
991  // Reset block matrix:
992  std::fill(buffer_C.begin(), buffer_C.end(), NumericT(0));
993 
994  vcl_size_t offset_i = block_idx_i*blocksize;
995  vcl_size_t offset_j = block_idx_j*blocksize;
996 
997  // C(block_idx_i, block_idx_i) += A(block_idx_i, block_idx_k) * B(block_idx_k, block_idx_j)
998  for (vcl_size_t block_idx_k=0; block_idx_k<num_blocks_A2; ++block_idx_k)
999  {
1000  // flush buffers:
1001  std::fill(buffer_A.begin(), buffer_A.end(), NumericT(0));
1002  std::fill(buffer_B.begin(), buffer_B.end(), NumericT(0));
1003 
1004  vcl_size_t offset_k = block_idx_k*blocksize;
1005 
1006  // load current data:
1007  for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
1008  for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
1009  buffer_A[(i - offset_i) * blocksize + (k - offset_k)] = A(i, k);
1010 
1011  for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
1012  for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
1013  buffer_B[(k - offset_k) + (j - offset_j) * blocksize] = B(k, j);
1014 
1015  // multiply (this is the hot spot in terms of flops)
1016  for (vcl_size_t i = 0; i < blocksize; ++i)
1017  {
1018  NumericT const * ptrA = &(buffer_A[i*blocksize]);
1019  for (vcl_size_t j = 0; j < blocksize; ++j)
1020  {
1021  NumericT const * ptrB = &(buffer_B[j*blocksize]);
1022 
1023  NumericT temp = NumericT(0);
1024  for (vcl_size_t k = 0; k < blocksize; ++k)
1025  temp += ptrA[k] * ptrB[k]; // buffer_A[i*blocksize + k] * buffer_B[k + j*blocksize];
1026 
1027  buffer_C[i*blocksize + j] += temp;
1028  }
1029  }
1030  }
1031 
1032  // write result:
1033  if (beta > 0 || beta < 0)
1034  {
1035  for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
1036  for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
1037  C(i,j) = beta * C(i,j) + alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
1038  }
1039  else
1040  {
1041  for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
1042  for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
1043  C(i,j) = alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
1044  }
1045 
1046  } // for block j
1047  } // for block i
1048 
1049  } // prod()
1050 
1051 } // namespace detail
1052 
1058 template<typename NumericT, typename ScalarT1, typename ScalarT2 >
1059 void prod_impl(const matrix_base<NumericT> & A, bool trans_A,
1060  const matrix_base<NumericT> & B, bool trans_B,
1062  ScalarT1 alpha,
1063  ScalarT2 beta)
1064 {
1065  typedef NumericT value_type;
1066 
1067  value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
1068  value_type const * data_B = detail::extract_raw_pointer<value_type>(B);
1069  value_type * data_C = detail::extract_raw_pointer<value_type>(C);
1070 
1071  vcl_size_t A_start1 = viennacl::traits::start1(A);
1072  vcl_size_t A_start2 = viennacl::traits::start2(A);
1075  vcl_size_t A_size1 = viennacl::traits::size1(A);
1076  vcl_size_t A_size2 = viennacl::traits::size2(A);
1077  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
1078  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
1079 
1080  vcl_size_t B_start1 = viennacl::traits::start1(B);
1081  vcl_size_t B_start2 = viennacl::traits::start2(B);
1084  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B);
1085  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(B);
1086 
1087  vcl_size_t C_start1 = viennacl::traits::start1(C);
1088  vcl_size_t C_start2 = viennacl::traits::start2(C);
1091  vcl_size_t C_size1 = viennacl::traits::size1(C);
1092  vcl_size_t C_size2 = viennacl::traits::size2(C);
1093  vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(C);
1094  vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(C);
1095 
1096  if (!trans_A && !trans_B)
1097  {
1098  if (A.row_major() && B.row_major() && C.row_major())
1099  {
1100  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1101  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1102  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1103 
1104  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1105  }
1106  else if (A.row_major() && B.row_major() && !C.row_major())
1107  {
1108  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1109  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1110  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1111 
1112  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1113  }
1114  else if (A.row_major() && !B.row_major() && C.row_major())
1115  {
1116  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1117  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1118  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1119 
1120  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1121  }
1122  else if (A.row_major() && !B.row_major() && !C.row_major())
1123  {
1124  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1125  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1126  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1127 
1128  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1129  }
1130  else if (!A.row_major() && B.row_major() && C.row_major())
1131  {
1132  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1133  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1134  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1135 
1136  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1137  }
1138  else if (!A.row_major() && B.row_major() && !C.row_major())
1139  {
1140  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1141  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1142  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1143 
1144  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1145  }
1146  else if (!A.row_major() && !B.row_major() && C.row_major())
1147  {
1148  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1149  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1150  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1151 
1152  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1153  }
1154  else
1155  {
1156  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1157  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1158  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1159 
1160  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1161  }
1162  }
1163  else if (!trans_A && trans_B)
1164  {
1165  if (A.row_major() && B.row_major() && C.row_major())
1166  {
1167  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1168  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1169  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1170 
1171  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1172  }
1173  else if (A.row_major() && B.row_major() && !C.row_major())
1174  {
1175  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1176  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1177  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1178 
1179  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1180  }
1181  else if (A.row_major() && !B.row_major() && C.row_major())
1182  {
1183  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1184  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1185  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1186 
1187  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1188  }
1189  else if (A.row_major() && !B.row_major() && !C.row_major())
1190  {
1191  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1192  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1193  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1194 
1195  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1196  }
1197  else if (!A.row_major() && B.row_major() && C.row_major())
1198  {
1199  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1200  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1201  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1202 
1203  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1204  }
1205  else if (!A.row_major() && B.row_major() && !C.row_major())
1206  {
1207  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1208  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1209  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1210 
1211  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1212  }
1213  else if (!A.row_major() && !B.row_major() && C.row_major())
1214  {
1215  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1216  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1217  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1218 
1219  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1220  }
1221  else
1222  {
1223  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1224  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1225  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1226 
1227  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1228  }
1229  }
1230  else if (trans_A && !trans_B)
1231  {
1232  if (A.row_major() && B.row_major() && C.row_major())
1233  {
1234  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1235  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1236  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1237 
1238  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1239  }
1240  else if (A.row_major() && B.row_major() && !C.row_major())
1241  {
1242  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1243  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1244  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1245 
1246  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1247  }
1248  else if (A.row_major() && !B.row_major() && C.row_major())
1249  {
1250  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1251  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1252  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1253 
1254  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1255  }
1256  else if (A.row_major() && !B.row_major() && !C.row_major())
1257  {
1258  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1259  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1260  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1261 
1262  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1263  }
1264  else if (!A.row_major() && B.row_major() && C.row_major())
1265  {
1266  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1267  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1268  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1269 
1270  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1271  }
1272  else if (!A.row_major() && B.row_major() && !C.row_major())
1273  {
1274  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1275  detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1276  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1277 
1278  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1279  }
1280  else if (!A.row_major() && !B.row_major() && C.row_major())
1281  {
1282  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1283  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1284  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1285 
1286  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1287  }
1288  else
1289  {
1290  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1291  detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1292  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1293 
1294  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1295  }
1296  }
1297  else if (trans_A && trans_B)
1298  {
1299  if (A.row_major() && B.row_major() && C.row_major())
1300  {
1301  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1302  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1303  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1304 
1305  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1306  }
1307  else if (A.row_major() && B.row_major() && !C.row_major())
1308  {
1309  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1310  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1311  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1312 
1313  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1314  }
1315  else if (A.row_major() && !B.row_major() && C.row_major())
1316  {
1317  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1318  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1319  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1320 
1321  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1322  }
1323  else if (A.row_major() && !B.row_major() && !C.row_major())
1324  {
1325  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1326  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1327  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1328 
1329  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1330  }
1331  else if (!A.row_major() && B.row_major() && C.row_major())
1332  {
1333  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1334  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1335  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1336 
1337  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1338  }
1339  else if (!A.row_major() && B.row_major() && !C.row_major())
1340  {
1341  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1342  detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1343  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1344 
1345  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1346  }
1347  else if (!A.row_major() && !B.row_major() && C.row_major())
1348  {
1349  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1350  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1351  detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1352 
1353  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1354  }
1355  else
1356  {
1357  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
1358  detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
1359  detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
1360 
1361  detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
1362  }
1363  }
1364 }
1365 
1366 
1367 
1368 
1369 //
1371 //
1372 
1373 
1385 template<typename NumericT, typename ScalarT>
// Scaled rank-1 update on the host backend:
//   mat1 += alpha' * vec1 * vec2^T
// where alpha' equals alpha, negated when flip_sign_alpha is set and
// inverted (1/alpha) when reciprocal_alpha is set. len_alpha is unused here.
// NOTE(review): the doxygen extraction dropped the opening signature line
// (the matrix_base<NumericT> & mat1 parameter) and the declarations of
// start1/start2 (the start offsets of vec1/vec2); the loops below use them.
1387  ScalarT const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
1388  const vector_base<NumericT> & vec1,
1389  const vector_base<NumericT> & vec2)
1390 {
1391  typedef NumericT value_type;
1392 
 // Raw host pointers into the underlying buffers.
1393  value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
1394  value_type const * data_v1 = detail::extract_raw_pointer<value_type>(vec1);
1395  value_type const * data_v2 = detail::extract_raw_pointer<value_type>(vec2);
1396 
 // Layout of mat1: start offsets, strides, logical and padded sizes.
1397  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
1398  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
1399  vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
1400  vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
1401  vcl_size_t A_size1 = viennacl::traits::size1(mat1);
1402  vcl_size_t A_size2 = viennacl::traits::size2(mat1);
1403  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
1404  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
1405 
1407  vcl_size_t inc1 = viennacl::traits::stride(vec1);
1408 
1410  vcl_size_t inc2 = viennacl::traits::stride(vec2);
1411 
 // Fold the sign / reciprocal flags into a single scalar factor.
1412  value_type data_alpha = alpha;
1413  if (flip_sign_alpha)
1414  data_alpha = -data_alpha;
1415  if (reciprocal_alpha)
1416  data_alpha = static_cast<value_type>(1) / data_alpha;
1417 
 // Pick the traversal order that walks memory sequentially for the layout:
 // rows outermost for row-major, columns outermost for column-major.
1418  if (mat1.row_major())
1419  {
1420  for (vcl_size_t row = 0; row < A_size1; ++row)
1421  {
 // Hoist the row-invariant factor alpha' * vec1[row].
1422  value_type value_v1 = data_alpha * data_v1[row * inc1 + start1];
1423  for (vcl_size_t col = 0; col < A_size2; ++col)
1424  data_A[viennacl::row_major::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2];
1425  }
1426  }
1427  else
1428  {
1429  for (vcl_size_t col = 0; col < A_size2; ++col) //run through matrix sequentially
1430  {
 // Hoist the column-invariant factor alpha' * vec2[col].
1431  value_type value_v2 = data_alpha * data_v2[col * inc2 + start2];
1432  for (vcl_size_t row = 0; row < A_size1; ++row)
1433  data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += data_v1[row * inc1 + start1] * value_v2;
1434  }
1435  }
1436 }
1437 
1438 
1446 template <typename NumericT, typename S1>
// Stores the main diagonal of A in D and its superdiagonal in S
// (the packing step of the bidiagonalization used by the eigen/SVD routines):
//   D[i]   = A(i, i)      for i = 0 .. size-1
//   S[i+1] = A(i, i+1)    for i = 0 .. size-2   (S[0] is left untouched)
// NOTE(review): the doxygen extraction dropped the opening signature line
// (matrix_base<NumericT> & A) as well as the declarations of A_inc1/A_inc2
// and of start1/inc1/size1 and start2/inc2/size2 (start, stride and size of
// D and S); the code below relies on them.
1448  vector_base<S1> & D,
1449  vector_base<S1> & S
1450  )
1451 
1452  {
1453  typedef NumericT value_type;
1454 
1455  value_type * data_A = detail::extract_raw_pointer<value_type>(A);
1456  value_type * data_D = detail::extract_raw_pointer<value_type>(D);
1457  value_type * data_S = detail::extract_raw_pointer<value_type>(S);
1458 
1459  vcl_size_t A_start1 = viennacl::traits::start1(A);
1460  vcl_size_t A_start2 = viennacl::traits::start2(A);
1463  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
1464  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
1465 
1469 
1473 
 // Only min(rows, cols) diagonal entries exist. Assumes size >= 1
 // (dimensions are checked in the dispatcher frontend, see file header).
1474  vcl_size_t size = std::min(size1, size2);
1475  if (A.row_major())
1476  {
1477 #ifdef VIENNACL_WITH_OPENMP
1478  #pragma omp parallel for
1479 #endif
1480  for(vcl_size_t i = 0; i < size -1; i++)
1481  {
1482 
1483  data_D[start1 + inc1 * i] = data_A[viennacl::row_major::mem_index(i * A_inc1 + A_start1, i * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1484  data_S[start2 + inc2 * (i + 1)] = data_A[viennacl::row_major::mem_index(i * A_inc1 + A_start1, (i + 1) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1485  }
 // The last diagonal entry is written outside the loop: it has no
 // superdiagonal partner.
1486  data_D[start1 + inc1 * (size-1)] = data_A[viennacl::row_major::mem_index((size-1) * A_inc1 + A_start1, (size-1) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1487 
1488  }
1489  else
1490  {
1491 #ifdef VIENNACL_WITH_OPENMP
1492  #pragma omp parallel for
1493 #endif
1494  for(vcl_size_t i = 0; i < size -1; i++)
1495  {
1496  data_D[start1 + inc1 * i] = data_A[viennacl::column_major::mem_index(i * A_inc1 + A_start1, i * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1497  data_S[start2 + inc2 * (i + 1)] = data_A[viennacl::column_major::mem_index(i * A_inc1 + A_start1, (i + 1) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1498  }
1499  data_D[start1 + inc1 * (size-1)] = data_A[viennacl::column_major::mem_index((size-1) * A_inc1 + A_start1, (size-1) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1500  }
1501 
1502  }
1503 
1504 
1505 
1506  template <typename NumericT, typename VectorType>
 // Public entry point: packs the diagonal of A into dh and the
 // superdiagonal into sh (see bidiag_pack_impl above for the actual work).
 // NOTE(review): the doxygen extraction dropped the opening signature line
 // (matrix_base<NumericT> & A) and the body line — presumably a single
 // forwarding call to bidiag_pack_impl(A, dh, sh); confirm against the
 // original source.
1508  VectorType & dh,
1509  VectorType & sh
1510  )
1511  {
1512 
1514 
1515  }
1516 
1523  template <typename NumericT>
 // Applies a Householder reflection from the left: A <- P * A with
 // P = I - 2 * v * v^T, the Householder vector v being stored in D.
 // Only rows start+1 .. A_size1-1 take part (row_start below), i.e. the
 // first start+1 rows are left unchanged.
 // NOTE(review): the doxygen extraction dropped the signature lines
 // (matrix_base<NumericT> & A, vector_base<NumericT> & D) and the
 // declarations of A_inc1/A_inc2 and of start1/inc1 (start/stride of D).
1526  vcl_size_t start)
1527  {
1528  typedef NumericT value_type;
1529  NumericT ss = 0;
 // First affected row: the reflection skips rows 0..start.
1530  vcl_size_t row_start = start + 1;
1531 
1532  value_type * data_A = detail::extract_raw_pointer<value_type>(A);
1533  value_type * data_D = detail::extract_raw_pointer<value_type>(D);
1534 
1535  vcl_size_t A_start1 = viennacl::traits::start1(A);
1536  vcl_size_t A_start2 = viennacl::traits::start2(A);
1539  vcl_size_t A_size1 = viennacl::traits::size1(A);
1540  vcl_size_t A_size2 = viennacl::traits::size2(A);
1541  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
1542  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
1543 
1546 
1547  if (A.row_major())
1548  {
 // Column by column: first the serial dot product ss = v^T * A(:, i)
 // over the active rows, then the (parallel) rank-1 correction
 // A(j, i) -= 2 * v[j] * ss.
1549  for(vcl_size_t i = 0; i < A_size2; i++)
1550  {
1551  ss = 0;
1552  for(vcl_size_t j = row_start; j < A_size1; j++)
1553  ss = ss + data_D[start1 + inc1 * j] * data_A[viennacl::row_major::mem_index((j) * A_inc1 + A_start1, (i) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1554 #ifdef VIENNACL_WITH_OPENMP
1555  #pragma omp parallel for
1556 #endif
1557  for(long j = static_cast<long>(row_start); j < static_cast<long>(A_size1); j++)
1558  data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(j) * A_inc1 + A_start1, (i) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] =
1559  data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(j) * A_inc1 + A_start1, (i) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] -
1560  (2 * data_D[start1 + inc1 * static_cast<vcl_size_t>(j)]* ss);
1561  }
1562  }
1563  else
1564  {
 // Same computation for column-major storage.
1565  for(vcl_size_t i = 0; i < A_size2; i++)
1566  {
1567  ss = 0;
1568  for(vcl_size_t j = row_start; j < A_size1; j++)
1569  ss = ss + data_D[start1 + inc1 * j] * data_A[viennacl::column_major::mem_index((j) * A_inc1 + A_start1, (i) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1570 #ifdef VIENNACL_WITH_OPENMP
1571  #pragma omp parallel for
1572 #endif
1573  for(long j = static_cast<long>(row_start); j < static_cast<long>(A_size1); j++)
1574  data_A[viennacl::column_major::mem_index(static_cast<vcl_size_t>(j) * A_inc1 + A_start1, (i) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]=
1575  data_A[viennacl::column_major::mem_index(static_cast<vcl_size_t>(j) * A_inc1 + A_start1, (i) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] -
1576  (2 * data_D[start1 + inc1 * static_cast<vcl_size_t>(j)]* ss);
1577  }
1578  }
1579 
1580  }
1581 
1588  template <typename NumericT>
 // Applies a Householder reflection from the right: A <- A * P with
 // P = I - 2 * v * v^T, the Householder vector v being stored in D.
 // Unlike house_update_A_left, all rows and columns participate.
 // NOTE(review): the doxygen extraction dropped the signature lines
 // (matrix_base<NumericT> & A, vector_base<NumericT> & D) and the
 // declarations of A_inc1/A_inc2 and of start1/inc1 (start/stride of D).
1591  {
1592  typedef NumericT value_type;
1593  NumericT ss = 0;
1594 
1595  value_type * data_A = detail::extract_raw_pointer<value_type>(A);
1596  value_type * data_D = detail::extract_raw_pointer<value_type>(D);
1597 
1598  vcl_size_t A_start1 = viennacl::traits::start1(A);
1599  vcl_size_t A_start2 = viennacl::traits::start2(A);
1602  vcl_size_t A_size1 = viennacl::traits::size1(A);
1603  vcl_size_t A_size2 = viennacl::traits::size2(A);
1604  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
1605  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
1606 
1609 
1610  if (A.row_major())
1611  {
 // Row by row: serial dot product ss = A(i, :) * v, then the (parallel)
 // correction A(i, j) -= 2 * v[j] * ss.
1612  for(vcl_size_t i = 0; i < A_size1; i++)
1613  {
1614  ss = 0;
1615  for(vcl_size_t j = 0; j < A_size2; j++) // ss = ss + D[j] * A(i, j)
1616  ss = ss + (data_D[start1 + inc1 * j] * data_A[viennacl::row_major::mem_index((i) * A_inc1 + A_start1, (j) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]);
1617 
 // Copy into a loop-local value so the OpenMP workers read a stable sum.
1618  NumericT sum_Av = ss;
1619 #ifdef VIENNACL_WITH_OPENMP
1620  #pragma omp parallel for
1621 #endif
1622  for(long j = 0; j < static_cast<long>(A_size2); j++) // A(i, j) = A(i, j) - 2 * D[j] * sum_Av
1623  data_A[viennacl::row_major::mem_index((i) * A_inc1 + A_start1, static_cast<vcl_size_t>(j) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] =
1624  data_A[viennacl::row_major::mem_index((i) * A_inc1 + A_start1, static_cast<vcl_size_t>(j) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] - (2 * data_D[start1 + inc1 * static_cast<vcl_size_t>(j)] * sum_Av);
1625  }
1626  }
1627  else
1628  {
 // Same computation for column-major storage.
1629  for(vcl_size_t i = 0; i < A_size1; i++)
1630  {
1631  ss = 0;
1632  for(vcl_size_t j = 0; j < A_size2; j++) // ss = ss + D[j] * A(i, j)
1633  ss = ss + (data_D[start1 + inc1 * j] * data_A[viennacl::column_major::mem_index((i) * A_inc1 + A_start1, (j) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]);
1634 
1635  NumericT sum_Av = ss;
1636 #ifdef VIENNACL_WITH_OPENMP
1637  #pragma omp parallel for
1638 #endif
1639  for(long j = 0; j < static_cast<long>(A_size2); j++) // A(i, j) = A(i, j) - 2 * D[j] * sum_Av
1640  data_A[viennacl::column_major::mem_index((i) * A_inc1 + A_start1, static_cast<vcl_size_t>(j) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] =
1641  data_A[viennacl::column_major::mem_index((i) * A_inc1 + A_start1, static_cast<vcl_size_t>(j) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] - (2 * data_D[start1 + inc1 * static_cast<vcl_size_t>(j)] * sum_Av);
1642  }
1643  }
1644 
1645 
1646  }
1647 
1654  template <typename NumericT>
 // Updates the accumulated transformation matrix Q (needed to recover the
 // eigenvectors): forms the reflector P = I - 2 * D * D^T and sets
 // Q <- Q * P. With beta = 2 and flip_sign_alpha = 1, the call to
 // scaled_rank_1_update below adds -2 * D * D^T to vcl_P.
 // NOTE(review): the doxygen extraction dropped the signature lines
 // (matrix_base<NumericT> & Q, vector_base<NumericT> & D) and line 1661,
 // which presumably initializes vcl_P (identity matrix of order A_size1) —
 // confirm against the original source.
1657  vcl_size_t A_size1)
1658 
1659  {
1660  NumericT beta = 2;
 // Work on copies: Q is both input and output of the product below.
1662  viennacl::matrix<NumericT> Q_temp = Q;
1663  viennacl::vector<NumericT> vcl_D = D;
1664 
1665 
1666  viennacl::linalg::host_based::scaled_rank_1_update(vcl_P, beta, 1, 0, 1, vcl_D, vcl_D);
1667  Q = prod(Q_temp, vcl_P);
1668 
1669  }
1670 
1680  template<typename NumericT>
 // Applies the plane rotations of one tql2 sweep to Q: for i = m-1 down to
 // l, columns (i, i+1) of Q are rotated with the coefficient pairs stored in
 // tmp1[i]/tmp2[i] (see the per-statement comments inside the loops).
 // NOTE(review): the doxygen extraction dropped the opening signature line
 // (matrix_base<NumericT> & Q) and the declarations of Q_inc1/Q_inc2 and of
 // start1/start2 (start offsets of tmp1/tmp2); the code below uses them.
1682  vector_base<NumericT> & tmp1,
1683  vector_base<NumericT> & tmp2,
1684  int l,
1685  int m
1686  )
1687  {
1688  typedef NumericT value_type;
1689 
1690  value_type * data_Q = detail::extract_raw_pointer<value_type>(Q);
1691  value_type * data_tmp1 = detail::extract_raw_pointer<value_type>(tmp1);
1692  value_type * data_tmp2 = detail::extract_raw_pointer<value_type>(tmp2);
1693 
1694  vcl_size_t Q_start1 = viennacl::traits::start1(Q);
1695  vcl_size_t Q_start2 = viennacl::traits::start2(Q);
1698  vcl_size_t Q_size1 = viennacl::traits::size1(Q);
1699  vcl_size_t Q_internal_size1 = viennacl::traits::internal_size1(Q);
1700  vcl_size_t Q_internal_size2 = viennacl::traits::internal_size2(Q);
1701 
1703  vcl_size_t inc1 = viennacl::traits::stride(tmp1);
1704 
1706  vcl_size_t inc2 = viennacl::traits::stride(tmp2);
1707 
1708  if (Q.row_major())
1709  {
 // Rotations are applied in reverse order (i = m-1 .. l); rows of Q are
 // independent, hence the parallel inner loop.
1710  for( int i = m - 1; i >= l; i--)
1711  {
1712 #ifdef VIENNACL_WITH_OPENMP
1713  #pragma omp parallel for
1714 #endif
1715  for(long k = 0; k < static_cast<long>(Q_size1); k++)
1716  {
1717 
1718  // h = data_Q(k, i+1);
1719  NumericT h = data_Q[viennacl::row_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i + 1) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)];
1720 
1721  // Q(k, i+1) = tmp2[i] * Q(k, i) + tmp1[i]*h;
1722  data_Q[viennacl::row_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i + 1) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)] = data_tmp2[start2 + inc2 * vcl_size_t(i)] *
1723  data_Q[viennacl::row_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)] + data_tmp1[start1 + inc1 * vcl_size_t(i)] * h;
1724 
1725  // Q(k, i) = tmp1[i] * Q(k, i) - tmp2[i]*h;
1726  data_Q[viennacl::row_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)] = data_tmp1[start1 + inc1 * vcl_size_t(i)] *
1727  data_Q[viennacl::row_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)] - data_tmp2[start2 + inc2 * vcl_size_t(i)]*h;
1728  }
1729  }
1730  }
1731  else // column_major
1732  {
 // Identical update, only the memory indexing differs.
1733  for( int i = m - 1; i >= l; i--)
1734  {
1735 #ifdef VIENNACL_WITH_OPENMP
1736  #pragma omp parallel for
1737 #endif
1738  for(long k = 0; k < static_cast<long>(Q_size1); k++)
1739  {
1740 
1741  // h = data_Q(k, i+1);
1742  NumericT h = data_Q[viennacl::column_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i + 1) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)];
1743 
1744  // Q(k, i+1) = tmp2[i] * Q(k, i) + tmp1[i]*h;
1745  data_Q[viennacl::column_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i + 1) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)] = data_tmp2[start2 + inc2 * vcl_size_t(i)] *
1746  data_Q[viennacl::column_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)] + data_tmp1[start1 + inc1 * vcl_size_t(i)] * h;
1747 
1748  // Q(k, i) = tmp1[i] * Q(k, i) - tmp2[i]*h;
1749  data_Q[viennacl::column_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)] = data_tmp1[start1 + inc1 * vcl_size_t(i)] *
1750  data_Q[viennacl::column_major::mem_index(static_cast<vcl_size_t>(k) * Q_inc1 + Q_start1, vcl_size_t(i) * Q_inc2 + Q_start2, Q_internal_size1, Q_internal_size2)] - data_tmp2[start2 + inc2 * vcl_size_t(i)]*h;
1751  }
1752  }
1753  }
1754 
1755  }
1756 
1757 
1767  template <typename NumericT, typename S1>
 // Copies part of one column (copy_col == true) or one row
 // (copy_col == false) of A into the dense vector V:
 //   copy_col:  V[i - row_start] = A(i, col_start)  for i in [row_start, A_size1)
 //   else:      V[i - col_start] = A(row_start, i)  for i in [col_start, A_size1)
 // NOTE(review): the doxygen extraction dropped the opening signature line
 // (matrix_base<NumericT> & A) and the declarations of A_inc1/A_inc2.
 // NOTE(review): the row branch also uses A_size1 (the number of rows) as
 // the column bound — this only covers the whole row for square matrices;
 // confirm the intent against the original source.
1769  vector_base<S1> & V,
1770  vcl_size_t row_start,
1771  vcl_size_t col_start,
1772  bool copy_col
1773  )
1774  {
1775  typedef NumericT value_type;
1776 
1777  value_type * data_A = detail::extract_raw_pointer<value_type>(A);
1778  value_type * data_V = detail::extract_raw_pointer<value_type>(V);
1779 
1780  vcl_size_t A_start1 = viennacl::traits::start1(A);
1781  vcl_size_t A_start2 = viennacl::traits::start2(A);
1784  vcl_size_t A_size1 = viennacl::traits::size1(A);
1785  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
1786  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
1787 
1788 
1789  if(copy_col)
1790  {
1791  if (A.row_major())
1792  {
1793 #ifdef VIENNACL_WITH_OPENMP
1794  #pragma omp parallel for
1795 #endif
1796  for(long i = static_cast<long>(row_start); i < static_cast<long>(A_size1); i++)
1797  {
1798  data_V[i - static_cast<long>(row_start)] = data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(i) * A_inc1 + A_start1, col_start * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1799  }
1800  }
1801  else
1802  {
1803 #ifdef VIENNACL_WITH_OPENMP
1804  #pragma omp parallel for
1805 #endif
1806  for(long i = static_cast<long>(row_start); i < static_cast<long>(A_size1); i++)
1807  {
1808  data_V[i - static_cast<long>(row_start)] = data_A[viennacl::column_major::mem_index(static_cast<vcl_size_t>(i) * A_inc1 + A_start1, col_start * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1809  }
1810  }
1811  }
1812  else
1813  {
 // Row extraction: i runs over column indices here.
1814  if (A.row_major())
1815  {
1816 #ifdef VIENNACL_WITH_OPENMP
1817  #pragma omp parallel for
1818 #endif
1819  for(long i = static_cast<long>(col_start); i < static_cast<long>(A_size1); i++)
1820  {
1821  data_V[i - static_cast<long>(col_start)] = data_A[viennacl::row_major::mem_index(row_start * A_inc1 + A_start1, static_cast<vcl_size_t>(i) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1822  }
1823  }
1824  else
1825  {
1826 #ifdef VIENNACL_WITH_OPENMP
1827  #pragma omp parallel for
1828 #endif
1829  for(long i = static_cast<long>(col_start); i < static_cast<long>(A_size1); i++)
1830  {
1831  data_V[i - static_cast<long>(col_start)] = data_A[viennacl::column_major::mem_index(row_start * A_inc1 + A_start1, static_cast<vcl_size_t>(i) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)];
1832  }
1833  }
1834  }
1835  }
1836 
1843  template<typename NumericT>
 // Inclusive scan (prefix sum): vec2[i] = vec1[0] + ... + vec1[i].
 // Sequential by necessity — each output depends on the previous one.
 // NOTE(review): the doxygen extraction dropped the opening signature line
 // (vector_base<NumericT> & vec1) and the declarations of start1, size1
 // and start2 used below.
 // Assumes size1 >= 1: the first element is written unconditionally.
1845  vector_base<NumericT>& vec2)
1846  {
1848  vcl_size_t inc1 = viennacl::traits::stride(vec1);
1850 
1852  vcl_size_t inc2 = viennacl::traits::stride(vec2);
1853 
1854  vec2[start2] = vec1[start1];
1855  for(vcl_size_t i = 1; i < size1; i++)
1856  {
1857  vec2[i * inc2 + start2] = vec2[(i - 1) * inc2 + start2] + vec1[i * inc1 + start1];
1858 
1859  }
1860  }
1861 
1868  template<typename NumericT>
 // Exclusive scan: vec2[0] = 0 and vec2[i] = vec1[0] + ... + vec1[i-1].
 // Sequential by necessity — each output depends on the previous one.
 // NOTE(review): the doxygen extraction dropped the opening signature line
 // (vector_base<NumericT> & vec1) and the declarations of start1, size1
 // and start2 used below.
 // Assumes size1 >= 1: the first element is written unconditionally.
1870  vector_base<NumericT>& vec2)
1871  {
1873  vcl_size_t inc1 = viennacl::traits::stride(vec1);
1875 
1877  vcl_size_t inc2 = viennacl::traits::stride(vec2);
1878 
1879 
1880  vec2[start2] = 0;
1881  for(vcl_size_t i = 1; i < size1; i++)
1882  {
1883  vec2[i * inc2 + start2] = vec2[(i - 1) * inc2 + start2] + vec1[(i - 1) * inc1 + start1];
1884 
1885  }
1886  }
1887 
1888 } // namespace host_based
1889 } //namespace linalg
1890 } //namespace viennacl
1891 
1892 
1893 #endif
void fill(MatrixType &matrix, vcl_size_t row_index, vcl_size_t col_index, NumericT value)
Generic filler routine for setting an entry of a matrix to a particular value.
Definition: fill.hpp:46
static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t, vcl_size_t num_cols)
Returns the memory offset for entry (i,j) of a dense matrix.
Definition: forwards.h:313
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
void bidiag_pack_impl(matrix_base< NumericT > &A, vector_base< S1 > &D, vector_base< S1 > &S)
This function stores the diagonal and the superdiagonal of a matrix in two vectors.
void matrix_diag_to_vector(const matrix_base< NumericT > &mat, int k, vector_base< NumericT > &vec)
void exclusive_scan(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
This function implements an exclusive scan.
Generic size and resize functionality for different vector and matrix types.
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
Various little tools used here and there in ViennaCL.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
void matrix_assign(matrix_base< NumericT > &mat, NumericT s, bool clear=false)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
Worker class for decomposing expression templates.
Definition: op_applier.hpp:43
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:340
size_type stride2() const
Returns the number of columns.
Definition: matrix_def.hpp:225
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
void clear(VectorType &vec)
Generic routine for setting all entries of a vector to zero. This is the version for non-ViennaCL obj...
Definition: clear.hpp:57
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
A dense matrix class.
Definition: forwards.h:374
Determines row and column increments for matrices and matrix proxies.
Represents a vector consisting of 1 at a given index and zeros otherwise. To be used as an initialize...
Definition: matrix_def.hpp:69
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
void house_update_A_right(matrix_base< NumericT > &A, vector_base< NumericT > &D)
This function applies a householder transformation to a matrix: A <- A * P with a householder reflect...
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
Definition: prod.hpp:91
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
Helper array for accessing a strided submatrix embedded in a larger matrix.
Definition: common.hpp:73
void copy_vec(matrix_base< NumericT > &A, vector_base< S1 > &V, vcl_size_t row_start, vcl_size_t col_start, bool copy_col)
This function copies a row or a column from a matrix to a vector.
void prod_impl(const matrix_base< NumericT > &mat, bool trans, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
void ambm_m(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT > const &mat3, ScalarT2 const &beta, vcl_size_t, bool reciprocal_beta, bool flip_sign_beta)
void matrix_column(const matrix_base< NumericT > &mat, unsigned int j, vector_base< NumericT > &vec)
void matrix_diagonal_assign(matrix_base< NumericT > &mat, NumericT s)
void element_op(matrix_base< NumericT > &A, matrix_expression< const matrix_base< NumericT >, const matrix_base< NumericT >, op_element_binary< OpT > > const &proxy)
Implementation of the element-wise operations A = B .* C and A = B ./ C (using MATLAB syntax) ...
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
void house_update_QL(matrix_base< NumericT > &Q, vector_base< NumericT > &D, vcl_size_t A_size1)
This function updates the matrix Q, which is needed for the computation of the eigenvectors.
size_type stride1() const
Returns the number of rows.
Definition: matrix_def.hpp:223
void ambm(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT > const &mat3, ScalarT2 const &beta, vcl_size_t, bool reciprocal_beta, bool flip_sign_beta)
std::size_t vcl_size_t
Definition: forwards.h:74
Common routines for single-threaded or OpenMP-enabled execution on CPU.
Proxy classes for vectors.
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
void scaled_rank_1_update(matrix_base< NumericT > &mat1, ScalarT const &alpha, vcl_size_t, bool reciprocal_alpha, bool flip_sign_alpha, const vector_base< NumericT > &vec1, const vector_base< NumericT > &vec2)
The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update...
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:853
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void matrix_row(const matrix_base< NumericT > &mat, unsigned int i, vector_base< NumericT > &vec)
void am(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t, bool reciprocal_alpha, bool flip_sign_alpha)
void inclusive_scan(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
This function implements an inclusive scan.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
bool row_major() const
Definition: matrix_def.hpp:239
void givens_next(matrix_base< NumericT > &Q, vector_base< NumericT > &tmp1, vector_base< NumericT > &tmp2, int l, int m)
This function updates the matrix Q. It is part of the tql2 algorithm.
void trans(const matrix_expression< const matrix_base< NumericT, SizeT, DistanceT >, const matrix_base< NumericT, SizeT, DistanceT >, op_trans > &proxy, matrix_base< NumericT > &temp_trans)
void bidiag_pack(matrix_base< NumericT > &A, VectorType &dh, VectorType &sh)
A tag class representing transposed matrices.
Definition: forwards.h:219
size_type start2() const
Returns the number of columns.
Definition: matrix_def.hpp:221
void house_update_A_left(matrix_base< NumericT > &A, vector_base< NumericT > &D, vcl_size_t start)
This function applies a householder transformation to a matrix. A <- P * A with a householder reflect...
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
Definition: forwards.h:129
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:231
void prod(MatrixAccT1 &A, MatrixAccT2 &B, MatrixAccT3 &C, vcl_size_t C_size1, vcl_size_t C_size2, vcl_size_t A_size2, NumericT alpha, NumericT beta)
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:229
static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t)
Returns the memory offset for entry (i,j) of a dense matrix.
Definition: forwards.h:330
T min(const T &lhs, const T &rhs)
Minimum.
Definition: util.hpp:45
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
void matrix_diag_from_vector(const vector_base< NumericT > &vec, int k, matrix_base< NumericT > &mat)
Defines the action of certain unary and binary operators and its arguments (for host execution)...
A tag class representing element-wise unary operations (like sin()) on vectors or matrices...
Definition: forwards.h:133
Implementation of the ViennaCL scalar class.
A collection of compile time type deductions.
Simple enable-if variant that uses the SFINAE pattern.
size_type start1() const
Returns the number of rows.
Definition: matrix_def.hpp:219