ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
blas3range.cpp
Go to the documentation of this file.
1 /* =========================================================================
2  Copyright (c) 2010-2014, Institute for Microelectronics,
3  Institute for Analysis and Scientific Computing,
4  TU Wien.
5  Portions of this software are copyright by UChicago Argonne, LLC.
6 
7  -----------------
8  ViennaCL - The Vienna Computing Library
9  -----------------
10 
11  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
12 
13  (A list of authors and contributors can be found in the PDF manual)
14 
15  License: MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17 
18 /*
19 *
20 * Tutorial: BLAS level 3 functionality on sub-matrices (blas3range.cpp and blas3range.cu are identical, the latter being required for compilation using CUDA nvcc)
21 *
22 */
23 
24 //disable debug mechanisms to have a fair comparison with ublas:
25 #ifndef NDEBUG
26  #define NDEBUG
27 #endif
28 
29 
//
// include necessary system headers
//
#include <cmath>     // std::fabs (result verification)
#include <cstdlib>   // EXIT_SUCCESS
#include <iostream>
#include <vector>

//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/triangular.hpp>

// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1

//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/matrix_proxy.hpp"   // viennacl::range / viennacl::matrix_range used below
#include "viennacl/linalg/prod.hpp"

// Some helper functions for this tutorial:
#include "Random.hpp"
#include "vector-io.hpp"

#include "../benchmarks/benchmark-utils.hpp"
64 
65 #define BLAS3_MATRIX_SIZE 1500
66 
67 using namespace boost::numeric;
68 
69 int main()
70 {
71  typedef float ScalarType;
72 
73  Timer timer;
74  double exec_time;
75 
76  //
77  // Set up some ublas objects
78  //
79  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
80  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
81  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
82  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
83  ublas::matrix<ScalarType> ublas_C2(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
84 
85  //
86  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
87  //
88  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
89  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
90  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
91 
92  //
93  // Fill the matrix
94  //
95  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
96  for (unsigned int j = 0; j < ublas_A.size2(); ++j)
97  {
98  ublas_A(i,j) = random<ScalarType>();
99  stl_A[i*ublas_A.size2() + j] = ublas_A(i,j);
100  }
101 
102  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
103  for (unsigned int j = 0; j < ublas_B.size2(); ++j)
104  {
105  ublas_B(i,j) = random<ScalarType>();
106  stl_B[i + j*ublas_B.size1()] = ublas_B(i,j);
107  }
108 
109  ublas::range ublas_r1(1, BLAS3_MATRIX_SIZE-1);
110  ublas::range ublas_r2(2, BLAS3_MATRIX_SIZE-2);
111  ublas::matrix_range< ublas::matrix<ScalarType> > ublas_A_sub(ublas_A, ublas_r1, ublas_r2);
112  ublas::matrix_range< ublas::matrix<ScalarType, ublas::column_major> > ublas_B_sub(ublas_B, ublas_r2, ublas_r1);
113  ublas::matrix_range< ublas::matrix<ScalarType> > ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
114 
115  //
116  // Set up some ViennaCL objects
117  //
118  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag()); //uncomment this is you wish to use GPUs only
119  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
120  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
121  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
122 
123  viennacl::range vcl_r1(1, BLAS3_MATRIX_SIZE-1);
124  viennacl::range vcl_r2(2, BLAS3_MATRIX_SIZE-2);
125  viennacl::matrix_range< viennacl::matrix<ScalarType> > vcl_A_sub(vcl_A, vcl_r1, vcl_r2);
127  viennacl::matrix_range< viennacl::matrix<ScalarType> > vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
128 
129  ublas_C.clear();
130  viennacl::copy(ublas_C, vcl_C);
131 
135 
136  //
137  // Compute reference product using ublas:
138  //
139  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
140  timer.start();
141  ublas_C_sub = ublas::prod(ublas_A_sub, ublas_B_sub);
142  exec_time = timer.get();
143  std::cout << " - Execution time: " << exec_time << std::endl;
144 
145  //std::cout << ublas_C << std::endl;
146 
147  //
148  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
149  //
150  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
151  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
152  for (std::size_t i=0; i<devices.size(); ++i)
153  {
155  std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
156 
157  //viennacl::copy(ublas_A, vcl_A);
158  //viennacl::copy(ublas_B, vcl_B);
159  viennacl::fast_copy(&(stl_A[0]),
160  &(stl_A[0]) + stl_A.size(),
161  vcl_A);
162  viennacl::fast_copy(&(stl_B[0]),
163  &(stl_B[0]) + stl_B.size(),
164  vcl_B);
165  vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
167  timer.start();
168  vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
170  exec_time = timer.get();
171  std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
172  std::cout << " - GFLOPs: " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
173 
174  //std::cout << vcl_C << std::endl;
175 
176  //
177  // Verify the result
178  //
179  //viennacl::copy(vcl_C, ublas_C1);
180  viennacl::fast_copy(vcl_C, &(stl_C[0]));
181  for (unsigned int i = 0; i < ublas_C1.size1(); ++i)
182  for (unsigned int j = 0; j < ublas_C1.size2(); ++j)
183  ublas_C1(i,j) = stl_C[i * ublas_C1.size2() + j];
184 
185  std::cout << " - Checking result... ";
186  bool check_ok = true;
187  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
188  {
189  for (unsigned int j = 0; j < ublas_A.size2(); ++j)
190  {
191  if ( fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
192  {
193  check_ok = false;
194  break;
195  }
196  }
197  if (!check_ok)
198  break;
199  }
200  if (check_ok)
201  std::cout << "[OK]" << std::endl << std::endl;
202  else
203  std::cout << "[FAILED]" << std::endl << std::endl;
204 
205  }
206 
207  //
208  // That's it.
209  //
210  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
211  return EXIT_SUCCESS;
212 }
213 
void finish() const
Waits until all kernels in the queue have finished their execution.
void switch_device(vcl_size_t i)
Switches the current device to the i-th device in this context.
Definition: context.hpp:118
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations.hpp and viennacl/linalg/matrix_operations.hpp for the underlying implementations.
Implementation of the dense matrix class.
viennacl::ocl::context & current_context()
Convenience function for returning the current context.
Definition: backend.hpp:213
void start()
A dense matrix class.
Definition: forwards.h:374
double get() const
viennacl::ocl::device const & current_device()
Convenience function for returning the active device in the current context.
Definition: backend.hpp:351
basic_range range
Definition: forwards.h:423
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
Definition: prod.hpp:91
viennacl::ocl::command_queue & get_queue()
Convenience function for getting the default queue for the currently active device in the active context.
Definition: backend.hpp:320
size_type size2() const
Returns the number of columns.
Definition: matrix_def.hpp:217
void prod(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
std::string name() const
Device name string.
Definition: device.hpp:566
size_type size1() const
Returns the number of rows.
Definition: matrix_def.hpp:215
#define BLAS3_MATRIX_SIZE
Definition: blas3range.cpp:65
Proxy classes for matrices.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
A range class that refers to an interval [start, stop), where 'start' is included, and 'stop' is excluded.
Definition: forwards.h:423
float ScalarType
Definition: fft_1d.cpp:42
int main()
Definition: blas3range.cpp:69
Class for representing non-strided submatrices of a bigger matrix A.
Definition: forwards.h:439
std::vector< viennacl::ocl::device > const & devices() const
Returns a vector with all devices in this context.
Definition: context.hpp:105
Implementation of the ViennaCL scalar class.
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)