ViennaCL - The Vienna Computing Library  1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sparse.cpp
Go to the documentation of this file.
1 /* =========================================================================
2  Copyright (c) 2010-2014, Institute for Microelectronics,
3  Institute for Analysis and Scientific Computing,
4  TU Wien.
5  Portions of this software are copyright by UChicago Argonne, LLC.
6 
7  -----------------
8  ViennaCL - The Vienna Computing Library
9  -----------------
10 
11  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
12 
13  (A list of authors and contributors can be found in the PDF manual)
14 
15  License: MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17 
18 
19 /*
20 * Benchmark: Sparse matrix operations, i.e. matrix-vector products (sparse.cpp and sparse.cu are identical, the latter being required for compilation using CUDA nvcc)
21 *
22 */
23 
24 //#define VIENNACL_BUILD_INFO
25 #ifndef NDEBUG
26  #define NDEBUG
27 #endif
28 
29 #define VIENNACL_WITH_UBLAS 1
30 
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/operation_sparse.hpp>
#include <boost/numeric/ublas/lu.hpp>


#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/coordinate_matrix.hpp"
#include "viennacl/ell_matrix.hpp"
#include "viennacl/hyb_matrix.hpp"
#include "viennacl/sliced_ell_matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/norm_2.hpp"
#include "viennacl/io/matrix_market.hpp"
#include "viennacl/linalg/ilu.hpp"


#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
#include "benchmark-utils.hpp"
55 
56 
57 #define BENCHMARK_RUNS 10
58 
59 
60 template<typename ScalarType>
62 {
63  Timer timer;
64  double exec_time;
65 
66  //ScalarType std_result = 0;
67 
68  ScalarType std_factor1 = ScalarType(3.1415);
69  ScalarType std_factor2 = ScalarType(42.0);
70  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
71  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
72 
73  boost::numeric::ublas::vector<ScalarType> ublas_vec1;
74  boost::numeric::ublas::vector<ScalarType> ublas_vec2;
75 
76  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
77  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
78  {
79  std::cout << "Error reading Matrix file" << std::endl;
80  return 0;
81  }
82  //unsigned int cg_mat_size = cg_mat.size();
83  std::cout << "done reading matrix" << std::endl;
84 
85  ublas_vec1 = boost::numeric::ublas::scalar_vector<ScalarType>(ublas_matrix.size1(), ScalarType(1.0));
86  ublas_vec2 = ublas_vec1;
87 
88  viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
89  viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
90  viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
91 
92  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
93 
94  viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
95  viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
96  viennacl::sliced_ell_matrix<ScalarType> vcl_sliced_ell_matrix_1;
97 
98  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
99  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size());
100 
101  //cpu to gpu:
102  viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
103  #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
104  viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
105  viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
106  #endif
107  viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
108  viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
109  viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
110  viennacl::copy(ublas_matrix, vcl_sliced_ell_matrix_1);
111  viennacl::copy(ublas_vec1, vcl_vec1);
112  viennacl::copy(ublas_vec2, vcl_vec2);
113 
114 
116 
117  std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
118  timer.start();
119  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
120  {
121  //ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
122  boost::numeric::ublas::axpy_prod(ublas_matrix, ublas_vec2, ublas_vec1, true);
123  }
124  exec_time = timer.get();
125  std::cout << "CPU time: " << exec_time << std::endl;
126  std::cout << "CPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
127  std::cout << ublas_vec1[0] << std::endl;
128 
129 
130  std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
131 
132 
133  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
134  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
135  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
136  //std_result = 0.0;
137 
139  timer.start();
140  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
141  {
142  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
143  }
145  exec_time = timer.get();
146  std::cout << "GPU time align1: " << exec_time << std::endl;
147  std::cout << "GPU align1 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
148  std::cout << vcl_vec1[0] << std::endl;
149 
150  std::cout << "Testing triangular solves: compressed_matrix" << std::endl;
151 
152  viennacl::copy(ublas_vec1, vcl_vec1);
153  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
154  viennacl::copy(ublas_vec1, vcl_vec1);
155  std::cout << "ublas..." << std::endl;
156  timer.start();
157  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), ublas_vec1, boost::numeric::ublas::unit_lower_tag());
158  std::cout << "Time elapsed: " << timer.get() << std::endl;
159  std::cout << "ViennaCL..." << std::endl;
161  timer.start();
162  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
164  std::cout << "Time elapsed: " << timer.get() << std::endl;
165 
166  ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
167 
169  timer.start();
170  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
171  {
172  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
173  }
175  exec_time = timer.get();
176  std::cout << "GPU time align4: " << exec_time << std::endl;
177  std::cout << "GPU align4 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
178  std::cout << vcl_vec1[0] << std::endl;
179 
181  timer.start();
182  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
183  {
184  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
185  }
187  exec_time = timer.get();
188  std::cout << "GPU time align8: " << exec_time << std::endl;
189  std::cout << "GPU align8 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
190  std::cout << vcl_vec1[0] << std::endl;
191 
192 
193  std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
194  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
196 
197  viennacl::copy(vcl_vec1, ublas_vec2);
198  long err_cnt = 0;
199  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
200  {
201  if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
202  {
203  std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
204  ++err_cnt;
205  if (err_cnt > 5)
206  break;
207  }
208  }
209 
211  timer.start();
212  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
213  {
214  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
215  }
217  exec_time = timer.get();
218  std::cout << "GPU time: " << exec_time << std::endl;
219  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
220  std::cout << vcl_vec1[0] << std::endl;
221 
222 
223  std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
224  vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
226 
227  viennacl::copy(vcl_vec1, ublas_vec2);
228  err_cnt = 0;
229  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
230  {
231  if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
232  {
233  std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
234  ++err_cnt;
235  if (err_cnt > 5)
236  break;
237  }
238  }
239 
241  timer.start();
242  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
243  {
244  vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
245  }
247  exec_time = timer.get();
248  std::cout << "GPU time: " << exec_time << std::endl;
249  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
250  std::cout << vcl_vec1[0] << std::endl;
251 
252 
253  std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
254  vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
256 
257  viennacl::copy(vcl_vec1, ublas_vec2);
258  err_cnt = 0;
259  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
260  {
261  if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
262  {
263  std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
264  ++err_cnt;
265  if (err_cnt > 5)
266  break;
267  }
268  }
269 
271  timer.start();
272  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
273  {
274  vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
275  }
277  exec_time = timer.get();
278  std::cout << "GPU time: " << exec_time << std::endl;
279  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
280  std::cout << vcl_vec1[0] << std::endl;
281 
282 
283  std::cout << "------- Matrix-Vector product with sliced_ell_matrix ----------" << std::endl;
284  vcl_vec1 = viennacl::linalg::prod(vcl_sliced_ell_matrix_1, vcl_vec2); //startup calculation
286 
287  viennacl::copy(vcl_vec1, ublas_vec2);
288  err_cnt = 0;
289  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
290  {
291  if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
292  {
293  std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
294  ++err_cnt;
295  if (err_cnt > 5)
296  break;
297  }
298  }
299 
301  timer.start();
302  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
303  {
304  vcl_vec1 = viennacl::linalg::prod(vcl_sliced_ell_matrix_1, vcl_vec2);
305  }
307  exec_time = timer.get();
308  std::cout << "GPU time: " << exec_time << std::endl;
309  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
310  std::cout << vcl_vec1[0] << std::endl;
311 
312  return EXIT_SUCCESS;
313 }
314 
315 
316 int main()
317 {
318  std::cout << std::endl;
319  std::cout << "----------------------------------------------" << std::endl;
320  std::cout << " Device Info" << std::endl;
321  std::cout << "----------------------------------------------" << std::endl;
322 
323 #ifdef VIENNACL_WITH_OPENCL
324  std::cout << viennacl::ocl::current_device().info() << std::endl;
325 #endif
326  std::cout << std::endl;
327  std::cout << "----------------------------------------------" << std::endl;
328  std::cout << "----------------------------------------------" << std::endl;
329  std::cout << "## Benchmark :: Sparse" << std::endl;
330  std::cout << "----------------------------------------------" << std::endl;
331  std::cout << std::endl;
332  std::cout << " -------------------------------" << std::endl;
333  std::cout << " # benchmarking single-precision" << std::endl;
334  std::cout << " -------------------------------" << std::endl;
335  run_benchmark<float>();
336 #ifdef VIENNACL_WITH_OPENCL
338 #endif
339  {
340  std::cout << std::endl;
341  std::cout << " -------------------------------" << std::endl;
342  std::cout << " # benchmarking double-precision" << std::endl;
343  std::cout << " -------------------------------" << std::endl;
344  run_benchmark<double>();
345  }
346  return 0;
347 }
348 
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:405
void inplace_solve(const matrix_base< NumericT > &A, matrix_base< NumericT > &B, SolverTagT)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
A reader and writer for the matrix market format is implemented here.
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:226
Generic interface for the l^2-norm. See viennacl/linalg/vector_operations.hpp for implementations...
int run_benchmark()
Definition: sparse.cpp:61
void trans(matrix_expression< const matrix_base< NumericT, SizeT, DistanceT >, const matrix_base< NumericT, SizeT, DistanceT >, op_trans > const &proxy, matrix_base< NumericT > &temp_trans)
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...
int main()
Definition: sparse.cpp:316
void finish()
Synchronizes the execution. finish() will only return after all compute kernels (CUDA, OpenCL) have completed.
Definition: memory.hpp:54
void start()
double get() const
T max(const T &lhs, const T &rhs)
Maximum.
Definition: util.hpp:59
viennacl::ocl::device const & current_device()
Convenience function for returning the active device in the current context.
Definition: backend.hpp:351
void printOps(double num_ops, double exec_time)
Implementation of the coordinate_matrix class.
std::string info(vcl_size_t indent=0, char indent_char= ' ') const
Returns an info string with a few properties of the device. Use full_info() to get all details...
Definition: device.hpp:995
Implementation of the hyb_matrix class.
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
Definition: prod.hpp:91
#define BENCHMARK_RUNS
Definition: sparse.cpp:57
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
Implementations of incomplete factorization preconditioners. Convenience header file.
void inplace_solve(const matrix_base< NumericT > &A, bool trans_A, matrix_base< NumericT > &B, bool trans_B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
Sparse matrix class using the sliced ELLPACK with parameters C, .
Definition: forwards.h:402
Implementation of the compressed_matrix class.
Implementation of the sliced_ell_matrix class.
bool double_support() const
ViennaCL convenience function: Returns true if the device supports double precision.
Definition: device.hpp:956
Implementation of the ell_matrix class.
void prod(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
float ScalarType
Definition: fft_1d.cpp:42
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:819
A sparse square matrix in compressed sparse rows format.
long read_matrix_market_file(MatrixT &mat, const char *file, long index_base=1)
Reads a sparse matrix from a file (MatrixMarket format)
Implementation of the ViennaCL scalar class.
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...