ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
iterative_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include <cmath>
26 
27 #include "viennacl/forwards.h"
29 #include "viennacl/ocl/device.hpp"
30 #include "viennacl/ocl/handle.hpp"
31 #include "viennacl/ocl/kernel.hpp"
32 #include "viennacl/scalar.hpp"
33 #include "viennacl/tools/tools.hpp"
39 #include "viennacl/traits/size.hpp"
43 
44 namespace viennacl
45 {
46 namespace linalg
47 {
48 namespace opencl
49 {
50 
// Fused vector update for the pipelined CG solver. Presumably performs
// result += alpha*p; r -= alpha*Ap; p = r + beta*p and accumulates partial
// (r,r) sums into inner_prod_buffer in one kernel launch — TODO confirm
// against the OpenCL kernel source.
// NOTE(review): the doc extractor dropped source lines here (the signature's
// first line and the kernel lookup defining `k`); restore from the original
// iterative_operations.hpp before compiling.
51 template<typename NumericT>
53  NumericT alpha,
56  vector_base<NumericT> const & Ap,
57  NumericT beta,
58  vector_base<NumericT> & inner_prod_buffer)
59 {
  // All vectors live in the same OpenCL context as 'result'.
60  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
62 
64  cl_uint vec_size = cl_uint(viennacl::traits::size(result));
65 
  // One local-memory scratch buffer (one NumericT per work item) for the reduction.
66  viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer, vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
67 }
68 
// Pipelined CG: Ap = A*p for a compressed_matrix (CSR with row-block info,
// cf. blocks1()/handle3()), fused with partial inner products written to
// inner_prod_buffer. The buffer holds 3 equally sized chunks (hence the /3).
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
69 template<typename NumericT>
71  vector_base<NumericT> const & p,
73  vector_base<NumericT> & inner_prod_buffer)
74 {
75  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
77 
79 
80  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
81  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
82 
  // Fixed launch configuration: 128 work groups of 128 threads.
83  k.local_work_size(0, 128);
84  k.global_work_size(0, 128*128);
85  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
86  p,
87  Ap,
88  vec_size,
89  inner_prod_buffer,
90  buffer_size_per_vector,
  // Two per-work-group scratch buffers for the fused reductions.
91  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
92  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
93  ));
94 
95 }
96 
// Pipelined CG: Ap = A*p for a coordinate_matrix (COO; handle12() is the
// fused (row,col) index array), fused with partial inner products.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
97 template<typename NumericT>
99  vector_base<NumericT> const & p,
101  vector_base<NumericT> & inner_prod_buffer)
102 {
103  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
105 
106  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
107  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
108 
  // COO kernel scatters contributions, so Ap must start out zeroed.
109  Ap.clear();
110 
112  unsigned int thread_num = 256; //k.local_work_size(0);
113 
114  k.local_work_size(0, thread_num);
115 
116  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
117 
118  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
119  p,
120  Ap,
121  vec_size,
  // Scratch: per-thread row indices (cl_uint) and values (NumericT) for the COO segmented reduction.
122  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
123  viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
124  inner_prod_buffer,
125  buffer_size_per_vector,
126  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
127  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
128  ));
129 }
130 
// Pipelined CG: Ap = A*p for an ell_matrix (ELLPACK: column-index array in
// handle2(), values in handle(), padded to maxnnz per row), fused with
// partial inner products.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
131 template<typename NumericT>
133  vector_base<NumericT> const & p,
135  vector_base<NumericT> & inner_prod_buffer)
136 {
137  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
139 
140  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
141  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
142 
144 
145  unsigned int thread_num = 128;
146  unsigned int group_num = 256;
147 
148  k.local_work_size(0, thread_num);
149  k.global_work_size(0, thread_num * group_num);
150 
151  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
152  A.handle().opencl_handle(),
153  cl_uint(A.internal_size1()),
154  cl_uint(A.maxnnz()),
155  cl_uint(A.internal_maxnnz()),
156  viennacl::traits::opencl_handle(p),
157  viennacl::traits::opencl_handle(Ap),
158  vec_size,
159  inner_prod_buffer,
160  buffer_size_per_vector,
161  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
162  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
163  )
164  );
165 }
166 
// Pipelined CG: Ap = A*p for a sliced_ell_matrix; the local work size must
// equal the matrix's slice height (rows_per_block()).
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
167 template<typename NumericT>
169  vector_base<NumericT> const & p,
171  vector_base<NumericT> & inner_prod_buffer)
172 {
173  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
175 
176  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
177  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
178 
180 
  // Work-group size is dictated by the matrix layout, not a tuning constant.
181  unsigned int thread_num = A.rows_per_block();
182  unsigned int group_num = 256;
183 
184  k.local_work_size(0, thread_num);
185  k.global_work_size(0, thread_num * group_num);
186 
187  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
188  A.handle2().opencl_handle(),
189  A.handle3().opencl_handle(),
190  A.handle().opencl_handle(),
191  viennacl::traits::opencl_handle(p),
192  viennacl::traits::opencl_handle(Ap),
193  vec_size,
194  inner_prod_buffer,
195  buffer_size_per_vector,
196  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
197  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
198  )
199  );
200 }
201 
202 
// Pipelined CG: Ap = A*p for a hyb_matrix (hybrid ELL + CSR: handle2()/handle()
// hold the ELL part, handle3()/handle4()/handle5() the CSR remainder), fused
// with partial inner products.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
203 template<typename NumericT>
205  vector_base<NumericT> const & p,
207  vector_base<NumericT> & inner_prod_buffer)
208 {
209  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
211 
212  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
213  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
214 
216 
217  unsigned int thread_num = 256;
218  unsigned int group_num = 128;
219 
220  k.local_work_size(0, thread_num);
221  k.global_work_size(0, thread_num * group_num);
222 
223  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
224  A.handle().opencl_handle(),
225  A.handle3().opencl_handle(),
226  A.handle4().opencl_handle(),
227  A.handle5().opencl_handle(),
228  cl_uint(A.internal_size1()),
229  cl_uint(A.ell_nnz()),
230  cl_uint(A.internal_ellnnz()),
231  viennacl::traits::opencl_handle(p),
232  viennacl::traits::opencl_handle(Ap),
233  vec_size,
234  inner_prod_buffer,
235  buffer_size_per_vector,
236  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
237  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
238  )
239  );
240 }
241 
242 
244 
// Pipelined BiCGStab: updates s (presumably s = r - alpha*Ap — TODO confirm
// against the kernel source) and writes reduction partials into the chunk of
// inner_prod_buffer selected by chunk_size/chunk_offset.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
245 template<typename NumericT>
248  vector_base<NumericT> const & Ap,
249  vector_base<NumericT> & inner_prod_buffer,
250  vcl_size_t buffer_chunk_size,
251  vcl_size_t buffer_chunk_offset)
252 {
253  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
255 
257  cl_uint vec_size = cl_uint(viennacl::traits::size(s));
258 
259  k.local_work_size(0, 128);
260  k.global_work_size(0, 128*128);
261 
262  cl_uint chunk_size = cl_uint(buffer_chunk_size);
263  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
264  viennacl::ocl::enqueue(k(s, r, Ap,
265  inner_prod_buffer, chunk_size, chunk_offset, vec_size,
266  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
267  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
268 }
269 
// Fused vector update for the pipelined BiCGStab solver: one kernel launch
// updates result, p and residual from s, As, Ap and r0star while accumulating
// reduction partials into inner_prod_buffer.
// NOTE(review): extractor dropped the kernel lookup defining `k`; restore it
// from the original header before compiling.
270 template<typename NumericT>
271 void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
272  vector_base<NumericT> & residual, vector_base<NumericT> const & As,
273  NumericT beta, vector_base<NumericT> const & Ap,
274  vector_base<NumericT> const & r0star,
275  vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
276 {
  // Chunk size unused here; kept in the signature for interface symmetry with
  // the other backends.
277  (void)buffer_chunk_size;
278 
279  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
281 
283  cl_uint vec_size = cl_uint(viennacl::traits::size(result));
284 
285  k.local_work_size(0, 128);
286  k.global_work_size(0, 128*128);
287  viennacl::ocl::enqueue(k(result, alpha, p, omega, s,
288  residual, As,
289  beta, Ap,
290  r0star,
291  inner_prod_buffer,
292  vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
293  )
294  );
295 }
296 
// Pipelined BiCGStab: Ap = A*p for a compressed_matrix, fused with reductions
// against r0star; partials land in the inner_prod_buffer chunk selected by
// chunk_size/chunk_offset.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
297 template<typename NumericT>
299  vector_base<NumericT> const & p,
301  vector_base<NumericT> const & r0star,
302  vector_base<NumericT> & inner_prod_buffer,
303  vcl_size_t buffer_chunk_size,
304  vcl_size_t buffer_chunk_offset)
305 {
306  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
308 
310 
311  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
312  cl_uint chunk_size = cl_uint(buffer_chunk_size);
313  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
314 
315  k.local_work_size(0, 128);
316  k.global_work_size(0, 128*128);
317  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
318  p,
319  Ap,
320  r0star,
321  vec_size,
322  inner_prod_buffer, chunk_size, chunk_offset,
  // Three scratch buffers: one more reduction than the CG variant (extra dot against r0star).
323  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
324  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
325  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
326  ));
327 
328 }
329 
330 
// Pipelined BiCGStab: Ap = A*p for a coordinate_matrix (COO), fused with
// reductions against r0star.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
331 template<typename NumericT>
333  vector_base<NumericT> const & p,
335  vector_base<NumericT> const & r0star,
336  vector_base<NumericT> & inner_prod_buffer,
337  vcl_size_t buffer_chunk_size,
338  vcl_size_t buffer_chunk_offset)
339 {
340  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
342 
343  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
344  cl_uint chunk_size = cl_uint(buffer_chunk_size);
345  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
346 
  // COO kernel scatters contributions, so Ap must start out zeroed.
347  Ap.clear();
348 
350  unsigned int thread_num = 256; //k.local_work_size(0);
351 
352  k.local_work_size(0, thread_num);
353 
354  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
355 
356  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
357  p,
358  Ap,
359  r0star,
360  vec_size,
361  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
362  viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
363  inner_prod_buffer, chunk_size, chunk_offset,
364  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
365  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
366  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
367  ));
368 }
369 
// Pipelined BiCGStab: Ap = A*p for an ell_matrix, fused with reductions
// against r0star.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
370 template<typename NumericT>
372  vector_base<NumericT> const & p,
374  vector_base<NumericT> const & r0star,
375  vector_base<NumericT> & inner_prod_buffer,
376  vcl_size_t buffer_chunk_size,
377  vcl_size_t buffer_chunk_offset)
378 {
379  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
381 
382  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
383  cl_uint chunk_size = cl_uint(buffer_chunk_size);
384  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
385 
387 
388  unsigned int thread_num = 128;
389  unsigned int group_num = 128;
390 
391  k.local_work_size(0, thread_num);
392  k.global_work_size(0, thread_num * group_num);
393 
394  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
395  A.handle().opencl_handle(),
396  cl_uint(A.internal_size1()),
397  cl_uint(A.maxnnz()),
398  cl_uint(A.internal_maxnnz()),
399  viennacl::traits::opencl_handle(p),
400  viennacl::traits::opencl_handle(Ap),
401  r0star,
402  vec_size,
403  inner_prod_buffer, chunk_size, chunk_offset,
404  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
405  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
406  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
407  )
408  );
409 }
410 
// Pipelined BiCGStab: Ap = A*p for a sliced_ell_matrix; local work size must
// equal rows_per_block(). Fused with reductions against r0star.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
411 template<typename NumericT>
413  vector_base<NumericT> const & p,
415  vector_base<NumericT> const & r0star,
416  vector_base<NumericT> & inner_prod_buffer,
417  vcl_size_t buffer_chunk_size,
418  vcl_size_t buffer_chunk_offset)
419 {
420  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
422 
423  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
424  cl_uint chunk_size = cl_uint(buffer_chunk_size);
425  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
426 
428 
429  unsigned int thread_num = A.rows_per_block();
430  unsigned int group_num = 128;
431 
432  k.local_work_size(0, thread_num);
433  k.global_work_size(0, thread_num * group_num);
434 
435  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
436  A.handle2().opencl_handle(),
437  A.handle3().opencl_handle(),
438  A.handle().opencl_handle(),
439  viennacl::traits::opencl_handle(p),
440  viennacl::traits::opencl_handle(Ap),
441  r0star,
442  vec_size,
443  inner_prod_buffer, chunk_size, chunk_offset,
444  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
445  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
446  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
447  )
448  );
449 }
450 
451 
// Pipelined BiCGStab: Ap = A*p for a hyb_matrix (ELL + CSR remainder), fused
// with reductions against r0star.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
452 template<typename NumericT>
454  vector_base<NumericT> const & p,
456  vector_base<NumericT> const & r0star,
457  vector_base<NumericT> & inner_prod_buffer,
458  vcl_size_t buffer_chunk_size,
459  vcl_size_t buffer_chunk_offset)
460 {
461  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
463 
464  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
465  cl_uint chunk_size = cl_uint(buffer_chunk_size);
466  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
467 
469 
470  unsigned int thread_num = 256;
471  unsigned int group_num = 128;
472 
473  k.local_work_size(0, thread_num);
474  k.global_work_size(0, thread_num * group_num);
475 
476  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
477  A.handle().opencl_handle(),
478  A.handle3().opencl_handle(),
479  A.handle4().opencl_handle(),
480  A.handle5().opencl_handle(),
481  cl_uint(A.internal_size1()),
482  cl_uint(A.ell_nnz()),
483  cl_uint(A.internal_ellnnz()),
484  viennacl::traits::opencl_handle(p),
485  viennacl::traits::opencl_handle(Ap),
486  r0star,
487  vec_size,
488  inner_prod_buffer, chunk_size, chunk_offset,
489  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
490  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
491  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
492  )
493  );
494 }
495 
497 
// Pipelined GMRES: vector normalization of Krylov vector v_k (per the Doxygen
// brief: "Performs a vector normalization needed for an efficient pipelined
// GMRES algorithm"); also writes into R_buffer at offset_in_R and accumulates
// r-dot-v_k partials into r_dot_vk_buffer.
// NOTE(review): extractor dropped the signature's first line, the kernel
// lookup defining `k`, and the local_mem argument before the closing ")";
// restore from the original header.
505 template <typename T>
507  vector_base<T> const & residual,
508  vector_base<T> & R_buffer,
509  vcl_size_t offset_in_R,
510  vector_base<T> const & inner_prod_buffer,
511  vector_base<T> & r_dot_vk_buffer,
512  vcl_size_t buffer_chunk_size,
513  vcl_size_t buffer_chunk_offset)
514 {
515  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(v_k).context());
517 
519 
520  k.local_work_size(0, 128);
521  k.global_work_size(0, 128*128);
522 
  // v_k may be a ranged view of the Krylov basis, hence the explicit start offset.
523  cl_uint size_vk = cl_uint(v_k.size());
524  cl_uint vk_offset = cl_uint(viennacl::traits::start(v_k));
525  cl_uint R_offset = cl_uint(offset_in_R);
526  cl_uint chunk_size = cl_uint(buffer_chunk_size);
527  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
528  viennacl::ocl::enqueue(k(v_k, vk_offset,
529  residual,
530  R_buffer, R_offset,
531  inner_prod_buffer, chunk_size,
532  r_dot_vk_buffer, chunk_offset,
533  size_vk,
535  ));
536 }
537 
// Pipelined GMRES, Gram-Schmidt stage 1: computes the inner products
// <v_i, v_k> for i < param_k against the device-resident Krylov basis and
// stores the partial results in vi_in_vk_buffer (chunked by buffer_chunk_size).
// NOTE(review): extractor dropped the kernel lookup defining `k`; restore it
// from the original header before compiling.
538 template <typename T>
539 void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
540  vcl_size_t v_k_size,
541  vcl_size_t v_k_internal_size,
542  vcl_size_t param_k,
543  vector_base<T> & vi_in_vk_buffer,
544  vcl_size_t buffer_chunk_size)
545 {
546  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
548 
550 
551  k.local_work_size(0, 128);
552  k.global_work_size(0, 128*128);
553 
554  cl_uint size_vk = cl_uint(v_k_size);
555  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
556  cl_uint ocl_k = cl_uint(param_k);
557  cl_uint chunk_size = cl_uint(buffer_chunk_size);
558  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
559  vi_in_vk_buffer, chunk_size,
  // 7 * local size: the kernel reduces up to 7 inner products per pass.
  // (inferred from the scratch size — confirm against the kernel source)
560  viennacl::ocl::local_mem(7 * k.local_work_size() * sizeof(T))
561  ));
562 }
563 
// Pipelined GMRES, Gram-Schmidt stage 2: uses the inner products from stage 1
// (vi_in_vk_buffer) to orthogonalize v_k against the basis, writes the
// corresponding column of R into R_buffer, and accumulates norm partials into
// inner_prod_buffer.
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
564 template <typename T>
566  vcl_size_t v_k_size,
567  vcl_size_t v_k_internal_size,
568  vcl_size_t param_k,
569  vector_base<T> const & vi_in_vk_buffer,
570  vector_base<T> & R_buffer,
571  vcl_size_t krylov_dim,
572  vector_base<T> & inner_prod_buffer,
573  vcl_size_t buffer_chunk_size)
574 {
575  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
577 
579 
580  cl_uint size_vk = cl_uint(v_k_size);
581  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
582  cl_uint ocl_k = cl_uint(param_k);
583  cl_uint chunk_size = cl_uint(buffer_chunk_size);
584  cl_uint ocl_krylov_dim = cl_uint(krylov_dim);
585  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
586  vi_in_vk_buffer, chunk_size,
587  R_buffer, ocl_krylov_dim,
588  inner_prod_buffer,
589  viennacl::ocl::local_mem(7 * k.local_work_size() * sizeof(T))
590  ));
591 }
592 
// Pipelined GMRES: forms the solution update from the Krylov basis and the
// least-squares coefficients (result presumably accumulates
// sum_i coefficients[i] * krylov_basis_i plus the residual contribution —
// confirm against the kernel source).
// NOTE(review): extractor dropped the signature's first line and the kernel
// lookup defining `k`; restore from the original header.
593 template <typename T>
595  vector_base<T> const & residual,
596  vector_base<T> const & krylov_basis,
597  vcl_size_t v_k_size,
598  vcl_size_t v_k_internal_size,
599  vector_base<T> const & coefficients,
600  vcl_size_t param_k)
601 {
602  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
604 
606 
607  cl_uint size_vk = cl_uint(v_k_size);
608  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
609  cl_uint ocl_k = cl_uint(param_k);
610  viennacl::ocl::enqueue(k(result,
611  residual,
612  krylov_basis, size_vk, internal_size_vk,
613  coefficients, ocl_k
614  ));
615 }
616 
617 
// Pipelined GMRES: Ap = A*p for a compressed_matrix. Unlike the CG/BiCGStab
// variants, p and Ap may be ranged views, so their start offsets are passed
// explicitly to the kernel.
// NOTE(review): extractor dropped the signature's first line, the kernel
// lookup defining `k`, and the final local_mem argument; restore from the
// original header.
618 template <typename T>
620  vector_base<T> const & p,
621  vector_base<T> & Ap,
622  vector_base<T> & inner_prod_buffer)
623 {
624  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
626 
628 
629  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
630  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
631  cl_uint start_p = cl_uint(viennacl::traits::start(p));
632  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
633 
634  k.local_work_size(0, 128);
635  k.global_work_size(0, 128*128);
636  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
637  p, start_p,
638  Ap, start_Ap,
639  vec_size,
640  inner_prod_buffer,
641  buffer_size_per_vector,
642  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
644  ));
645 
646 }
647 
// Pipelined GMRES: Ap = A*p for a coordinate_matrix (COO) with explicit start
// offsets for the (possibly ranged) p and Ap vectors.
// NOTE(review): extractor dropped the signature's first line, the kernel
// lookup defining `k`, and the final local_mem argument; restore from the
// original header.
648 template <typename T>
650  vector_base<T> const & p,
651  vector_base<T> & Ap,
652  vector_base<T> & inner_prod_buffer)
653 {
654  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
656 
657  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
658  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
659  cl_uint start_p = cl_uint(viennacl::traits::start(p));
660  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
661 
  // COO kernel scatters contributions; both output and reduction buffers are zeroed first.
662  Ap.clear();
663  inner_prod_buffer.clear();
664 
666  unsigned int thread_num = 128; //k.local_work_size(0);
667 
668  k.local_work_size(0, thread_num);
669 
670  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
671 
672  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
673  p, start_p,
674  Ap, start_Ap,
675  vec_size,
676  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
677  viennacl::ocl::local_mem(sizeof(T)*thread_num),
678  inner_prod_buffer,
679  buffer_size_per_vector,
680  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
682  ));
683 }
684 
// Pipelined GMRES: Ap = A*p for an ell_matrix with explicit start offsets for
// the (possibly ranged) p and Ap vectors.
// NOTE(review): extractor dropped the signature's first line, the kernel
// lookup defining `k`, and the final local_mem argument; restore from the
// original header.
685 template <typename T>
687  vector_base<T> const & p,
688  vector_base<T> & Ap,
689  vector_base<T> & inner_prod_buffer)
690 {
691  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
693 
694  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
695  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
696  cl_uint start_p = cl_uint(viennacl::traits::start(p));
697  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
698 
700 
701  unsigned int thread_num = 128;
702  unsigned int group_num = 128;
703 
704  k.local_work_size(0, thread_num);
705  k.global_work_size(0, thread_num * group_num);
706 
707  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
708  A.handle().opencl_handle(),
709  cl_uint(A.internal_size1()),
710  cl_uint(A.maxnnz()),
711  cl_uint(A.internal_maxnnz()),
712  viennacl::traits::opencl_handle(p), start_p,
713  viennacl::traits::opencl_handle(Ap), start_Ap,
714  vec_size,
715  inner_prod_buffer,
716  buffer_size_per_vector,
717  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
719  )
720  );
721 }
722 
// Pipelined GMRES: Ap = A*p for a sliced_ell_matrix (local work size equals
// rows_per_block()) with explicit start offsets for p and Ap.
// NOTE(review): extractor dropped the signature's first line, the kernel
// lookup defining `k`, and the final local_mem argument; restore from the
// original header.
723 template <typename T>
725  vector_base<T> const & p,
726  vector_base<T> & Ap,
727  vector_base<T> & inner_prod_buffer)
728 {
729  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
731 
732  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
733  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
734  cl_uint start_p = cl_uint(viennacl::traits::start(p));
735  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
736 
738 
739  unsigned int thread_num = A.rows_per_block();
740  unsigned int group_num = 128;
741 
742  k.local_work_size(0, thread_num);
743  k.global_work_size(0, thread_num * group_num);
744 
745  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
746  A.handle2().opencl_handle(),
747  A.handle3().opencl_handle(),
748  A.handle().opencl_handle(),
749  viennacl::traits::opencl_handle(p), start_p,
750  viennacl::traits::opencl_handle(Ap), start_Ap,
751  vec_size,
752  inner_prod_buffer,
753  buffer_size_per_vector,
754  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
756  )
757  );
758 }
759 
760 
// Pipelined GMRES: Ap = A*p for a hyb_matrix (ELL + CSR remainder) with
// explicit start offsets for p and Ap.
// NOTE(review): extractor dropped the signature's first line, the kernel
// lookup defining `k`, and the final local_mem argument; restore from the
// original header.
761 template <typename T>
763  vector_base<T> const & p,
764  vector_base<T> & Ap,
765  vector_base<T> & inner_prod_buffer)
766 {
767  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
769 
770  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
771  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
772  cl_uint start_p = cl_uint(viennacl::traits::start(p));
773  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
774 
776 
777  unsigned int thread_num = 128;
778  unsigned int group_num = 128;
779 
780  k.local_work_size(0, thread_num);
781  k.global_work_size(0, thread_num * group_num);
782 
783  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
784  A.handle().opencl_handle(),
785  A.handle3().opencl_handle(),
786  A.handle4().opencl_handle(),
787  A.handle5().opencl_handle(),
788  cl_uint(A.internal_size1()),
789  cl_uint(A.ell_nnz()),
790  cl_uint(A.internal_ellnnz()),
791  viennacl::traits::opencl_handle(p), start_p,
792  viennacl::traits::opencl_handle(Ap), start_Ap,
793  vec_size,
794  inner_prod_buffer,
795  buffer_size_per_vector,
796  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
798  )
799  );
800 }
801 
802 
803 } //namespace opencl
804 } //namespace linalg
805 } //namespace viennacl
806 
807 
808 #endif
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:405
Main kernel class for generating specialized OpenCL kernels for fast iterative solvers.
Definition: iterative.hpp:1345
handle_type & handle2()
Definition: ell_matrix.hpp:103
Represents an OpenCL device within ViennaCL.
const handle_type & handle4() const
Definition: hyb_matrix.hpp:108
void pipelined_bicgstab_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Generic size and resize functionality for different vector and matrix types.
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
Various little tools used here and there in ViennaCL.
static void init(viennacl::ocl::context &ctx)
Definition: iterative.hpp:1352
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:54
vcl_size_t internal_ellnnz() const
Definition: hyb_matrix.hpp:101
void pipelined_gmres_gram_schmidt_stage2(vector_base< T > &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t param_k, vector_base< T > const &vi_in_vk_buffer, vector_base< T > &R_buffer, vcl_size_t krylov_dim, vector_base< T > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
This file provides the forward declarations for the main types used within ViennaCL.
Determines row and column increments for matrices and matrix proxies.
vcl_size_t rows_per_block() const
void pipelined_gmres_normalize_vk(vector_base< T > &v_k, vector_base< T > const &residual, vector_base< T > &R_buffer, vcl_size_t offset_in_R, vector_base< T > const &inner_prod_buffer, vector_base< T > &r_dot_vk_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
vcl_size_t internal_size1() const
Definition: ell_matrix.hpp:88
Common implementations shared by OpenCL-based operations.
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
vcl_size_t ell_nnz() const
Definition: hyb_matrix.hpp:102
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
A class representing local (shared) OpenCL memory. Typically used as kernel argument.
Definition: local_mem.hpp:33
OpenCL kernel file for specialized iterative solver kernels.
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
viennacl::ocl::kernel & get_kernel(std::string const &program_name, std::string const &kernel_name)
Convenience function for retrieving the kernel of a program directly from the context.
Definition: context.hpp:607
const handle_type & handle2() const
Definition: hyb_matrix.hpp:106
vcl_size_t internal_size1() const
Definition: hyb_matrix.hpp:95
Sparse matrix class using the sliced ELLPACK format with parameters C (slice height) and sigma (sorting window).
Definition: forwards.h:402
Implementation of a smart-pointer-like class for handling OpenCL handles.
void pipelined_cg_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, NumericT beta, vector_base< NumericT > &inner_prod_buffer)
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
void pipelined_bicgstab_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, NumericT omega, vector_base< NumericT > const &s, vector_base< NumericT > &residual, vector_base< NumericT > const &As, NumericT beta, vector_base< NumericT > const &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
std::size_t vcl_size_t
Definition: forwards.h:74
vcl_size_t maxnnz() const
Definition: ell_matrix.hpp:95
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
void pipelined_gmres_gram_schmidt_stage1(vector_base< T > const &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t param_k, vector_base< T > &vi_in_vk_buffer, vcl_size_t buffer_chunk_size)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
handle_type & handle()
Definition: ell_matrix.hpp:100
void pipelined_bicgstab_update_s(vector_base< NumericT > &s, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
const handle_type & handle3() const
Returns the OpenCL handle to the row block array.
void clear()
Resets all entries to zero. Does not change the size of the vector.
Definition: vector.hpp:861
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50
Representation of an OpenCL kernel in ViennaCL.
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
const handle_type & handle() const
Definition: hyb_matrix.hpp:105
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751
void pipelined_cg_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > &inner_prod_buffer)
Forward declarations of the implicit_vector_base, vector_base class.
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
void pipelined_gmres_update_result(vector_base< T > &result, vector_base< T > const &residual, vector_base< T > const &krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vector_base< T > const &coefficients, vcl_size_t param_k)
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
vcl_size_t internal_maxnnz() const
Definition: ell_matrix.hpp:94
Implementation of the ViennaCL scalar class.
void pipelined_gmres_prod(compressed_matrix< T > const &A, vector_base< T > const &p, vector_base< T > &Ap, vector_base< T > &inner_prod_buffer)
const handle_type & handle3() const
Definition: hyb_matrix.hpp:107
Simple enable-if variant that uses the SFINAE pattern.
const handle_type & handle5() const
Definition: hyb_matrix.hpp:109
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...