ViennaCL - The Vienna Computing Library  1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
iterative_operations.hpp
1 #ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/scalar.hpp"
27 #include "viennacl/vector.hpp"
28 #include "viennacl/tools/tools.hpp"
29 #include "viennacl/linalg/cuda/common.hpp"
30 
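// Note: the routines in this file are internal CUDA backend kernels and wrappers. They are
// normally reached through the high-level iterative solvers rather than called directly,
// e.g. (sketch, assuming a CUDA-enabled build with a populated viennacl::compressed_matrix<float> A
// and right-hand side viennacl::vector<float> b):
//
//   #include "viennacl/linalg/cg.hpp"
//   viennacl::vector<float> x = viennacl::linalg::solve(A, b, viennacl::linalg::cg_tag());
//
// The solver layer may then dispatch to the pipelined_* routines below when the CUDA backend is active.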
31 namespace viennacl
32 {
33 namespace linalg
34 {
35 namespace cuda
36 {
37 
38 //
39 // CG vector update:
40 //
41 
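// The kernel below fuses the three CG vector updates
//   x <- x + alpha * p,   r <- r - alpha * Ap,   p <- r + beta * p
// into a single sweep over the vectors and simultaneously accumulates a block-wise
// partial sum of <r, r>, which each block writes to inner_prod_buffer[blockIdx.x].
// The final reduction over these partial sums is left to the caller.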
42 // cpu scalar
43 template<typename NumericT>
44 __global__ void pipelined_cg_vector_kernel(NumericT * result,
45  NumericT alpha,
46  NumericT * p,
47  NumericT * r,
48  NumericT const * Ap,
49  NumericT beta,
50  NumericT * inner_prod_buffer,
51  unsigned int size)
52 {
53  NumericT inner_prod_contrib = 0;
54  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
55  {
56  NumericT value_p = p[i];
57  NumericT value_r = r[i];
58 
59  result[i] += alpha * value_p;
60  value_r -= alpha * Ap[i];
61  value_p = value_r + beta * value_p;
62 
63  p[i] = value_p;
64  r[i] = value_r;
65  inner_prod_contrib += value_r * value_r;
66  }
67 
68  // parallel reduction in work group
69  __shared__ NumericT shared_array[256];
70  shared_array[threadIdx.x] = inner_prod_contrib;
71  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
72  {
73  __syncthreads();
74  if (threadIdx.x < stride)
75  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
76  }
77 
78  // write results to result array
79  if (threadIdx.x == 0)
80  inner_prod_buffer[blockIdx.x] = shared_array[0];
81 }
82 
83 
84 template<typename NumericT>
85 void pipelined_cg_vector_update(vector_base<NumericT> & result,
86  NumericT alpha,
87  vector_base<NumericT> & p,
88  vector_base<NumericT> & r,
89  vector_base<NumericT> const & Ap,
90  NumericT beta,
91  vector_base<NumericT> & inner_prod_buffer)
92 {
93  unsigned int size = result.size();
94  pipelined_cg_vector_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(result),
95  alpha,
96  detail::cuda_arg<NumericT>(p),
97  detail::cuda_arg<NumericT>(r),
98  detail::cuda_arg<NumericT>(Ap),
99  beta,
100  detail::cuda_arg<NumericT>(inner_prod_buffer),
101  size);
102  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
103 }
104 
105 
106 
107 
108 //
109 // Compressed matrix
110 //
111 
112 
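// Each pipelined_cg_*_vec_mul kernel below computes Ap = A * p for one sparse matrix
// format and accumulates block-wise partial sums of <Ap, Ap> and <p, Ap>.
// inner_prod_buffer is laid out as three chunks of length buffer_size
// (buffer_size = inner_prod_buffer.size() / 3 in the host wrappers):
//   chunk 0: partial sums of <r, r>   (filled by pipelined_cg_vector_kernel above)
//   chunk 1: partial sums of <Ap, Ap> (written at buffer_size  + blockIdx.x)
//   chunk 2: partial sums of <p, Ap>  (written at 2*buffer_size + blockIdx.x)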
113 template<typename NumericT>
114 __global__ void pipelined_cg_csr_vec_mul_kernel(
115  const unsigned int * row_indices,
116  const unsigned int * column_indices,
117  const NumericT * elements,
118  const NumericT * p,
119  NumericT * Ap,
120  unsigned int size,
121  NumericT * inner_prod_buffer,
122  unsigned int buffer_size)
123 {
124  NumericT inner_prod_ApAp = 0;
125  NumericT inner_prod_pAp = 0;
126 
127  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
128  row < size;
129  row += gridDim.x * blockDim.x)
130  {
131  NumericT dot_prod = NumericT(0);
132  unsigned int row_end = row_indices[row+1];
133  for (unsigned int i = row_indices[row]; i < row_end; ++i)
134  dot_prod += elements[i] * p[column_indices[i]];
135  Ap[row] = dot_prod;
136  inner_prod_ApAp += dot_prod * dot_prod;
137  inner_prod_pAp += p[row] * dot_prod;
138  }
139 
141  __shared__ NumericT shared_array_ApAp[256];
142  __shared__ NumericT shared_array_pAp[256];
143  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
144  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
145  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
146  {
147  __syncthreads();
148  if (threadIdx.x < stride)
149  {
150  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
151  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
152  }
153  }
154 
155  // write results to result array
156  if (threadIdx.x == 0) {
157  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
158  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
159  }
160 }
161 
162 
163 
164 
165 template<typename NumericT>
166 void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
167  vector_base<NumericT> const & p,
168  vector_base<NumericT> & Ap,
169  vector_base<NumericT> & inner_prod_buffer)
170 {
171  unsigned int size = p.size();
172  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
173 
174  pipelined_cg_csr_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
175  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
176  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
177  detail::cuda_arg<NumericT>(p),
178  detail::cuda_arg<NumericT>(Ap),
179  size,
180  detail::cuda_arg<NumericT>(inner_prod_buffer),
181  buffer_size_per_vector);
182  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
183 }
184 
185 
186 //
187 // Coordinate Matrix
188 //
189 
190 
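// The COO kernel processes the nonzeros in contiguous chunks delimited by group_boundaries
// (one chunk per CUDA block) and uses a segmented parallel reduction keyed on the row index:
// partial products belonging to the same row are summed in shared memory, and a row's result
// is written to Ap as soon as the neighboring thread holds a different row index. The carry
// check at the start of each outer iteration handles rows spanning two iterations, and the
// host wrapper clears Ap beforehand since rows without nonzeros are never touched by the kernel.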
191 template<typename NumericT>
192 __global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
193  const NumericT * elements,
194  const unsigned int * group_boundaries,
195  const NumericT * p,
196  NumericT * Ap,
197  unsigned int size,
198  NumericT * inner_prod_buffer,
199  unsigned int buffer_size)
200 {
201  NumericT inner_prod_ApAp = 0;
202  NumericT inner_prod_pAp = 0;
203  __shared__ unsigned int shared_rows[128];
204  __shared__ NumericT inter_results[128];
205 
206  uint2 tmp;
207  NumericT val;
208  unsigned int group_start = group_boundaries[blockIdx.x];
209  unsigned int group_end = group_boundaries[blockIdx.x + 1];
210  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
211 
212  unsigned int local_index = 0;
213 
214  for (unsigned int k = 0; k < k_end; ++k)
215  {
216  local_index = group_start + k * blockDim.x + threadIdx.x;
217 
218  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
219  val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
220 
221  //check for carry from previous loop run:
222  if (threadIdx.x == 0 && k > 0)
223  {
224  if (tmp.x == shared_rows[blockDim.x-1])
225  val += inter_results[blockDim.x-1];
226  else
227  {
228  NumericT Ap_entry = inter_results[blockDim.x-1];
229  Ap[shared_rows[blockDim.x-1]] = Ap_entry;
230  inner_prod_ApAp += Ap_entry * Ap_entry;
231  inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]];
232  }
233  }
234 
235  //segmented parallel reduction begin
236  __syncthreads();
237  shared_rows[threadIdx.x] = tmp.x;
238  inter_results[threadIdx.x] = val;
239  NumericT left = 0;
240  __syncthreads();
241 
242  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
243  {
244  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
245  __syncthreads();
246  inter_results[threadIdx.x] += left;
247  __syncthreads();
248  }
249  //segmented parallel reduction end
250 
251  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
252  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
253  {
254  NumericT Ap_entry = inter_results[threadIdx.x];
255  Ap[tmp.x] = Ap_entry;
256  inner_prod_ApAp += Ap_entry * Ap_entry;
257  inner_prod_pAp += Ap_entry * p[tmp.x];
258  }
259 
260  __syncthreads();
261  } //for k
262 
263  if (local_index + 1 == group_end)
264  {
265  NumericT Ap_entry = inter_results[threadIdx.x];
266  Ap[tmp.x] = Ap_entry;
267  inner_prod_ApAp += Ap_entry * Ap_entry;
268  inner_prod_pAp += Ap_entry * p[tmp.x];
269  }
270 
272  __shared__ NumericT shared_array_ApAp[256];
273  __shared__ NumericT shared_array_pAp[256];
274  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
275  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
276  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
277  {
278  __syncthreads();
279  if (threadIdx.x < stride)
280  {
281  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
282  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
283  }
284  }
285 
286  // write results to result array
287  if (threadIdx.x == 0) {
288  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
289  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
290  }
291 
292 }
293 
294 
295 template<typename NumericT>
296 void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
297  vector_base<NumericT> const & p,
298  vector_base<NumericT> & Ap,
299  vector_base<NumericT> & inner_prod_buffer)
300 {
301  unsigned int size = p.size();
302  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
303 
304  Ap.clear();
305 
306  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
307  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
308  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
309  detail::cuda_arg<NumericT>(p),
310  detail::cuda_arg<NumericT>(Ap),
311  size,
312  detail::cuda_arg<NumericT>(inner_prod_buffer),
313  buffer_size_per_vector);
314  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
315 }
316 
317 
318 
319 //
320 // ELL Matrix
321 //
322 
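// ELL format: entries are stored column-major with items_per_row (= A.maxnnz()) slots per row,
// padded with zeros, and consecutive slots of a row are separated by a stride of internal_row_num.
// The 'val ?' check skips the zero padding entries so that no load from p is issued for them.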
323 template<typename NumericT>
324 __global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
325  const NumericT * elements,
326  unsigned int internal_row_num,
327  unsigned int items_per_row,
328  const NumericT * p,
329  NumericT * Ap,
330  unsigned int size,
331  NumericT * inner_prod_buffer,
332  unsigned int buffer_size)
333 {
334  NumericT inner_prod_ApAp = 0;
335  NumericT inner_prod_pAp = 0;
336  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
337  unsigned int glb_sz = gridDim.x * blockDim.x;
338 
339  for (unsigned int row = glb_id; row < size; row += glb_sz)
340  {
341  NumericT sum = 0;
342 
343  unsigned int offset = row;
344  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
345  {
346  NumericT val = elements[offset];
347  sum += val ? p[coords[offset]] * val : NumericT(0);
348  }
349 
350  Ap[row] = sum;
351  inner_prod_ApAp += sum * sum;
352  inner_prod_pAp += sum * p[row];
353  }
354 
356  __shared__ NumericT shared_array_ApAp[256];
357  __shared__ NumericT shared_array_pAp[256];
358  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
359  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
360  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
361  {
362  __syncthreads();
363  if (threadIdx.x < stride)
364  {
365  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
366  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
367  }
368  }
369 
370  // write results to result array
371  if (threadIdx.x == 0) {
372  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
373  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
374  }
375 }
376 
377 
378 template<typename NumericT>
379 void pipelined_cg_prod(ell_matrix<NumericT> const & A,
380  vector_base<NumericT> const & p,
381  vector_base<NumericT> & Ap,
382  vector_base<NumericT> & inner_prod_buffer)
383 {
384  unsigned int size = p.size();
385  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
386 
387  pipelined_cg_ell_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
388  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
389  static_cast<unsigned int>(A.internal_size1()),
390  static_cast<unsigned int>(A.maxnnz()),
391  detail::cuda_arg<NumericT>(p),
392  detail::cuda_arg<NumericT>(Ap),
393  size,
394  detail::cuda_arg<NumericT>(inner_prod_buffer),
395  buffer_size_per_vector);
396  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
397 }
398 
399 
400 //
401 // SELL-C-\sigma Matrix
402 //
403 
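// Sliced ELL (SELL-C-sigma): rows are grouped into blocks of A.rows_per_block() rows; each row
// block has its own width columns_per_block[block_idx] and data offset block_start[block_idx].
// One CUDA block handles one row block, which is why the host wrapper launches the kernel with
// A.rows_per_block() threads per block.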
404 template<typename NumericT>
405 __global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
406  const unsigned int * column_indices,
407  const unsigned int * block_start,
408  const NumericT * elements,
409  const NumericT * p,
410  NumericT * Ap,
411  unsigned int size,
412  NumericT * inner_prod_buffer,
413  unsigned int buffer_size)
414 {
415  NumericT inner_prod_ApAp = 0;
416  NumericT inner_prod_pAp = 0;
417  unsigned int local_id = threadIdx.x;
418  unsigned int local_size = blockDim.x;
419 
420  for (unsigned int block_idx = blockIdx.x; block_idx <= size / local_size; block_idx += gridDim.x)
421  {
422  unsigned int row = block_idx * local_size + local_id;
423  unsigned int offset = block_start[block_idx];
424  unsigned int num_columns = columns_per_block[block_idx];
425 
426  NumericT sum = 0;
427  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
428  {
429  unsigned int index = offset + item_id * local_size + local_id;
430  NumericT val = elements[index];
431 
432  sum += val ? (p[column_indices[index]] * val) : 0;
433  }
434 
435  if (row < size)
436  {
437  Ap[row] = sum;
438  inner_prod_ApAp += sum * sum;
439  inner_prod_pAp += sum * p[row];
440  }
441  }
442 
444  __shared__ NumericT shared_array_ApAp[256];
445  __shared__ NumericT shared_array_pAp[256];
446  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
447  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
448  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
449  {
450  __syncthreads();
451  if (threadIdx.x < stride)
452  {
453  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
454  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
455  }
456  }
457 
458  // write results to result array
459  if (threadIdx.x == 0) {
460  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
461  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
462  }
463 }
464 
465 template<typename NumericT>
466 void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
467  vector_base<NumericT> const & p,
468  vector_base<NumericT> & Ap,
469  vector_base<NumericT> & inner_prod_buffer)
470 {
471  unsigned int size = p.size();
472  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
473 
474  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, A.rows_per_block()>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
475  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
476  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
477  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
478  detail::cuda_arg<NumericT>(p),
479  detail::cuda_arg<NumericT>(Ap),
480  size,
481  detail::cuda_arg<NumericT>(inner_prod_buffer),
482  buffer_size_per_vector);
483  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
484 }
485 
486 
487 //
488 // Hybrid Matrix
489 //
490 
491 
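// Hybrid format: the regular part of each row is stored ELL-style (ell_coords/ell_elements with
// stride internal_row_num and A.ell_nnz() slots per row), while overflow entries of longer rows
// are kept in an additional CSR part (csr_rows/csr_cols/csr_elements). Both contributions are
// accumulated into the same row of Ap.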
492 template<typename NumericT>
493 __global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords,
494  const NumericT * ell_elements,
495  const unsigned int * csr_rows,
496  const unsigned int * csr_cols,
497  const NumericT * csr_elements,
498  unsigned int internal_row_num,
499  unsigned int items_per_row,
500  const NumericT * p,
501  NumericT * Ap,
502  unsigned int size,
503  NumericT * inner_prod_buffer,
504  unsigned int buffer_size)
505 {
506  NumericT inner_prod_ApAp = 0;
507  NumericT inner_prod_pAp = 0;
508  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
509  unsigned int glb_sz = gridDim.x * blockDim.x;
510 
511  for (unsigned int row = glb_id; row < size; row += glb_sz)
512  {
513  NumericT sum = 0;
514 
515  unsigned int offset = row;
516  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
517  {
518  NumericT val = ell_elements[offset];
519 
520  sum += val ? p[ell_coords[offset]] * val : NumericT(0);
521  }
522 
523  unsigned int col_begin = csr_rows[row];
524  unsigned int col_end = csr_rows[row + 1];
525 
526  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
527  {
528  sum += p[csr_cols[item_id]] * csr_elements[item_id];
529  }
530 
531  Ap[row] = sum;
532  inner_prod_ApAp += sum * sum;
533  inner_prod_pAp += sum * p[row];
534  }
535 
537  __shared__ NumericT shared_array_ApAp[256];
538  __shared__ NumericT shared_array_pAp[256];
539  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
540  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
541  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
542  {
543  __syncthreads();
544  if (threadIdx.x < stride)
545  {
546  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
547  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
548  }
549  }
550 
551  // write results to result array
552  if (threadIdx.x == 0) {
553  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
554  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
555  }
556 }
557 
558 
559 
560 template<typename NumericT>
561 void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
562  vector_base<NumericT> const & p,
563  vector_base<NumericT> & Ap,
564  vector_base<NumericT> & inner_prod_buffer)
565 {
566  unsigned int size = p.size();
567  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
568 
569  pipelined_cg_hyb_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
570  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
571  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
572  detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
573  detail::cuda_arg<NumericT>(A.handle5().cuda_handle()),
574  static_cast<unsigned int>(A.internal_size1()),
575  static_cast<unsigned int>(A.ell_nnz()),
576  detail::cuda_arg<NumericT>(p),
577  detail::cuda_arg<NumericT>(Ap),
578  size,
579  detail::cuda_arg<NumericT>(inner_prod_buffer),
580  buffer_size_per_vector);
581  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
582 }
583 
584 
585 
587 
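// BiCGStab: the kernel below first finishes the reduction of the buffered partial sums of
// <r, r0*> (chunk 0 of inner_prod_buffer) and <Ap, r0*> (at offset 3 * chunk_size), computes
// alpha = <r, r0*> / <Ap, r0*> on the device, then updates s = r - alpha * Ap while accumulating
// block-wise partial sums of <s, s>, which are written at offset chunk_offset.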
588 template<typename NumericT>
589 __global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
590  NumericT const * residual,
591  NumericT const * Ap,
592  unsigned int size,
593  NumericT * inner_prod_buffer,
594  unsigned int chunk_size,
595  unsigned int chunk_offset)
596 {
597  NumericT alpha = 0;
598 
599  // parallel reduction in work group to compute <r, r0> / <Ap, r0>
600  __shared__ NumericT shared_array[256];
601  __shared__ NumericT shared_array_Ap_in_r0[256];
602 
603  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x];
604  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];
605  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
606  {
607  __syncthreads();
608  if (threadIdx.x < stride) {
609  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
610  shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];
611  }
612  }
613 
614  // compute alpha from reduced values:
615  __syncthreads();
616  alpha = shared_array[0] / shared_array_Ap_in_r0[0];
617 
618  // run vector update and compute first stage of <s, s>
619  NumericT inner_prod_contrib = 0;
620  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
621  {
622  NumericT value_s = s[i];
623 
624  value_s = residual[i] - alpha * Ap[i];
625  inner_prod_contrib += value_s * value_s;
626 
627  s[i] = value_s;
628  }
629  __syncthreads();
630 
631  // parallel reduction in work group
632  shared_array[threadIdx.x] = inner_prod_contrib;
633  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
634  {
635  __syncthreads();
636  if (threadIdx.x < stride)
637  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
638  }
639 
640  // write results to inner_prod_buffer
641  if (threadIdx.x == 0)
642  inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
643 }
644 
645 template<typename NumericT>
646 void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
647  vector_base<NumericT> & r,
648  vector_base<NumericT> const & Ap,
649  vector_base<NumericT> & inner_prod_buffer,
650  vcl_size_t buffer_chunk_size,
651  vcl_size_t buffer_chunk_offset)
652 {
653  unsigned int size = static_cast<unsigned int>(s.size());
654  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
655  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
656 
657  pipelined_bicgstab_update_s_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(s),
658  detail::cuda_arg<NumericT>(r),
659  detail::cuda_arg<NumericT>(Ap),
660  size,
661  detail::cuda_arg<NumericT>(inner_prod_buffer),
662  chunk_size,
663  chunk_offset);
664  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel");
665 }
666 
667 template<typename NumericT>
668 __global__ void pipelined_bicgstab_vector_kernel(NumericT * result,
669  NumericT alpha,
670  NumericT * p,
671  NumericT omega,
672  NumericT const * s,
673  NumericT * residual,
674  NumericT const * As,
675  NumericT beta,
676  NumericT const * Ap,
677  NumericT const * r0star,
678  NumericT * inner_prod_buffer,
679  unsigned int size)
680 {
681  NumericT inner_prod_r_r0star = 0;
682  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
683  {
684  NumericT value_result = result[i];
685  NumericT value_p = p[i];
686  NumericT value_s = s[i];
687  NumericT value_residual = residual[i];
688  NumericT value_As = As[i];
689  NumericT value_Ap = Ap[i];
690  NumericT value_r0star = r0star[i];
691 
692  value_result += alpha * value_p + omega * value_s;
693  value_residual = value_s - omega * value_As;
694  value_p = value_residual + beta * (value_p - omega * value_Ap);
695 
696  result[i] = value_result;
697  residual[i] = value_residual;
698  p[i] = value_p;
699  inner_prod_r_r0star += value_residual * value_r0star;
700  }
701 
702  // parallel reduction in work group
703  __shared__ NumericT shared_array[256];
704  shared_array[threadIdx.x] = inner_prod_r_r0star;
705  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
706  {
707  __syncthreads();
708  if (threadIdx.x < stride)
709  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
710  }
711 
712  // write results to result array
713  if (threadIdx.x == 0)
714  inner_prod_buffer[blockIdx.x] = shared_array[0];
715 }
716 
717 
718 template<typename NumericT>
719 void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
720  vector_base<NumericT> & residual, vector_base<NumericT> const & As,
721  NumericT beta, vector_base<NumericT> const & Ap,
722  vector_base<NumericT> const & r0star,
723  vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
724 {
725  (void)buffer_chunk_size;
726  unsigned int size = static_cast<unsigned int>(result.size());
727 
728  pipelined_bicgstab_vector_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(result),
729  alpha,
730  detail::cuda_arg<NumericT>(p),
731  omega,
732  detail::cuda_arg<NumericT>(s),
733  detail::cuda_arg<NumericT>(residual),
734  detail::cuda_arg<NumericT>(As),
735  beta,
736  detail::cuda_arg<NumericT>(Ap),
737  detail::cuda_arg<NumericT>(r0star),
738  detail::cuda_arg<NumericT>(inner_prod_buffer),
739  size);
740  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel");
741 }
742 
743 
744 
745 //
746 // Compressed matrix
747 //
748 
749 
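// For the BiCGStab matrix-vector kernels, inner_prod_buffer carries one chunk more than in the
// CG case: in addition to the partial sums of <Ap, Ap> (at buffer_size) and <p, Ap>
// (at 2*buffer_size), each block writes a partial sum of <r0*, Ap> at buffer_offset
// (passed in as buffer_chunk_offset by the caller).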
750 template<typename NumericT>
751 __global__ void pipelined_bicgstab_csr_vec_mul_kernel(
752  const unsigned int * row_indices,
753  const unsigned int * column_indices,
754  const NumericT * elements,
755  const NumericT * p,
756  NumericT * Ap,
757  const NumericT * r0star,
758  unsigned int size,
759  NumericT * inner_prod_buffer,
760  unsigned int buffer_size,
761  unsigned int buffer_offset)
762 {
763  NumericT inner_prod_ApAp = 0;
764  NumericT inner_prod_pAp = 0;
765  NumericT inner_prod_r0Ap = 0;
766 
767  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
768  row < size;
769  row += gridDim.x * blockDim.x)
770  {
771  NumericT dot_prod(0);
772  unsigned int row_end = row_indices[row+1];
773  for (unsigned int i = row_indices[row]; i < row_end; ++i)
774  dot_prod += elements[i] * p[column_indices[i]];
775  Ap[row] = dot_prod;
776  inner_prod_ApAp += dot_prod * dot_prod;
777  inner_prod_pAp += p[row] * dot_prod;
778  inner_prod_r0Ap += r0star[row] * dot_prod;
779  }
780 
782  __shared__ NumericT shared_array_ApAp[256];
783  __shared__ NumericT shared_array_pAp[256];
784  __shared__ NumericT shared_array_r0Ap[256];
785  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
786  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
787  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
788  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
789  {
790  __syncthreads();
791  if (threadIdx.x < stride)
792  {
793  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
794  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
795  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
796  }
797  }
798 
799  // write results to result array
800  if (threadIdx.x == 0) {
801  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
802  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
803  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
804  }
805 }
806 
807 
808 
809 
810 template<typename NumericT>
811 void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
812  vector_base<NumericT> const & p,
813  vector_base<NumericT> & Ap,
814  vector_base<NumericT> const & r0star,
815  vector_base<NumericT> & inner_prod_buffer,
816  vcl_size_t buffer_chunk_size,
817  vcl_size_t buffer_chunk_offset)
818 {
819  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
820  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
821  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
822 
823  pipelined_bicgstab_csr_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
824  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
825  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
826  detail::cuda_arg<NumericT>(p),
827  detail::cuda_arg<NumericT>(Ap),
828  detail::cuda_arg<NumericT>(r0star),
829  vec_size,
830  detail::cuda_arg<NumericT>(inner_prod_buffer),
831  chunk_size,
832  chunk_offset);
833  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_kernel");
834 }
835 
836 
837 //
838 // Coordinate Matrix
839 //
840 
841 
842 template<typename NumericT>
843 __global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
844  const NumericT * elements,
845  const unsigned int * group_boundaries,
846  const NumericT * p,
847  NumericT * Ap,
848  const NumericT * r0star,
849  unsigned int size,
850  NumericT * inner_prod_buffer,
851  unsigned int buffer_size,
852  unsigned int buffer_offset)
853 {
854  NumericT inner_prod_ApAp = 0;
855  NumericT inner_prod_pAp = 0;
856  NumericT inner_prod_r0Ap = 0;
857  __shared__ unsigned int shared_rows[128];
858  __shared__ NumericT inter_results[128];
859 
860  uint2 tmp;
861  NumericT val;
862  unsigned int group_start = group_boundaries[blockIdx.x];
863  unsigned int group_end = group_boundaries[blockIdx.x + 1];
864  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
865 
866  unsigned int local_index = 0;
867 
868  for (unsigned int k = 0; k < k_end; ++k)
869  {
870  local_index = group_start + k * blockDim.x + threadIdx.x;
871 
872  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
873  val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
874 
875  //check for carry from previous loop run:
876  if (threadIdx.x == 0 && k > 0)
877  {
878  if (tmp.x == shared_rows[blockDim.x-1])
879  val += inter_results[blockDim.x-1];
880  else
881  {
882  NumericT Ap_entry = inter_results[blockDim.x-1];
883  Ap[shared_rows[blockDim.x-1]] = Ap_entry;
884  inner_prod_ApAp += Ap_entry * Ap_entry;
885  inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]];
886  inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;
887  }
888  }
889 
890  //segmented parallel reduction begin
891  __syncthreads();
892  shared_rows[threadIdx.x] = tmp.x;
893  inter_results[threadIdx.x] = val;
894  NumericT left = 0;
895  __syncthreads();
896 
897  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
898  {
899  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
900  __syncthreads();
901  inter_results[threadIdx.x] += left;
902  __syncthreads();
903  }
904  //segmented parallel reduction end
905 
906  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
907  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
908  {
909  NumericT Ap_entry = inter_results[threadIdx.x];
910  Ap[tmp.x] = Ap_entry;
911  inner_prod_ApAp += Ap_entry * Ap_entry;
912  inner_prod_pAp += Ap_entry * p[tmp.x];
913  inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;
914  }
915 
916  __syncthreads();
917  } //for k
918 
919  if (local_index + 1 == group_end)
920  {
921  NumericT Ap_entry = inter_results[threadIdx.x];
922  Ap[tmp.x] = Ap_entry;
923  inner_prod_ApAp += Ap_entry * Ap_entry;
924  inner_prod_pAp += Ap_entry * p[tmp.x];
925  inner_prod_r0Ap += Ap_entry * r0star[tmp.x];
926  }
927 
929  __shared__ NumericT shared_array_ApAp[256];
930  __shared__ NumericT shared_array_pAp[256];
931  __shared__ NumericT shared_array_r0Ap[256];
932  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
933  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
934  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
935  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
936  {
937  __syncthreads();
938  if (threadIdx.x < stride)
939  {
940  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
941  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
942  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
943  }
944  }
945 
946  // write results to result array
947  if (threadIdx.x == 0) {
948  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
949  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
950  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
951  }
952 
953 }
954 
955 
956 template<typename NumericT>
957 void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
958  vector_base<NumericT> const & p,
959  vector_base<NumericT> & Ap,
960  vector_base<NumericT> const & r0star,
961  vector_base<NumericT> & inner_prod_buffer,
962  vcl_size_t buffer_chunk_size,
963  vcl_size_t buffer_chunk_offset)
964 {
965  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
966  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
967  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
968 
969  Ap.clear();
970 
971  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
972  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
973  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
974  detail::cuda_arg<NumericT>(p),
975  detail::cuda_arg<NumericT>(Ap),
976  detail::cuda_arg<NumericT>(r0star),
977  vec_size,
978  detail::cuda_arg<NumericT>(inner_prod_buffer),
979  chunk_size,
980  chunk_offset);
981  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel");
982 }
983 
984 
985 
986 //
987 // ELL Matrix
988 //
989 
990 template<typename NumericT>
991 __global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords,
992  const NumericT * elements,
993  unsigned int internal_row_num,
994  unsigned int items_per_row,
995  const NumericT * p,
996  NumericT * Ap,
997  const NumericT * r0star,
998  unsigned int size,
999  NumericT * inner_prod_buffer,
1000  unsigned int buffer_size,
1001  unsigned int buffer_offset)
1002 {
1003  NumericT inner_prod_ApAp = 0;
1004  NumericT inner_prod_pAp = 0;
1005  NumericT inner_prod_r0Ap = 0;
1006  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1007  unsigned int glb_sz = gridDim.x * blockDim.x;
1008 
1009  for (unsigned int row = glb_id; row < size; row += glb_sz)
1010  {
1011  NumericT sum = 0;
1012 
1013  unsigned int offset = row;
1014  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1015  {
1016  NumericT val = elements[offset];
1017  sum += val ? p[coords[offset]] * val : NumericT(0);
1018  }
1019 
1020  Ap[row] = sum;
1021  inner_prod_ApAp += sum * sum;
1022  inner_prod_pAp += sum * p[row];
1023  inner_prod_r0Ap += sum * r0star[row];
1024  }
1025 
1027  __shared__ NumericT shared_array_ApAp[256];
1028  __shared__ NumericT shared_array_pAp[256];
1029  __shared__ NumericT shared_array_r0Ap[256];
1030  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1031  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1032  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1033  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
1034  {
1035  __syncthreads();
1036  if (threadIdx.x < stride)
1037  {
1038  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1039  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1040  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1041  }
1042  }
1043 
1044  // write results to result array
1045  if (threadIdx.x == 0) {
1046  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1047  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1048  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1049  }
1050 }
1051 
1052 
1053 template<typename NumericT>
1054 void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
1055  vector_base<NumericT> const & p,
1056  vector_base<NumericT> & Ap,
1057  vector_base<NumericT> const & r0star,
1058  vector_base<NumericT> & inner_prod_buffer,
1059  vcl_size_t buffer_chunk_size,
1060  vcl_size_t buffer_chunk_offset)
1061 {
1062  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1063  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1064  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1065 
1066  pipelined_bicgstab_ell_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1067  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
1068  static_cast<unsigned int>(A.internal_size1()),
1069  static_cast<unsigned int>(A.maxnnz()),
1070  detail::cuda_arg<NumericT>(p),
1071  detail::cuda_arg<NumericT>(Ap),
1072  detail::cuda_arg<NumericT>(r0star),
1073  vec_size,
1074  detail::cuda_arg<NumericT>(inner_prod_buffer),
1075  chunk_size,
1076  chunk_offset);
1077  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel");
1078 }
1079 
1080 
1081 //
1082 // SELL-C-\sigma Matrix
1083 //
1084 
1085 template<typename NumericT>
1086 __global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
1087  const unsigned int * column_indices,
1088  const unsigned int * block_start,
1089  const NumericT * elements,
1090  const NumericT * p,
1091  NumericT * Ap,
1092  const NumericT * r0star,
1093  unsigned int size,
1094  NumericT * inner_prod_buffer,
1095  unsigned int buffer_size,
1096  unsigned int buffer_offset)
1097 {
1098  NumericT inner_prod_ApAp = 0;
1099  NumericT inner_prod_pAp = 0;
1100  NumericT inner_prod_r0Ap = 0;
1101  unsigned int local_id = threadIdx.x;
1102  unsigned int local_size = blockDim.x;
1103 
1104  for (unsigned int block_idx = blockIdx.x; block_idx <= size / local_size; block_idx += gridDim.x)
1105  {
1106  unsigned int row = block_idx * local_size + local_id;
1107  unsigned int offset = block_start[block_idx];
1108  unsigned int num_columns = columns_per_block[block_idx];
1109 
1110  NumericT sum = 0;
1111  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
1112  {
1113  unsigned int index = offset + item_id * local_size + local_id;
1114  NumericT val = elements[index];
1115 
1116  sum += val ? (p[column_indices[index]] * val) : 0;
1117  }
1118 
1119  if (row < size)
1120  {
1121  Ap[row] = sum;
1122  inner_prod_ApAp += sum * sum;
1123  inner_prod_pAp += sum * p[row];
1124  inner_prod_r0Ap += sum * r0star[row];
1125  }
1126  }
1127 
1129  __shared__ NumericT shared_array_ApAp[256];
1130  __shared__ NumericT shared_array_pAp[256];
1131  __shared__ NumericT shared_array_r0Ap[256];
1132  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1133  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1134  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1135  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
1136  {
1137  __syncthreads();
1138  if (threadIdx.x < stride)
1139  {
1140  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1141  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1142  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1143  }
1144  }
1145 
1146  // write results to result array
1147  if (threadIdx.x == 0) {
1148  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1149  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1150  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1151  }
1152 }
1153 
1154 template<typename NumericT>
1155 void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
1156  vector_base<NumericT> const & p,
1157  vector_base<NumericT> & Ap,
1158  vector_base<NumericT> const & r0star,
1159  vector_base<NumericT> & inner_prod_buffer,
1160  vcl_size_t buffer_chunk_size,
1161  vcl_size_t buffer_chunk_offset)
1162 {
1163  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1164  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1165  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1166 
1167  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<128, A.rows_per_block()>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
1168  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1169  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1170  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
1171  detail::cuda_arg<NumericT>(p),
1172  detail::cuda_arg<NumericT>(Ap),
1173  detail::cuda_arg<NumericT>(r0star),
1174  vec_size,
1175  detail::cuda_arg<NumericT>(inner_prod_buffer),
1176  chunk_size,
1177  chunk_offset);
1178  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel");
1179 }
1180 
1181 
1182 //
1183 // Hybrid Matrix
1184 //
1185 
1186 
1187 template<typename NumericT>
1188 __global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords,
1189  const NumericT * ell_elements,
1190  const unsigned int * csr_rows,
1191  const unsigned int * csr_cols,
1192  const NumericT * csr_elements,
1193  unsigned int internal_row_num,
1194  unsigned int items_per_row,
1195  const NumericT * p,
1196  NumericT * Ap,
1197  const NumericT * r0star,
1198  unsigned int size,
1199  NumericT * inner_prod_buffer,
1200  unsigned int buffer_size,
1201  unsigned int buffer_offset)
1202 {
1203  NumericT inner_prod_ApAp = 0;
1204  NumericT inner_prod_pAp = 0;
1205  NumericT inner_prod_r0Ap = 0;
1206  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1207  unsigned int glb_sz = gridDim.x * blockDim.x;
1208 
1209  for (unsigned int row = glb_id; row < size; row += glb_sz)
1210  {
1211  NumericT sum = 0;
1212 
1213  unsigned int offset = row;
1214  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1215  {
1216  NumericT val = ell_elements[offset];
1217 
1218  sum += val ? p[ell_coords[offset]] * val : NumericT(0);
1219  }
1220 
1221  unsigned int col_begin = csr_rows[row];
1222  unsigned int col_end = csr_rows[row + 1];
1223 
1224  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
1225  {
1226  sum += p[csr_cols[item_id]] * csr_elements[item_id];
1227  }
1228 
1229  Ap[row] = sum;
1230  inner_prod_ApAp += sum * sum;
1231  inner_prod_pAp += sum * p[row];
1232  inner_prod_r0Ap += sum * r0star[row];
1233  }
1234 
1236  __shared__ NumericT shared_array_ApAp[256];
1237  __shared__ NumericT shared_array_pAp[256];
1238  __shared__ NumericT shared_array_r0Ap[256];
1239  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1240  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1241  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1242  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
1243  {
1244  __syncthreads();
1245  if (threadIdx.x < stride)
1246  {
1247  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1248  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1249  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1250  }
1251  }
1252 
1253  // write results to result array
1254  if (threadIdx.x == 0) {
1255  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1256  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1257  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1258  }
1259 }
1260 
1261 
1262 
1263 template<typename NumericT>
1264 void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
1265  vector_base<NumericT> const & p,
1266  vector_base<NumericT> & Ap,
1267  vector_base<NumericT> const & r0star,
1268  vector_base<NumericT> & inner_prod_buffer,
1269  vcl_size_t buffer_chunk_size,
1270  vcl_size_t buffer_chunk_offset)
1271 {
1272  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1273  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1274  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1275 
1276  pipelined_bicgstab_hyb_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1277  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
1278  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1279  detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
1280  detail::cuda_arg<NumericT>(A.handle5().cuda_handle()),
1281  static_cast<unsigned int>(A.internal_size1()),
1282  static_cast<unsigned int>(A.ell_nnz()),
1283  detail::cuda_arg<NumericT>(p),
1284  detail::cuda_arg<NumericT>(Ap),
1285  detail::cuda_arg<NumericT>(r0star),
1286  vec_size,
1287  detail::cuda_arg<NumericT>(inner_prod_buffer),
1288  chunk_size,
1289  chunk_offset);
1290  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel");
1291 }
1292 
1294 
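// Pipelined GMRES helpers: normalization of the current Krylov vector v_k, a two-stage
// classical Gram-Schmidt orthogonalization, the update of the result vector from the Krylov
// basis, and matrix-vector product wrappers which reuse the pipelined CG kernels from above.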
1295 template <typename T>
1296 __global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
1297  unsigned int vk_offset,
1298  T const * residual,
1299  T * R_buffer,
1300  unsigned int R_offset,
1301  T const * inner_prod_buffer,
1302  unsigned int chunk_size,
1303  T * r_dot_vk_buffer,
1304  unsigned int chunk_offset,
1305  unsigned int size)
1306 {
1307  __shared__ T shared_array[128];
1308  T norm_vk = 0;
1309 
1310  // parallel reduction in work group to compute <vk, vk>
1311  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
1312  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
1313  {
1314  __syncthreads();
1315  if (threadIdx.x < stride)
1316  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1317  }
1318 
1319  // compute alpha from reduced values:
1320  __syncthreads();
1321  norm_vk = sqrt(shared_array[0]);
1322 
1323  T inner_prod_contrib = 0;
1324  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
1325  T value_vk = vk[i + vk_offset] / norm_vk;
1326 
1327  inner_prod_contrib += residual[i] * value_vk;
1328 
1329  vk[i + vk_offset] = value_vk;
1330  }
1331  __syncthreads();
1332 
1333  // parallel reduction in work group
1334  shared_array[threadIdx.x] = inner_prod_contrib;
1335  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
1336  {
1337  __syncthreads();
1338  if (threadIdx.x < stride)
1339  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1340  }
1341 
1342  // write results of first reduction stage:
1343  if (threadIdx.x == 0)
1344  r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];
1345  // store norm:
1346  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
1347  R_buffer[R_offset] = norm_vk;
1348 }
1349 
1357 template <typename T>
1358 void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
1359  vector_base<T> const & residual,
1360  vector_base<T> & R_buffer,
1361  vcl_size_t offset_in_R,
1362  vector_base<T> const & inner_prod_buffer,
1363  vector_base<T> & r_dot_vk_buffer,
1364  vcl_size_t buffer_chunk_size,
1365  vcl_size_t buffer_chunk_offset)
1366 {
1367  unsigned int vk_offset = viennacl::traits::start(v_k);
1368  unsigned int R_offset = offset_in_R;
1369  unsigned int chunk_size = buffer_chunk_size;
1370  unsigned int chunk_offset = buffer_chunk_offset;
1371  unsigned int size = v_k.size();
1372 
1373  pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(detail::cuda_arg<T>(v_k),
1374  vk_offset,
1375  detail::cuda_arg<T>(residual),
1376  detail::cuda_arg<T>(R_buffer),
1377  R_offset,
1378  detail::cuda_arg<T>(inner_prod_buffer),
1379  chunk_size,
1380  detail::cuda_arg<T>(r_dot_vk_buffer),
1381  chunk_offset,
1382  size);
1383  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel");
1384 }
1385 
1386 
1387 
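// Gram-Schmidt stage 1: computes the inner products <v_i, v_k> of the new Krylov vector v_k
// with the previous basis vectors v_0, ..., v_{k-1}, handling up to seven basis vectors per
// sweep so that the partial sums fit into the 7*128 entries of shared memory. Each block writes
// its partial results to vi_in_vk_buffer; the reduction is completed in stage 2 below.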
1388 template <typename T>
1389 __global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis,
1390  unsigned int size,
1391  unsigned int internal_size,
1392  unsigned int k,
1393  T * vi_in_vk_buffer,
1394  unsigned int chunk_size)
1395 {
1396  __shared__ T shared_array[7*128];
1397  T vi_in_vk[7];
1398  T value_vk = 0;
1399 
1400  unsigned int k_base = 0;
1401  while (k_base < k)
1402  {
1403  unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
1404  vi_in_vk[0] = 0;
1405  vi_in_vk[1] = 0;
1406  vi_in_vk[2] = 0;
1407  vi_in_vk[3] = 0;
1408  vi_in_vk[4] = 0;
1409  vi_in_vk[5] = 0;
1410  vi_in_vk[6] = 0;
1411  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
1412  value_vk = krylov_basis[i + k * internal_size];
1413 
1414  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1415  vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size];
1416  }
1417 
1418  // parallel reduction in work group
1419  for (uint j=0; j<vecs_in_iteration; ++j)
1420  shared_array[threadIdx.x + j*chunk_size] = vi_in_vk[j];
1421  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
1422  {
1423  __syncthreads();
1424  if (threadIdx.x < stride) {
1425  for (uint j=0; j<vecs_in_iteration; ++j)
1426  shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
1427  }
1428  }
1429 
1430  // write results to result array
1431  if (threadIdx.x == 0)
1432  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1433  vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];
1434 
1435  k_base += vecs_in_iteration;
1436  }
1437 
1438 }
1439 
1440 template <typename T>
1441 void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
1442  vcl_size_t v_k_size,
1443  vcl_size_t v_k_internal_size,
1444  vcl_size_t param_k,
1445  vector_base<T> & vi_in_vk_buffer,
1446  vcl_size_t buffer_chunk_size)
1447 {
1448  unsigned int chunk_size = buffer_chunk_size;
1449  unsigned int size = v_k_size;
1450  unsigned int internal_size = v_k_internal_size;
1451  unsigned int k = param_k;
1452 
1453  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(detail::cuda_arg<T>(device_krylov_basis),
1454  size,
1455  internal_size,
1456  k,
1457  detail::cuda_arg<T>(vi_in_vk_buffer),
1458  chunk_size);
1459  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel");
1460 }
1461 
1462 
1463 
1464 
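// Gram-Schmidt stage 2: finishes the reduction of the <v_i, v_k> values from stage 1, subtracts
// the projections from v_k, writes the resulting coefficients into R_buffer, and accumulates
// block-wise partial sums of <v_k, v_k> for the subsequent normalization of v_k.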
1465 template <typename T>
1466 __global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
1467  unsigned int size,
1468  unsigned int internal_size,
1469  unsigned int k,
1470  T const * vi_in_vk_buffer,
1471  unsigned int chunk_size,
1472  T * R_buffer,
1473  unsigned int krylov_dim,
1474  T * inner_prod_buffer)
1475 {
1476  __shared__ T shared_array[7*128];
1477  T vk_dot_vk = 0;
1478  T value_vk = 0;
1479 
1480  unsigned int k_base = 0;
1481  while (k_base < k)
1482  {
1483  unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
1484 
1485  // parallel reduction in work group for <v_i, v_k>
1486  for (uint j=0; j<vecs_in_iteration; ++j)
1487  shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];
1488  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
1489  {
1490  __syncthreads();
1491  if (threadIdx.x < stride) {
1492  for (uint j=0; j<vecs_in_iteration; ++j)
1493  shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
1494  }
1495  }
1496  __syncthreads();
1497 
1498  // v_k -= <v_i, v_k> v_i:
1499  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
1500  {
1501  value_vk = krylov_basis[i + k * internal_size];
1502 
1503  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1504  value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
1505  vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;
1506  krylov_basis[i + k * internal_size] = value_vk;
1507  }
1508 
1509  // write to R: (to avoid thread divergence, all threads write the same value)
1510  if (blockIdx.x == 0)
1511  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1512  R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];
1513  __syncthreads();
1514 
1515  k_base += vecs_in_iteration;
1516  }
1517 
1518  // parallel reduction in work group for <v_k, v_k>
1519  shared_array[threadIdx.x] = vk_dot_vk;
1520  for (uint stride=blockDim.x/2; stride > 0; stride /= 2)
1521  {
1522  __syncthreads();
1523  if (threadIdx.x < stride)
1524  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1525  }
1526 
1527  // write results to result array
1528  if (threadIdx.x == 0)
1529  inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
1530 }
1531 
1532 template <typename T>
1533 void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
1534  vcl_size_t v_k_size,
1535  vcl_size_t v_k_internal_size,
1536  vcl_size_t param_k,
1537  vector_base<T> const & vi_in_vk_buffer,
1538  vector_base<T> & R_buffer,
1539  vcl_size_t krylov_dim,
1540  vector_base<T> & inner_prod_buffer,
1541  vcl_size_t buffer_chunk_size)
1542 {
1543  unsigned int chunk_size = buffer_chunk_size;
1544  unsigned int size = v_k_size;
1545  unsigned int internal_size = v_k_internal_size;
1546  unsigned int k = param_k;
1547  unsigned int krylov = krylov_dim;
1548 
1549  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(detail::cuda_arg<T>(device_krylov_basis),
1550  size,
1551  internal_size,
1552  k,
1553  detail::cuda_arg<T>(vi_in_vk_buffer),
1554  chunk_size,
1555  detail::cuda_arg<T>(R_buffer),
1556  krylov,
1557  detail::cuda_arg<T>(inner_prod_buffer));
1558  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel");
1559 }
1560 
1561 
1562 
1563 
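// Result update: computes x <- x + eta_0 * r + sum_{j=1..k-1} eta_j * v_{j-1}, where
// 'coefficients' holds the update coefficients eta_j determined by the solver host code and
// the residual r takes the role of the first Krylov vector.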
1564 template <typename T>
1565 __global__ void pipelined_gmres_update_result_kernel(T * result,
1566  T const * residual,
1567  T const * krylov_basis,
1568  unsigned int size,
1569  unsigned int internal_size,
1570  T const * coefficients,
1571  unsigned int k)
1572 {
1573  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
1574  {
1575  T value_result = result[i] + coefficients[0] * residual[i];
1576 
1577  for (unsigned int j = 1; j < k; ++j)
1578  value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];
1579 
1580  result[i] = value_result;
1581  }
1582 }
1583 
1584 template <typename T>
1585 void pipelined_gmres_update_result(vector_base<T> & result,
1586  vector_base<T> const & residual,
1587  vector_base<T> const & krylov_basis,
1588  vcl_size_t v_k_size,
1589  vcl_size_t v_k_internal_size,
1590  vector_base<T> const & coefficients,
1591  vcl_size_t param_k)
1592 {
1593  unsigned int size = v_k_size;
1594  unsigned int internal_size = v_k_internal_size;
1595  unsigned int k = param_k;
1596 
1597  pipelined_gmres_update_result_kernel<<<128, 128>>>(detail::cuda_arg<T>(result),
1598  detail::cuda_arg<T>(residual),
1599  detail::cuda_arg<T>(krylov_basis),
1600  size,
1601  internal_size,
1602  detail::cuda_arg<T>(coefficients),
1603  k);
1604  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel");
1605 }
1606 
1607 
1608 
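// The pipelined_gmres_prod() overloads below reuse the pipelined CG matrix-vector kernels;
// the main difference to the CG wrappers is that the raw pointers for p and Ap are shifted by
// viennacl::traits::start() so that vectors with a nonzero start offset (sub-vectors of the
// Krylov basis) are handled correctly.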
1609 template <typename T>
1610 void pipelined_gmres_prod(compressed_matrix<T> const & A,
1611  vector_base<T> const & p,
1612  vector_base<T> & Ap,
1613  vector_base<T> & inner_prod_buffer)
1614 {
1615  unsigned int size = p.size();
1616  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1617 
1618  pipelined_cg_csr_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
1619  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1620  detail::cuda_arg<T>(A.handle().cuda_handle()),
1621  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1622  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1623  size,
1624  detail::cuda_arg<T>(inner_prod_buffer),
1625  buffer_size_per_vector);
1626  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
1627 }
1628 
1629 template <typename T>
1630 void pipelined_gmres_prod(coordinate_matrix<T> const & A,
1631  vector_base<T> const & p,
1632  vector_base<T> & Ap,
1633  vector_base<T> & inner_prod_buffer)
1634 {
1635  unsigned int size = p.size();
1636  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1637 
1638  Ap.clear();
1639 
1640  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
1641  detail::cuda_arg<T>(A.handle().cuda_handle()),
1642  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1643  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1644  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1645  size,
1646  detail::cuda_arg<T>(inner_prod_buffer),
1647  buffer_size_per_vector);
1648  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
1649 }
1650 
1651 template <typename T>
1652 void pipelined_gmres_prod(ell_matrix<T> const & A,
1653  vector_base<T> const & p,
1654  vector_base<T> & Ap,
1655  vector_base<T> & inner_prod_buffer)
1656 {
1657  unsigned int size = p.size();
1658  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1659 
1660  pipelined_cg_ell_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1661  detail::cuda_arg<T>(A.handle().cuda_handle()),
1662  static_cast<unsigned int>(A.internal_size1()),
1663  static_cast<unsigned int>(A.maxnnz()),
1664  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1665  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1666  size,
1667  detail::cuda_arg<T>(inner_prod_buffer),
1668  buffer_size_per_vector);
1669  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
1670 }
1671 
1672 template <typename T>
1673 void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
1674  vector_base<T> const & p,
1675  vector_base<T> & Ap,
1676  vector_base<T> & inner_prod_buffer)
1677 {
1678  unsigned int size = p.size();
1679  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1680 
1681  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, A.rows_per_block()>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
1682  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1683  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1684  detail::cuda_arg<T>(A.handle().cuda_handle()),
1685  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1686  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1687  size,
1688  detail::cuda_arg<T>(inner_prod_buffer),
1689  buffer_size_per_vector);
1690  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
1691 }
1692 
1693 
1694 template <typename T>
1695 void pipelined_gmres_prod(hyb_matrix<T> const & A,
1696  vector_base<T> const & p,
1697  vector_base<T> & Ap,
1698  vector_base<T> & inner_prod_buffer)
1699 {
1700  unsigned int size = p.size();
1701  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1702 
1703  pipelined_cg_hyb_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1704  detail::cuda_arg<T>(A.handle().cuda_handle()),
1705  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1706  detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
1707  detail::cuda_arg<T>(A.handle5().cuda_handle()),
1708  static_cast<unsigned int>(A.internal_size1()),
1709  static_cast<unsigned int>(A.ell_nnz()),
1710  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1711  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1712  size,
1713  detail::cuda_arg<T>(inner_prod_buffer),
1714  buffer_size_per_vector);
1715  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
1716 }
1717 
1718 
1719 
1720 } // namespace cuda
1721 } //namespace linalg
1722 } //namespace viennacl
1723 
1724 
1725 #endif
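For orientation, here is a minimal, hypothetical sketch of how the two fused CUDA building blocks from this file could be driven directly; in practice the pipelined CG solver layer of ViennaCL calls them internally. The helper name cg_step_sketch, the chunk size of 128 partial sums per inner product, and the choice of compressed_matrix are illustrative assumptions, and the code must be compiled with nvcc in a CUDA-enabled ViennaCL build. Reading back inner_prod_buffer and summing its three chunks to obtain the scalar inner products is omitted.

// Hypothetical driver sketch (not part of ViennaCL); compile with nvcc and VIENNACL_WITH_CUDA.
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cuda/iterative_operations.hpp"

template<typename NumericT>
void cg_step_sketch(viennacl::compressed_matrix<NumericT> const & A,
                    viennacl::vector<NumericT> & x,
                    viennacl::vector<NumericT> & p,
                    viennacl::vector<NumericT> & r,
                    viennacl::vector<NumericT> & Ap,
                    NumericT alpha, NumericT beta)
{
  // partial-sum buffer: three chunks, 128 entries each (chunk size is an assumption)
  viennacl::vector<NumericT> inner_prod_buffer(3 * 128);

  // fused sparse matrix-vector product Ap = A * p plus block-wise partial inner products
  viennacl::linalg::cuda::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);

  // fused updates of x, r and p plus block-wise partial sums of the new <r, r>
  viennacl::linalg::cuda::pipelined_cg_vector_update(x, alpha, p, r, Ap, beta, inner_prod_buffer);
}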