ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
sparse_matrix_operations_solve.hpp
#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include "viennacl/forwards.h"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
//
// Compressed matrix
//

//
// non-transposed
//

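// Forward substitution x <- L^{-1} x for a unit lower triangular CSR matrix
// (the diagonal is assumed to be 1 and is not stored). All threads of the block
// cooperatively stage a window of nonzeros in shared memory; thread 0 then runs
// the inherently sequential substitution over that window.
//
// Host-side launch sketch (illustrative only; the actual calls are issued by
// ViennaCL's solver wrappers). The 128-entry shared buffers assume a single
// block of 128 threads:
//
//   csr_unit_lu_forward_kernel<<<1, 128>>>(row_ptr, col_idx, values, x, n);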
template<typename NumericT>
__global__ void csr_unit_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = 0;
  unsigned int row_at_window_start = 0;
  NumericT current_vector_entry = vector[0];
  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
  unsigned int next_row = row_indices[1];

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data:
      for (unsigned int k=0; k<blockDim.x; ++k)
      {
        if (current_row < size && i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry;
          ++current_row;
          if (current_row < size) //load next row's data
          {
            next_row = row_indices[current_row+1];
            current_vector_entry = vector[current_row];
          }
        }

        if (current_row < size && col_index_buffer[k] < current_row) //substitute
        {
          if (col_index_buffer[k] < row_at_window_start) //entry was solved before this window: the buffered copy is valid
            current_vector_entry -= element_buffer[k] * vector_buffer[k];
          else if (col_index_buffer[k] < current_row) //entry was solved within this window: reload the updated value
            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();
  } //for i
}


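// Same forward substitution scheme as above, but for a general (non-unit) lower
// triangular CSR matrix: the diagonal entry of each row is picked up on the fly
// and the accumulated result is divided by it when the row is written back.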
template<typename NumericT>
__global__ void csr_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = 0;
  unsigned int row_at_window_start = 0;
  NumericT current_vector_entry = vector[0];
  NumericT diagonal_entry = 0;
  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
  unsigned int next_row = row_indices[1];

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data:
      for (unsigned int k=0; k<blockDim.x; ++k)
      {
        if (current_row < size && i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry / diagonal_entry;
          ++current_row;
          if (current_row < size) //load next row's data
          {
            next_row = row_indices[current_row+1];
            current_vector_entry = vector[current_row];
          }
        }

        if (current_row < size && col_index_buffer[k] < current_row) //substitute
        {
          if (col_index_buffer[k] < row_at_window_start) //entry was solved before this window: the buffered copy is valid
            current_vector_entry -= element_buffer[k] * vector_buffer[k];
          else if (col_index_buffer[k] < current_row) //entry was solved within this window: reload the updated value
            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        }
        else if (col_index_buffer[k] == current_row)
          diagonal_entry = element_buffer[k];

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();
  } //for i
}


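// Backward substitution x <- U^{-1} x for a unit upper triangular CSR matrix:
// the same window scheme as the forward kernels, but the nonzeros are traversed
// from the last entry towards the first.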
template<typename NumericT>
__global__ void csr_unit_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = size-1;
  unsigned int row_at_window_start = size-1;
  NumericT current_vector_entry = vector[size-1];
  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
  unsigned int next_row = row_indices[size-1];

  unsigned int i = loop_end + threadIdx.x;
  while (1)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data from back to front:
      for (unsigned int k2=0; k2<blockDim.x; ++k2)
      {
        unsigned int k = (blockDim.x - k2) - 1;

        if (i+k >= nnz)
          continue;

        if (col_index_buffer[k] > row_at_window_start) //entry was solved before this window: the buffered copy is valid
          current_vector_entry -= element_buffer[k] * vector_buffer[k];
        else if (col_index_buffer[k] > current_row) //entry was solved within this window: reload the updated value
          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];

        if (i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry;
          if (current_row > 0) //load next row's data
          {
            --current_row;
            next_row = row_indices[current_row];
            current_vector_entry = vector[current_row];
          }
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();

    if (i < blockDim.x)
      break;

    i -= blockDim.x;
  } //while
}


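// Backward substitution for a general (non-unit) upper triangular CSR matrix:
// identical traversal to the unit variant above, but each finished row is
// divided by its diagonal entry, which is picked up while scanning the row.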
template<typename NumericT>
__global__ void csr_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = size-1;
  unsigned int row_at_window_start = size-1;
  NumericT current_vector_entry = vector[size-1];
  NumericT diagonal_entry;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
  unsigned int next_row = row_indices[size-1];

  unsigned int i = loop_end + threadIdx.x;
  while (1)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data from back to front:
      for (unsigned int k2=0; k2<blockDim.x; ++k2)
      {
        unsigned int k = (blockDim.x - k2) - 1;

        if (i+k >= nnz)
          continue;

        if (col_index_buffer[k] > row_at_window_start) //entry was solved before this window: the buffered copy is valid
          current_vector_entry -= element_buffer[k] * vector_buffer[k];
        else if (col_index_buffer[k] > current_row) //entry was solved within this window: reload the updated value
          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        else if (col_index_buffer[k] == current_row)
          diagonal_entry = element_buffer[k];

        if (i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry / diagonal_entry;
          if (current_row > 0) //load next row's data
          {
            --current_row;
            next_row = row_indices[current_row];
            current_vector_entry = vector[current_row];
          }
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();

    if (i < blockDim.x)
      break;

    i -= blockDim.x;
  } //while
}



//
// transposed
//

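// Forward substitution with unit diagonal where the triangular factor is stored
// transposed: the matrix is traversed row by row, and once vector[row] is final
// its contribution is scattered in parallel into all entries with column index
// greater than row.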
template<typename NumericT>
__global__ void csr_trans_unit_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  for (unsigned int row = 0; row < size; ++row)
  {
    NumericT result_entry = vector[row];

    unsigned int row_start = row_indices[row];
    unsigned int row_stop  = row_indices[row + 1];
    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
    {
      unsigned int col_index = column_indices[entry_index];
      if (col_index > row)
        vector[col_index] -= result_entry * elements[entry_index];
    }

    __syncthreads();
  }
}

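// Window-based variant of the transposed forward substitution (no diagonal
// division): nonzeros are processed in blocks of blockDim.x, and a shared-memory
// lookahead of the row pointers maps each nonzero back to the row it belongs to.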
template<typename NumericT>
__global__ void csr_trans_lu_forward_kernel2(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = 0;
  unsigned int row_at_window_end = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    col_index    = (i < nnz) ? column_indices[i] : 0;
    matrix_entry = (i < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;

    __syncthreads();

    if (i < nnz)
    {
      unsigned int row_index_inc = 0;
      while (i >= row_index_lookahead[row_index_inc + 1])
        ++row_index_inc;
      row_index = row_at_window_start + row_index_inc;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = size - 1;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //forward elimination
    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
    {
      NumericT result_entry = vector[row];

      if ( (row_index == row) && (col_index > row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

}

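// Transposed forward substitution for a non-unit triangular factor: the diagonal
// entries are supplied as a separate array, each scattered contribution is scaled
// by the reciprocal diagonal, and the result vector is divided by the diagonal in
// a final sweep.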
template<typename NumericT>
__global__ void csr_trans_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = 0;
  unsigned int row_at_window_end = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    col_index    = (i < nnz) ? column_indices[i] : 0;
    matrix_entry = (i < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;

    __syncthreads();

    if (i < nnz)
    {
      unsigned int row_index_inc = 0;
      while (i >= row_index_lookahead[row_index_inc + 1])
        ++row_index_inc;
      row_index = row_at_window_start + row_index_inc;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = size - 1;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //forward elimination
    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
    {
      NumericT result_entry = vector[row] / diagonal_entries[row];

      if ( (row_index == row) && (col_index > row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

  // final step: Divide vector by diagonal entries:
  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
    vector[i] /= diagonal_entries[i];

}


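// Window-based backward substitution with unit diagonal for the transposed
// factor: nonzeros are walked from the last entry towards the first, again using
// a shared-memory lookahead of the row pointers to recover each entry's row.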
template<typename NumericT>
__global__ void csr_trans_unit_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = size;
  unsigned int row_at_window_end;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
  {
    unsigned int i = (nnz - i2) - 1;
    col_index    = (i2 < nnz) ? column_indices[i] : 0;
    matrix_entry = (i2 < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

    __syncthreads();

    if (i2 < nnz)
    {
      unsigned int row_index_dec = 0;
      while (row_index_lookahead[row_index_dec] > i)
        ++row_index_dec;
      row_index = row_at_window_start - row_index_dec;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = 0;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //backward elimination
    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
    {
      unsigned int row = row_at_window_start - row2;
      NumericT result_entry = vector[row];

      if ( (row_index == row) && (col_index < row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

}


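// Simple (one row at a time) backward substitution for the transposed non-unit
// factor: each row's entry is divided by its diagonal value and then scattered
// in parallel into the entries with column index smaller than row.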
template<typename NumericT>
__global__ void csr_trans_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  NumericT result_entry = 0;

  //backward elimination, using U and D:
  for (unsigned int row2 = 0; row2 < size; ++row2)
  {
    unsigned int row = (size - row2) - 1;
    result_entry = vector[row] / diagonal_entries[row];

    unsigned int row_start = row_indices[row];
    unsigned int row_stop  = row_indices[row + 1];
    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
    {
      unsigned int col_index = column_indices[entry_index];
      if (col_index < row)
        vector[col_index] -= result_entry * elements[entry_index];
    }

    __syncthreads();

    if (threadIdx.x == 0)
      vector[row] = result_entry;
  }
}


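// Window-based backward substitution for the transposed non-unit factor:
// combines the back-to-front window traversal with an explicit diagonal array,
// and divides the result vector by the diagonal entries in a final sweep.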
template<typename NumericT>
__global__ void csr_trans_lu_backward_kernel2(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = size;
  unsigned int row_at_window_end;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
  {
    unsigned int i = (nnz - i2) - 1;
    col_index    = (i2 < nnz) ? column_indices[i] : 0;
    matrix_entry = (i2 < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

    __syncthreads();

    if (i2 < nnz)
    {
      unsigned int row_index_dec = 0;
      while (row_index_lookahead[row_index_dec] > i)
        ++row_index_dec;
      row_index = row_at_window_start - row_index_dec;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = 0;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //backward elimination
    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
    {
      unsigned int row = row_at_window_start - row2;
      NumericT result_entry = vector[row] / diagonal_entries[row];

      if ( (row_index == row) && (col_index < row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }


  // final step: Divide vector by diagonal entries:
  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
    vector[i] /= diagonal_entries[i];

}


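// Block-parallel forward substitution on the unit lower factor L (stored
// transposed): each thread block handles the column range given by its entry in
// block_offsets and eliminates those columns independently of the other blocks.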
template<typename NumericT>
__global__ void csr_block_trans_unit_lu_forward(
          const unsigned int * row_jumper_L,      //L part (note that L is transposed in memory)
          const unsigned int * column_indices_L,
          const NumericT * elements_L,
          const unsigned int * block_offsets,
                NumericT * result,
          unsigned int size)
{
  unsigned int col_start = block_offsets[2*blockIdx.x];
  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
  unsigned int row_start = row_jumper_L[col_start];
  unsigned int row_stop;
  NumericT result_entry = 0;

  if (col_start >= col_stop)
    return;

  //forward elimination, using L:
  for (unsigned int col = col_start; col < col_stop; ++col)
  {
    result_entry = result[col];
    row_stop = row_jumper_L[col + 1];
    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
    row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)
    __syncthreads();
  }

}


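// Block-parallel backward substitution on the upper factor U (stored transposed)
// with its diagonal kept in a separate array: each thread block eliminates its
// column range back to front and finally divides those entries by the diagonal.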
template<typename NumericT>
__global__ void csr_block_trans_lu_backward(
          const unsigned int * row_jumper_U,      //U part (note that U is transposed in memory)
          const unsigned int * column_indices_U,
          const NumericT * elements_U,
          const NumericT * diagonal_U,
          const unsigned int * block_offsets,
                NumericT * result,
          unsigned int size)
{
  unsigned int col_start = block_offsets[2*blockIdx.x];
  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
  unsigned int row_start;
  unsigned int row_stop;
  NumericT result_entry = 0;

  if (col_start >= col_stop)
    return;

  //backward elimination, using U and diagonal_U
  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter)
  {
    unsigned int col = (col_stop - iter) - 1;
    result_entry = result[col] / diagonal_U[col];
    row_start = row_jumper_U[col];
    row_stop  = row_jumper_U[col + 1];
    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
    __syncthreads();
  }

  //divide result vector by diagonal:
  for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
    result[col] /= diagonal_U[col];
}



//
// Coordinate Matrix
//



//
// ELL Matrix
//



//
// Hybrid Matrix
//



} // namespace cuda
} // namespace linalg
} // namespace viennacl


#endif