#ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
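
/** @file viennacl/linalg/cuda/iterative_operations.hpp
    @brief CUDA implementations of the specialized kernels for pipelined iterative solvers (CG, BiCGStab, GMRES).
*/

// Minimal include set, following the pattern of the other CUDA backend headers of ViennaCL
// (forward declarations, scalar/vector types, and the common CUDA helpers that provide
// detail::cuda_arg and VIENNACL_CUDA_LAST_ERROR_CHECK):
#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cuda/common.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{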
template<typename NumericT>
__global__ void pipelined_cg_vector_kernel(NumericT * result, NumericT alpha,
                                           NumericT * p, NumericT * r,
                                           NumericT const * Ap, NumericT beta,
                                           NumericT * inner_prod_buffer, unsigned int size)
{
  NumericT inner_prod_contrib = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT value_p = p[i];
    NumericT value_r = r[i];

    result[i] += alpha * value_p;
    value_r   -= alpha * Ap[i];
    value_p    = value_r + beta * value_p;

    p[i] = value_p;
    r[i] = value_r;
    inner_prod_contrib += value_r * value_r;
  }

  // block-wise reduction of the partial sums of <r, r>
  __shared__ NumericT shared_array[256];
  shared_array[threadIdx.x] = inner_prod_contrib;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x] = shared_array[0];
}


template<typename NumericT>
void pipelined_cg_vector_update(vector_base<NumericT> & result, NumericT alpha,
                                vector_base<NumericT> & p, vector_base<NumericT> & r,
                                vector_base<NumericT> const & Ap, NumericT beta,
                                vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(result.size());

  pipelined_cg_vector_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(result), alpha,
                                           detail::cuda_arg<NumericT>(p),
                                           detail::cuda_arg<NumericT>(r),
                                           detail::cuda_arg<NumericT>(Ap), beta,
                                           detail::cuda_arg<NumericT>(inner_prod_buffer),
                                           size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
}
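
// Layout of inner_prod_buffer as used by the pipelined CG routines in this file:
// the buffer is split into three chunks of buffer_size = inner_prod_buffer.size() / 3 entries,
// each holding one partial sum per CUDA block:
//   [0,               buffer_size)   ... partial sums of <r, r>    (written by pipelined_cg_vector_kernel)
//   [buffer_size,   2*buffer_size)   ... partial sums of <Ap, Ap>  (written by the *_vec_mul kernels below)
//   [2*buffer_size, 3*buffer_size)   ... partial sums of <p, Ap>   (written by the *_vec_mul kernels below)
// The final reduction over the per-block entries is left to the caller, so only this one small buffer
// has to be read back per iteration instead of three separate scalars.
//
// A minimal host-side sketch of that final reduction (hypothetical names: 'host_buf' is a copy of
// inner_prod_buffer on the host, 'cs' the chunk size):
//
//   NumericT r_dot_r = 0, Ap_dot_Ap = 0, p_dot_Ap = 0;
//   for (unsigned int i = 0; i < cs; ++i) {
//     r_dot_r   += host_buf[i];
//     Ap_dot_Ap += host_buf[i +   cs];
//     p_dot_Ap  += host_buf[i + 2*cs];
//   }
//   NumericT alpha = r_dot_r / p_dot_Ap;   // CG step size for the next update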
template<typename NumericT>
__global__ void pipelined_cg_csr_vec_mul_kernel(const unsigned int * row_indices,
                                                const unsigned int * column_indices,
                                                const NumericT * elements,
                                                const NumericT * p,
                                                NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < size; row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = 0;
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
      dot_prod += elements[i] * p[column_indices[i]];

    Ap[row] = dot_prod;
    inner_prod_ApAp += dot_prod * dot_prod;
    inner_prod_pAp  += dot_prod * p[row];
  }

  // block-wise reduction of the partial sums of <Ap, Ap> and <p, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}
template<typename NumericT>
void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_csr_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(p),
                                                detail::cuda_arg<NumericT>(Ap),
                                                size,
                                                detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
}
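
// Design note: pipelined_cg_prod() fuses the sparse matrix-vector product Ap = A*p with the first
// reduction stage of the inner products <Ap, Ap> and <p, Ap>. This avoids two extra passes over Ap
// per CG iteration; the per-block partial sums land in the second and third chunk of
// inner_prod_buffer (see the layout comment above).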
template<typename NumericT>
__global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, // (row_index, column_index)
                                                const NumericT * elements,
                                                const unsigned int * group_boundaries,
                                                const NumericT * p,
                                                NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;

  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;

    // carry over the partial row sum from the previous chunk:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
      {
        NumericT Ap_entry = inter_results[blockDim.x-1];
        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
        inner_prod_ApAp += Ap_entry * Ap_entry;
        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
      }
    }

    // segmented parallel reduction over equal row indices:
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }

    // write completed row sums at row boundaries:
    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];
    }

    __syncthreads();
  } // for k

  if (local_index + 1 == group_end)
  {
    NumericT Ap_entry = inter_results[threadIdx.x];
    Ap[tmp.x] = Ap_entry;
    inner_prod_ApAp += Ap_entry * Ap_entry;
    inner_prod_pAp  += Ap_entry * p[tmp.x];
  }

  // block-wise reduction of the partial sums of <Ap, Ap> and <p, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}
template<typename NumericT>
void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  Ap.clear();

  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
                                               detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                               detail::cuda_arg<NumericT>(p),
                                               detail::cuda_arg<NumericT>(Ap),
                                               size,
                                               detail::cuda_arg<NumericT>(inner_prod_buffer),
                                               buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
}
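
// The COO kernel processes the nonzeros in contiguous groups (one group per CUDA block, delimited by
// group_boundaries) and uses a segmented scan over shared_rows/inter_results to combine entries that
// belong to the same row; the carry handling at the start of each chunk stitches partial row sums
// together across loop iterations.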
template<typename NumericT>
__global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
                                                const NumericT * elements,
                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,
                                                const NumericT * p,
                                                NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row = glb_id; row < size; row += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = elements[offset];
      sum += val ? p[coords[offset]] * val : NumericT(0);
    }

    Ap[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
  }

  // block-wise reduction of the partial sums of <Ap, Ap> and <p, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}
template<typename NumericT>
void pipelined_cg_prod(ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_ell_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                static_cast<unsigned int>(A.internal_size1()),
                                                static_cast<unsigned int>(A.maxnnz()),
                                                detail::cuda_arg<NumericT>(p),
                                                detail::cuda_arg<NumericT>(Ap),
                                                size,
                                                detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
}
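
// ELL format: coords/elements are stored column-major with a stride of internal_row_num, so each
// thread walks its row with offset += internal_row_num. Padding entries are stored as zeros and are
// skipped via the 'val ? ... : 0' test.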
template<typename NumericT>
__global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
                                                       const unsigned int * column_indices,
                                                       const unsigned int * block_start,
                                                       const NumericT * elements,
                                                       const NumericT * p,
                                                       NumericT * Ap,
                                                       unsigned int size,
                                                       NumericT * inner_prod_buffer,
                                                       unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  unsigned int local_id = threadIdx.x;
  unsigned int local_size = blockDim.x;

  for (unsigned int block_idx = blockIdx.x; block_idx <= size / local_size; block_idx += gridDim.x)
  {
    unsigned int row         = block_idx * local_size + local_id;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    NumericT sum = 0;
    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
    {
      unsigned int index = offset + item_id * local_size + local_id;
      NumericT val = elements[index];

      sum += val ? (p[column_indices[index]] * val) : 0;
    }

    if (row < size)
    {
      Ap[row] = sum;
      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += sum * p[row];
    }
  }

  // block-wise reduction of the partial sums of <Ap, Ap> and <p, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}
template<typename NumericT>
void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                       detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                       detail::cuda_arg<NumericT>(p),
                                                       detail::cuda_arg<NumericT>(Ap),
                                                       size,
                                                       detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                       buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
}
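
// Sliced ELL: each slice of local_size consecutive rows has its own column count (columns_per_block)
// and its own starting offset (block_start), which keeps the zero padding per slice instead of per
// matrix.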
template<typename NumericT>
__global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords,
                                                const NumericT * ell_elements,
                                                const unsigned int * csr_rows,
                                                const unsigned int * csr_cols,
                                                const NumericT * csr_elements,
                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,
                                                const NumericT * p,
                                                NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row = glb_id; row < size; row += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = ell_elements[offset];
      sum += val ? p[ell_coords[offset]] * val : NumericT(0);
    }

    unsigned int col_begin = csr_rows[row];
    unsigned int col_end   = csr_rows[row + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += p[csr_cols[item_id]] * csr_elements[item_id];

    Ap[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
  }

  // block-wise reduction of the partial sums of <Ap, Ap> and <p, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}
template<typename NumericT>
void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_hyb_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle5().cuda_handle()),
                                                static_cast<unsigned int>(A.internal_size1()),
                                                static_cast<unsigned int>(A.ell_nnz()),
                                                detail::cuda_arg<NumericT>(p),
                                                detail::cuda_arg<NumericT>(Ap),
                                                size,
                                                detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
}
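
// HYB format: rows are split into a regular ELL part (ell_coords/ell_elements) plus a CSR remainder
// (csr_rows/csr_cols/csr_elements) for rows with more than items_per_row nonzeros; the kernel simply
// accumulates both contributions into the same row sum.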
template<typename NumericT>
__global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
                                                   NumericT const * residual,
                                                   NumericT const * Ap,
                                                   unsigned int size,
                                                   NumericT * inner_prod_buffer,
                                                   unsigned int chunk_size,
                                                   unsigned int chunk_offset)
{
  NumericT alpha = 0;

  // reduce the per-block partial sums of <r, r0*> (chunk 0) and <Ap, r0*> (fourth chunk):
  __shared__ NumericT shared_array[256];
  __shared__ NumericT shared_array_Ap_in_r0[256];

  shared_array[threadIdx.x]          = inner_prod_buffer[threadIdx.x];
  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array[threadIdx.x]          += shared_array[threadIdx.x + stride];
      shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];
    }
  }
  __syncthreads();

  alpha = shared_array[0] / shared_array_Ap_in_r0[0];

  // update s = r - alpha * Ap and accumulate the partial sums of <s, s>:
  NumericT inner_prod_contrib = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT value_s = s[i];

    value_s = residual[i] - alpha * Ap[i];
    inner_prod_contrib += value_s * value_s;

    s[i] = value_s;
  }
  __syncthreads();

  // block-wise reduction of the partial sums of <s, s>
  shared_array[threadIdx.x] = inner_prod_contrib;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
}
template<typename NumericT>
void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
                                 vector_base<NumericT> & r,
                                 vector_base<NumericT> const & Ap,
                                 vector_base<NumericT> & inner_prod_buffer,
                                 vcl_size_t buffer_chunk_size,
                                 vcl_size_t buffer_chunk_offset)
{
  unsigned int size         = static_cast<unsigned int>(s.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_update_s_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(s),
                                                   detail::cuda_arg<NumericT>(r),
                                                   detail::cuda_arg<NumericT>(Ap),
                                                   size,
                                                   detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                   chunk_size,
                                                   chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel");
}
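
// pipelined_bicgstab_update_s_kernel first reduces the per-block partial sums already present in
// inner_prod_buffer (chunk 0: <r, r0*>, written by pipelined_bicgstab_vector_kernel; chunk at
// 3*chunk_size: <Ap, r0*>, written by the product kernels) to obtain alpha = <r, r0*> / <Ap, r0*>,
// then forms s = r - alpha*Ap and writes the per-block partial sums of <s, s> at chunk_offset.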
template<typename NumericT>
__global__ void pipelined_bicgstab_vector_kernel(NumericT * result, NumericT alpha,
                                                 NumericT * p, NumericT omega,
                                                 NumericT const * s,
                                                 NumericT * residual,
                                                 NumericT const * As, NumericT beta,
                                                 NumericT const * Ap,
                                                 NumericT const * r0star,
                                                 NumericT * inner_prod_buffer,
                                                 unsigned int size)
{
  NumericT inner_prod_r_r0star = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT value_result   = result[i];
    NumericT value_p        = p[i];
    NumericT value_s        = s[i];
    NumericT value_residual = residual[i];
    NumericT value_As       = As[i];
    NumericT value_Ap       = Ap[i];
    NumericT value_r0star   = r0star[i];

    value_result  += alpha * value_p + omega * value_s;
    value_residual = value_s - omega * value_As;
    value_p        = value_residual + beta * (value_p - omega * value_Ap);

    result[i]   = value_result;
    residual[i] = value_residual;
    p[i]        = value_p;
    inner_prod_r_r0star += value_residual * value_r0star;
  }

  // block-wise reduction of the partial sums of <r, r0*>
  __shared__ NumericT shared_array[256];
  shared_array[threadIdx.x] = inner_prod_r_r0star;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x] = shared_array[0];
}
template<typename NumericT>
void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha,
                                      vector_base<NumericT> & p, NumericT omega,
                                      vector_base<NumericT> const & s,
                                      vector_base<NumericT> & residual,
                                      vector_base<NumericT> const & As, NumericT beta,
                                      vector_base<NumericT> const & Ap,
                                      vector_base<NumericT> const & r0star,
                                      vector_base<NumericT> & inner_prod_buffer,
                                      vcl_size_t buffer_chunk_size)
{
  (void)buffer_chunk_size;
  unsigned int size = static_cast<unsigned int>(result.size());

  pipelined_bicgstab_vector_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(result), alpha,
                                                 detail::cuda_arg<NumericT>(p), omega,
                                                 detail::cuda_arg<NumericT>(s),
                                                 detail::cuda_arg<NumericT>(residual),
                                                 detail::cuda_arg<NumericT>(As), beta,
                                                 detail::cuda_arg<NumericT>(Ap),
                                                 detail::cuda_arg<NumericT>(r0star),
                                                 detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                 size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel");
}
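
// pipelined_bicgstab_vector_kernel fuses the three BiCGStab vector updates
//   x += alpha*p + omega*s,   r = s - omega*As,   p = r + beta*(p - omega*Ap)
// into a single pass and already produces the per-block partial sums of <r, r0*> needed for the
// next iteration's alpha.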
template<typename NumericT>
__global__ void pipelined_bicgstab_csr_vec_mul_kernel(const unsigned int * row_indices,
                                                      const unsigned int * column_indices,
                                                      const NumericT * elements,
                                                      const NumericT * p,
                                                      NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < size; row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = 0;
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
      dot_prod += elements[i] * p[column_indices[i]];

    Ap[row] = dot_prod;
    inner_prod_ApAp += dot_prod * dot_prod;
    inner_prod_pAp  += dot_prod * p[row];
    inner_prod_r0Ap += dot_prod * r0star[row];
  }

  // block-wise reduction of the partial sums of <Ap, Ap>, <p, Ap>, and <r0*, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x]   = shared_array_r0Ap[0];
  }
}
template<typename NumericT>
void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size         = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_csr_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(p),
                                                      detail::cuda_arg<NumericT>(Ap),
                                                      detail::cuda_arg<NumericT>(r0star),
                                                      size,
                                                      detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                      chunk_size,
                                                      chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_kernel");
}
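
// The BiCGStab product kernels mirror their CG counterparts but additionally accumulate <r0*, Ap>;
// its per-block partial sums are written to inner_prod_buffer at buffer_offset (passed in as
// buffer_chunk_offset by the caller), next to the <Ap, Ap> and <p, Ap> chunks.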
template<typename NumericT>
__global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords, // (row_index, column_index)
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * p,
                                                      NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;

  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;

    // carry over the partial row sum from the previous chunk:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
      {
        NumericT Ap_entry = inter_results[blockDim.x-1];
        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
        inner_prod_ApAp += Ap_entry * Ap_entry;
        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
        inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;
      }
    }

    // segmented parallel reduction over equal row indices:
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }

    // write completed row sums at row boundaries:
    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];
      inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;
    }

    __syncthreads();
  } // for k

  if (local_index + 1 == group_end)
  {
    NumericT Ap_entry = inter_results[threadIdx.x];
    Ap[tmp.x] = Ap_entry;
    inner_prod_ApAp += Ap_entry * Ap_entry;
    inner_prod_pAp  += Ap_entry * p[tmp.x];
    inner_prod_r0Ap += Ap_entry * r0star[tmp.x];
  }

  // block-wise reduction of the partial sums of <Ap, Ap>, <p, Ap>, and <r0*, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x]   = shared_array_r0Ap[0];
  }
}
template<typename NumericT>
void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size         = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  Ap.clear();

  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                     detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(p),
                                                     detail::cuda_arg<NumericT>(Ap),
                                                     detail::cuda_arg<NumericT>(r0star),
                                                     size,
                                                     detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                     chunk_size,
                                                     chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel");
}
template<typename NumericT>
__global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords,
                                                      const NumericT * elements,
                                                      unsigned int internal_row_num,
                                                      unsigned int items_per_row,
                                                      const NumericT * p,
                                                      NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row = glb_id; row < size; row += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = elements[offset];
      sum += val ? p[coords[offset]] * val : NumericT(0);
    }

    Ap[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];
  }

  // block-wise reduction of the partial sums of <Ap, Ap>, <p, Ap>, and <r0*, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x]   = shared_array_r0Ap[0];
  }
}
template<typename NumericT>
void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size         = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_ell_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                      static_cast<unsigned int>(A.internal_size1()),
                                                      static_cast<unsigned int>(A.maxnnz()),
                                                      detail::cuda_arg<NumericT>(p),
                                                      detail::cuda_arg<NumericT>(Ap),
                                                      detail::cuda_arg<NumericT>(r0star),
                                                      size,
                                                      detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                      chunk_size,
                                                      chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel");
}
template<typename NumericT>
__global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
                                                             const unsigned int * column_indices,
                                                             const unsigned int * block_start,
                                                             const NumericT * elements,
                                                             const NumericT * p,
                                                             NumericT * Ap,
                                                             const NumericT * r0star,
                                                             unsigned int size,
                                                             NumericT * inner_prod_buffer,
                                                             unsigned int buffer_size,
                                                             unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  unsigned int local_id = threadIdx.x;
  unsigned int local_size = blockDim.x;

  for (unsigned int block_idx = blockIdx.x; block_idx <= size / local_size; block_idx += gridDim.x)
  {
    unsigned int row         = block_idx * local_size + local_id;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    NumericT sum = 0;
    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
    {
      unsigned int index = offset + item_id * local_size + local_id;
      NumericT val = elements[index];

      sum += val ? (p[column_indices[index]] * val) : 0;
    }

    if (row < size)
    {
      Ap[row] = sum;
      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += sum * p[row];
      inner_prod_r0Ap += sum * r0star[row];
    }
  }

  // block-wise reduction of the partial sums of <Ap, Ap>, <p, Ap>, and <r0*, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x]   = shared_array_r0Ap[0];
  }
}
template<typename NumericT>
void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size         = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                             detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                             detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                             detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                             detail::cuda_arg<NumericT>(p),
                                                             detail::cuda_arg<NumericT>(Ap),
                                                             detail::cuda_arg<NumericT>(r0star),
                                                             size,
                                                             detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                             chunk_size,
                                                             chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel");
}
template<typename NumericT>
__global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords,
                                                      const NumericT * ell_elements,
                                                      const unsigned int * csr_rows,
                                                      const unsigned int * csr_cols,
                                                      const NumericT * csr_elements,
                                                      unsigned int internal_row_num,
                                                      unsigned int items_per_row,
                                                      const NumericT * p,
                                                      NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row = glb_id; row < size; row += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = ell_elements[offset];
      sum += val ? p[ell_coords[offset]] * val : NumericT(0);
    }

    unsigned int col_begin = csr_rows[row];
    unsigned int col_end   = csr_rows[row + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += p[csr_cols[item_id]] * csr_elements[item_id];

    Ap[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];
  }

  // block-wise reduction of the partial sums of <Ap, Ap>, <p, Ap>, and <r0*, Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
  {
    inner_prod_buffer[    buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2 * buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x]   = shared_array_r0Ap[0];
  }
}
template<typename NumericT>
void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size         = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_hyb_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle5().cuda_handle()),
                                                      static_cast<unsigned int>(A.internal_size1()),
                                                      static_cast<unsigned int>(A.ell_nnz()),
                                                      detail::cuda_arg<NumericT>(p),
                                                      detail::cuda_arg<NumericT>(Ap),
                                                      detail::cuda_arg<NumericT>(r0star),
                                                      size,
                                                      detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                      chunk_size,
                                                      chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel");
}
template <typename T>
__global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
                                                    unsigned int vk_offset,
                                                    T const * residual,
                                                    T * R_buffer,
                                                    unsigned int R_offset,
                                                    T const * inner_prod_buffer,
                                                    unsigned int chunk_size,
                                                    T * r_dot_vk_buffer,
                                                    unsigned int chunk_offset,
                                                    unsigned int size)
{
  __shared__ T shared_array[128];
  T norm_vk = 0;

  // reduce the per-block partial sums of <v_k, v_k> from the second chunk of inner_prod_buffer:
  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }
  __syncthreads();

  norm_vk = sqrt(shared_array[0]);

  // normalize v_k and accumulate the partial sums of <r, v_k>:
  T inner_prod_contrib = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    T value_vk = vk[i + vk_offset] / norm_vk;

    inner_prod_contrib += residual[i] * value_vk;

    vk[i + vk_offset] = value_vk;
  }
  __syncthreads();

  // block-wise reduction of the partial sums of <r, v_k>
  shared_array[threadIdx.x] = inner_prod_contrib;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];

  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
    R_buffer[R_offset] = norm_vk;
}
template <typename T>
void pipelined_gmres_normalize_vk(vector_base<T> & v_k, vector_base<T> const & residual,
                                  vector_base<T> & R_buffer, vcl_size_t offset_in_R,
                                  vector_base<T> const & inner_prod_buffer, vector_base<T> & r_dot_vk_buffer,
                                  vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
{
  unsigned int vk_offset    = static_cast<unsigned int>(v_k.start());
  unsigned int R_offset     = offset_in_R;
  unsigned int chunk_size   = buffer_chunk_size;
  unsigned int chunk_offset = buffer_chunk_offset;
  unsigned int size         = static_cast<unsigned int>(v_k.size());

  pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(detail::cuda_arg<T>(v_k), vk_offset,
                                                    detail::cuda_arg<T>(residual),
                                                    detail::cuda_arg<T>(R_buffer), R_offset,
                                                    detail::cuda_arg<T>(inner_prod_buffer), chunk_size,
                                                    detail::cuda_arg<T>(r_dot_vk_buffer), chunk_offset,
                                                    size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel");
}
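
// pipelined_gmres_normalize_vk_kernel reduces the <v_k, v_k> partial sums from the second chunk of
// inner_prod_buffer, scales v_k (located at offset vk_offset inside the Krylov basis storage) to
// unit norm, stores that norm in R_buffer[R_offset], and emits per-block partial sums of <r, v_k>
// into r_dot_vk_buffer.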
template <typename T>
__global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis,
                                                           unsigned int size,
                                                           unsigned int internal_size,
                                                           unsigned int k,
                                                           T * vi_in_vk_buffer,
                                                           unsigned int chunk_size)
{
  __shared__ T shared_array[7*128];

  unsigned int k_base = 0;
  while (k_base < k)
  {
    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);

    T vi_in_vk[7];
    for (unsigned int j=0; j<vecs_in_iteration; ++j)
      vi_in_vk[j] = 0;

    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    {
      T value_vk = krylov_basis[i + k * internal_size];

      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size];
    }

    // block-wise reduction of the partial sums of <v_i, v_k>
    for (uint j=0; j<vecs_in_iteration; ++j)
      shared_array[threadIdx.x + j*chunk_size] = vi_in_vk[j];
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride) {
        for (uint j=0; j<vecs_in_iteration; ++j)
          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
      }
    }

    if (threadIdx.x == 0)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];
    __syncthreads();

    k_base += vecs_in_iteration;
  }
}
template <typename T>
void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis, vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size, vcl_size_t param_k,
                                         vector_base<T> & vi_in_vk_buffer, vcl_size_t buffer_chunk_size)
{
  unsigned int chunk_size = buffer_chunk_size;
  unsigned int size = v_k_size;
  unsigned int internal_size = v_k_internal_size;
  unsigned int k = param_k;

  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(detail::cuda_arg<T>(device_krylov_basis),
                                                           size, internal_size, k,
                                                           detail::cuda_arg<T>(vi_in_vk_buffer),
                                                           chunk_size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel");
}
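
// Gram-Schmidt stage 1: computes the projections <v_i, v_k> of the new basis vector v_k onto
// v_0 .. v_{k-1}, handling at most 7 basis vectors per sweep so that the partial sums fit into the
// 7*128-entry shared memory buffer; the per-block results go to vi_in_vk_buffer.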
template <typename T>
__global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
                                                           unsigned int size,
                                                           unsigned int internal_size,
                                                           unsigned int k,
                                                           T const * vi_in_vk_buffer,
                                                           unsigned int chunk_size,
                                                           T * R_buffer,
                                                           unsigned int krylov_dim,
                                                           T * inner_prod_buffer)
{
  __shared__ T shared_array[7*128];
  T vk_dot_vk = 0;

  unsigned int k_base = 0;
  while (k_base < k)
  {
    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);

    // finish the reduction of the per-block partial sums of <v_i, v_k>:
    for (uint j=0; j<vecs_in_iteration; ++j)
      shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride) {
        for (uint j=0; j<vecs_in_iteration; ++j)
          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
      }
    }
    __syncthreads();

    // v_k -= <v_i, v_k> * v_i and accumulate <v_k, v_k> in the last sweep:
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    {
      T value_vk = krylov_basis[i + k * internal_size];

      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
      vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;
      krylov_basis[i + k * internal_size] = value_vk;
    }

    // write the projection coefficients to column k of R:
    if (blockIdx.x == 0)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];
    __syncthreads();

    k_base += vecs_in_iteration;
  }

  // block-wise reduction of the partial sums of <v_k, v_k>
  shared_array[threadIdx.x] = vk_dot_vk;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    inner_prod_buffer[chunk_size + blockIdx.x] = shared_array[0];
}
template <typename T>
void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis, vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size, vcl_size_t param_k,
                                         vector_base<T> const & vi_in_vk_buffer,
                                         vector_base<T> & R_buffer, vcl_size_t krylov_dim,
                                         vector_base<T> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
{
  unsigned int chunk_size = buffer_chunk_size;
  unsigned int size = v_k_size;
  unsigned int internal_size = v_k_internal_size;
  unsigned int k = param_k;
  unsigned int krylov = krylov_dim;

  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(detail::cuda_arg<T>(device_krylov_basis),
                                                           size, internal_size, k,
                                                           detail::cuda_arg<T>(vi_in_vk_buffer),
                                                           chunk_size,
                                                           detail::cuda_arg<T>(R_buffer),
                                                           krylov,
                                                           detail::cuda_arg<T>(inner_prod_buffer));
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel");
}
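
// Gram-Schmidt stage 2: finishes the reduction of <v_i, v_k>, orthogonalizes v_k against
// v_0 .. v_{k-1} in place, writes the projection coefficients into column k of R_buffer, and
// produces the per-block partial sums of <v_k, v_k> needed for the subsequent normalization step.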
template <typename T>
__global__ void pipelined_gmres_update_result_kernel(T * result,
                                                     T const * residual,
                                                     T const * krylov_basis,
                                                     unsigned int size,
                                                     unsigned int internal_size,
                                                     T const * coefficients,
                                                     unsigned int k)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    T value_result = result[i] + coefficients[0] * residual[i];

    for (unsigned int j = 1; j < k; ++j)
      value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];

    result[i] = value_result;
  }
}
template <typename T>
void pipelined_gmres_update_result(vector_base<T> & result, vector_base<T> const & residual,
                                   vector_base<T> const & krylov_basis, vcl_size_t v_k_size,
                                   vcl_size_t v_k_internal_size, vector_base<T> const & coefficients,
                                   vcl_size_t param_k)
{
  unsigned int size = v_k_size;
  unsigned int internal_size = v_k_internal_size;
  unsigned int k = param_k;

  pipelined_gmres_update_result_kernel<<<128, 128>>>(detail::cuda_arg<T>(result),
                                                     detail::cuda_arg<T>(residual),
                                                     detail::cuda_arg<T>(krylov_basis),
                                                     size, internal_size,
                                                     detail::cuda_arg<T>(coefficients),
                                                     k);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel");
}
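
// pipelined_gmres_update_result_kernel accumulates
//   x += coefficients[0]*r + sum_{j=1..k-1} coefficients[j]*v_{j-1}
// i.e. the Krylov correction is applied in one fused pass over the basis vectors.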
template <typename T>
void pipelined_gmres_prod(compressed_matrix<T> const & A, vector_base<T> const & p, vector_base<T> & Ap, vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_csr_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle().cuda_handle()),
                                                detail::cuda_arg<T>(p),
                                                detail::cuda_arg<T>(Ap),
                                                size,
                                                detail::cuda_arg<T>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
}
template <typename T>
void pipelined_gmres_prod(coordinate_matrix<T> const & A, vector_base<T> const & p, vector_base<T> & Ap, vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  Ap.clear();

  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
                                               detail::cuda_arg<T>(A.handle().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                               detail::cuda_arg<T>(p),
                                               detail::cuda_arg<T>(Ap),
                                               size,
                                               detail::cuda_arg<T>(inner_prod_buffer),
                                               buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
}
template <typename T>
void pipelined_gmres_prod(ell_matrix<T> const & A, vector_base<T> const & p, vector_base<T> & Ap, vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_ell_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle().cuda_handle()),
                                                static_cast<unsigned int>(A.internal_size1()),
                                                static_cast<unsigned int>(A.maxnnz()),
                                                detail::cuda_arg<T>(p),
                                                detail::cuda_arg<T>(Ap),
                                                size,
                                                detail::cuda_arg<T>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
}
template <typename T>
void pipelined_gmres_prod(sliced_ell_matrix<T> const & A, vector_base<T> const & p, vector_base<T> & Ap, vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                       detail::cuda_arg<T>(A.handle().cuda_handle()),
                                                       detail::cuda_arg<T>(p),
                                                       detail::cuda_arg<T>(Ap),
                                                       size,
                                                       detail::cuda_arg<T>(inner_prod_buffer),
                                                       buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
}
template <typename T>
void pipelined_gmres_prod(hyb_matrix<T> const & A, vector_base<T> const & p, vector_base<T> & Ap, vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_hyb_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle5().cuda_handle()),
                                                static_cast<unsigned int>(A.internal_size1()),
                                                static_cast<unsigned int>(A.ell_nnz()),
                                                detail::cuda_arg<T>(p),
                                                detail::cuda_arg<T>(Ap),
                                                size,
                                                detail::cuda_arg<T>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
}
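
} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif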