ViennaCL - The Vienna Computing Library 1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
bisect_kernel_large.hpp
#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */


/* Determine eigenvalues for large symmetric, tridiagonal matrix. First
   step of the computation. */

// includes, project
#include "viennacl/linalg/detail/bisect/config.hpp"
#include "viennacl/linalg/detail/bisect/util.hpp"
// additional kernel
#include "viennacl/linalg/cuda/bisect_util.hpp"

// declaration, forward

namespace viennacl
{
namespace linalg
{
namespace cuda
{

/** @brief Write data to global memory */
template<typename NumericT>
__device__
void writeToGmem(const unsigned int tid, const unsigned int tid_2,
                 const unsigned int num_threads_active,
                 const unsigned int num_blocks_mult,
                 NumericT *g_left_one, NumericT *g_right_one,
                 unsigned int *g_pos_one,
                 NumericT *g_left_mult, NumericT *g_right_mult,
                 unsigned int *g_left_count_mult,
                 unsigned int *g_right_count_mult,
                 NumericT *s_left, NumericT *s_right,
                 unsigned short *s_left_count, unsigned short *s_right_count,
                 unsigned int *g_blocks_mult,
                 unsigned int *g_blocks_mult_sum,
                 unsigned short *s_compaction_list,
                 unsigned short *s_cl_helper,
                 unsigned int offset_mult_lambda
                 )
{
  if (tid < offset_mult_lambda)
  {
    g_left_one[tid] = s_left[tid];
    g_right_one[tid] = s_right[tid];
    // right count can be used to order eigenvalues without sorting
    g_pos_one[tid] = s_right_count[tid];
  }
  else
  {
    g_left_mult[tid - offset_mult_lambda] = s_left[tid];
    g_right_mult[tid - offset_mult_lambda] = s_right[tid];
    g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
    g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
  }

  if (tid_2 < num_threads_active)
  {
    if (tid_2 < offset_mult_lambda)
    {
      g_left_one[tid_2] = s_left[tid_2];
      g_right_one[tid_2] = s_right[tid_2];
      // right count can be used to order eigenvalues without sorting
      g_pos_one[tid_2] = s_right_count[tid_2];
    }
    else
    {
      g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
      g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
      g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
      g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
    }
  } // end writing out data

  __syncthreads();

  // note that s_cl_blocking = s_compaction_list + 1; thus, writing out
  // s_compaction_list yields the exclusive scan result
  if (tid <= num_blocks_mult)
  {
    g_blocks_mult[tid] = s_compaction_list[tid];
    g_blocks_mult_sum[tid] = s_cl_helper[tid];
  }

  if (tid_2 <= num_blocks_mult)
  {
    g_blocks_mult[tid_2] = s_compaction_list[tid_2];
    g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
  }
}
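
// ---------------------------------------------------------------------------
// Illustration (not part of ViennaCL): the offset-by-one trick referenced
// above. If per-thread flags are written starting at s_compaction_list + 1,
// e.g. [1, 0, 1, 1], an in-place inclusive scan turns them into [1, 1, 2, 3].
// Since s_compaction_list[0] is kept at 0, reading from s_compaction_list
// itself yields [0, 1, 1, 2] -- the exclusive scan, i.e. exactly the write
// offsets needed for compaction.
// ---------------------------------------------------------------------------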

/** @brief Perform final stream compaction before writing data to global memory */
template<typename NumericT>
__device__
void
compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
                    const unsigned int num_threads_active,
                    unsigned int &offset_mult_lambda,
                    NumericT *s_left, NumericT *s_right,
                    unsigned short *s_left_count, unsigned short *s_right_count,
                    unsigned short *s_cl_one, unsigned short *s_cl_mult,
                    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
                    unsigned int is_one_lambda, unsigned int is_one_lambda_2,
                    NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2,
                    unsigned int &left_count, unsigned int &right_count,
                    unsigned int &left_count_2, unsigned int &right_count_2,
                    unsigned int c_block_iend, unsigned int c_sum_block,
                    unsigned int c_block_iend_2, unsigned int c_sum_block_2
                    )
{
  // cache data before performing compaction
  left = s_left[tid];
  right = s_right[tid];

  if (tid_2 < num_threads_active)
  {
    left_2 = s_left[tid_2];
    right_2 = s_right[tid_2];
  }

  __syncthreads();

  // determine addresses for intervals containing multiple eigenvalues and
  // addresses for blocks of intervals
  unsigned int ptr_w = 0;
  unsigned int ptr_w_2 = 0;
  unsigned int ptr_blocking_w = 0;
  unsigned int ptr_blocking_w_2 = 0;

  ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
                               : s_cl_mult[tid] + offset_mult_lambda;

  if (0 != c_block_iend)
  {
    ptr_blocking_w = s_cl_blocking[tid];
  }

  if (tid_2 < num_threads_active)
  {
    ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
                                     : s_cl_mult[tid_2] + offset_mult_lambda;

    if (0 != c_block_iend_2)
    {
      ptr_blocking_w_2 = s_cl_blocking[tid_2];
    }
  }

  __syncthreads();

  // store compactly in shared mem
  s_left[ptr_w] = left;
  s_right[ptr_w] = right;
  s_left_count[ptr_w] = left_count;
  s_right_count[ptr_w] = right_count;

  __syncthreads();
  if (tid == 1)
  {
    s_left[ptr_w] = left;
    s_right[ptr_w] = right;
    s_left_count[ptr_w] = left_count;
    s_right_count[ptr_w] = right_count;
  }
  if (0 != c_block_iend)
  {
    s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
    s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
  }

  if (tid_2 < num_threads_active)
  {
    // store compactly in shared mem
    s_left[ptr_w_2] = left_2;
    s_right[ptr_w_2] = right_2;
    s_left_count[ptr_w_2] = left_count_2;
    s_right_count[ptr_w_2] = right_count_2;

    if (0 != c_block_iend_2)
    {
      s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
      s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
    }
  }
}

/** @brief Compute addresses to obtain compact list of block start addresses */
inline __device__
void
scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
                              const unsigned int num_threads_compaction,
                              unsigned short *s_cl_blocking,
                              unsigned short *s_cl_helper
                              )
{
  // prepare for second step of block generation: compaction of the block
  // list itself so that it can be written out efficiently
  s_cl_blocking[tid] = s_cl_helper[tid];

  if (tid_2 < num_threads_compaction)
  {
    s_cl_blocking[tid_2] = s_cl_helper[tid_2];
  }

  __syncthreads();

  // additional scan to compact s_cl_blocking; this permits generating a
  // compact list of eigenvalue blocks, each one containing about
  // MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be
  // processed by one thread block in a subsequent processing step)

  unsigned int offset = 1;

  // build scan tree
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1)-1;
      unsigned int bi = offset*(2*tid+2)-1;
      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
    }

    offset <<= 1;
  }

  // traverse down tree: first down to level 2 across
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);
      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
    }
  }
}

/** @brief Perform scan to obtain number of eigenvalues before a specific block */
inline __device__
void
scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
              const unsigned int num_threads_active,
              const unsigned int num_threads_compaction,
              unsigned short *s_cl_blocking,
              unsigned short *s_cl_helper)
{
  unsigned int offset = 1;

  // first step of scan to build the sum of elements within each block:
  // build up tree
  for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1)-1;
      unsigned int bi = offset*(2*tid+2)-1;

      s_cl_blocking[bi] += s_cl_blocking[ai];
    }

    offset *= 2;
  }

  // second step of scan to build the sum of elements within each block:
  // traverse down tree
  for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_cl_blocking[bi] += s_cl_blocking[ai];
    }
  }

  __syncthreads();

  if (0 == tid)
  {
    // move last element of scan to the last valid element; this is necessary
    // because the number of threads employed for the scan is a power of two
    // and not necessarily the number of active threads
    s_cl_helper[num_threads_active - 1] =
      s_cl_helper[num_threads_compaction - 1];
    s_cl_blocking[num_threads_active - 1] =
      s_cl_blocking[num_threads_compaction - 1];
  }
}
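
// ---------------------------------------------------------------------------
// Illustration (not part of ViennaCL): the up-sweep/down-sweep loop pair used
// in scanSumBlocks() and scanCompactBlocksStartAddress() above is a
// work-efficient tree-based (Blelloch-style) scan. A minimal sequential
// analogue, assuming a power-of-two length n, looks as follows:
//
//   void scan_tree(unsigned short *a, unsigned int n)
//   {
//     unsigned int offset = 1;
//     for (unsigned int d = n >> 1; d > 0; d >>= 1)   // up-sweep: partial sums
//     {
//       for (unsigned int i = 0; i < d; ++i)
//         a[offset*(2*i+2)-1] += a[offset*(2*i+1)-1];
//       offset <<= 1;
//     }
//     for (unsigned int d = 2; d < n; d <<= 1)        // down-sweep: fill in
//     {
//       offset >>= 1;
//       for (unsigned int i = 0; i + 1 < d; ++i)
//         a[offset*(i+1)-1 + (offset >> 1)] += a[offset*(i+1)-1];
//     }
//   }
//
// After the call, a[i] holds the inclusive prefix sum of the original array;
// e.g. [1,1,1,1,1,1,1,1] becomes [1,2,3,4,5,6,7,8].
// ---------------------------------------------------------------------------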

/** @brief Perform initial scan for compaction of intervals containing one and
 *         multiple eigenvalues; also build the tree for blocking of intervals
 *         containing multiple eigenvalues */
inline __device__
void
scanInitial(const unsigned int tid, const unsigned int tid_2,
            const unsigned int num_threads_active,
            const unsigned int num_threads_compaction,
            unsigned short *s_cl_one, unsigned short *s_cl_mult,
            unsigned short *s_cl_blocking, unsigned short *s_cl_helper
            )
{
  // perform scan to compactly write out the intervals containing one and
  // multiple eigenvalues
  // also generate tree for blocking of intervals containing multiple
  // eigenvalues

  unsigned int offset = 1;

  // build scan tree
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1);
      unsigned int bi = offset*(2*tid+2)-1;

      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];

      // s_cl_helper is binary: zero for an internal node and 1 for a
      // root node of a tree corresponding to a block
      // s_cl_blocking contains the number of nodes in each sub-tree at each
      // iteration; the data has to be kept to compute the total number of
      // eigenvalues per block which, in turn, is needed to efficiently
      // write out data in the second step
      if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
      {
        // check how many children are not yet terminated
        if (s_cl_helper[ai - 1] == 1)
        {
          // mark as terminated
          s_cl_helper[bi] = 1;
        }
        else if (s_cl_helper[bi] == 1)
        {
          // mark as terminated
          s_cl_helper[ai - 1] = 1;
        }
        else // both children are non-terminated
        {
          unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];

          if (temp > VIENNACL_BISECT_MAX_THREADS_BLOCK)
          {
            // the two child trees have to form separate blocks, terminate trees
            s_cl_helper[ai - 1] = 1;
            s_cl_helper[bi] = 1;
          }
          else
          {
            // build up tree by joining subtrees
            s_cl_blocking[bi] = temp;
            s_cl_blocking[ai - 1] = 0;
          }
        }
      } // end s_cl_helper update
    }

    offset <<= 1;
  }

  // traverse down tree; this is only for stream compaction, not for block
  // construction
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
    }
  }
}

/** @brief Store all non-empty child intervals of a subdivided interval in
 *         shared memory */
template<typename NumericT>
__device__
void
storeNonEmptyIntervalsLarge(unsigned int addr,
                            const unsigned int num_threads_active,
                            NumericT *s_left, NumericT *s_right,
                            unsigned short *s_left_count,
                            unsigned short *s_right_count,
                            NumericT left, NumericT mid, NumericT right,
                            const unsigned short left_count,
                            const unsigned short mid_count,
                            const unsigned short right_count,
                            NumericT epsilon,
                            unsigned int &compact_second_chunk,
                            unsigned short *s_compaction_list,
                            unsigned int &is_active_second)
{
  // check if both child intervals are valid
  if ((left_count != mid_count) && (mid_count != right_count))
  {
    storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                  left, mid, left_count, mid_count, epsilon);

    is_active_second = 1;
    s_compaction_list[threadIdx.x] = 1;
    compact_second_chunk = 1;
  }
  else
  {
    // only one non-empty child interval

    // mark that there is no second child
    is_active_second = 0;
    s_compaction_list[threadIdx.x] = 0;

    // store the one valid child interval
    if (left_count != mid_count)
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    left, mid, left_count, mid_count, epsilon);
    }
    else
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    mid, right, mid_count, right_count, epsilon);
    }
  }
}
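
// ---------------------------------------------------------------------------
// Worked example (not part of ViennaCL): suppose an interval (left, right)
// has left_count = 2 and right_count = 5, i.e. it contains 5 - 2 = 3
// eigenvalues, and bisection at mid yields mid_count = 3. Then the left child
// (left, mid) contains 3 - 2 = 1 eigenvalue and the right child (mid, right)
// contains 5 - 3 = 2 eigenvalues; both children are non-empty, so the left
// child is stored compactly in the first chunk and the right child is flagged
// for compaction into the second chunk. Had mid_count been 2 (or 5), only the
// right (or left) child would be stored and no compaction would be needed.
// ---------------------------------------------------------------------------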

/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal
 *         matrix; g_d and g_s hold the diagonal and superdiagonal elements
 *         in global memory */
template<typename NumericT>
__global__
void
bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                  const NumericT lg, const NumericT ug,
                  const unsigned int lg_eig_count,
                  const unsigned int ug_eig_count,
                  NumericT epsilon,
                  unsigned int *g_num_one,
                  unsigned int *g_num_blocks_mult,
                  NumericT *g_left_one, NumericT *g_right_one,
                  unsigned int *g_pos_one,
                  NumericT *g_left_mult, NumericT *g_right_mult,
                  unsigned int *g_left_count_mult,
                  unsigned int *g_right_count_mult,
                  unsigned int *g_blocks_mult,
                  unsigned int *g_blocks_mult_sum
                  )
{
  const unsigned int tid = threadIdx.x;

  // intervals (store left and right because the subdivision tree is in
  // general not dense)
  __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // number of eigenvalues that are smaller than s_left / s_right
  // (correspondence is realized via indices)
  __shared__ unsigned short s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  __shared__ unsigned short s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // helper for stream compaction
  __shared__ unsigned short s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // state variables for whole block
  // if 0 then compaction of second chunk of child intervals is not necessary
  // (because all intervals had exactly one non-dead child)
  __shared__ unsigned int compact_second_chunk;
  // if 1 then all threads are converged
  __shared__ unsigned int all_threads_converged;

  // number of currently active threads
  __shared__ unsigned int num_threads_active;

  // number of threads to use for stream compaction
  __shared__ unsigned int num_threads_compaction;

  // helper for exclusive scan
  unsigned short *s_compaction_list_exc = s_compaction_list + 1;


  // variables for currently processed interval
  // left and right limit of active interval
  NumericT left = 0.0f;
  NumericT right = 0.0f;
  unsigned int left_count = 0;
  unsigned int right_count = 0;
  // midpoint of active interval
  NumericT mid = 0.0f;
  // number of eigenvalues smaller than mid
  unsigned int mid_count = 0;
  // helper for stream compaction (tracking of threads generating second child)
  unsigned int is_active_second = 0;

  // initialize lists
  s_compaction_list[tid] = 0;
  s_left[tid] = 0;
  s_right[tid] = 0;
  s_left_count[tid] = 0;
  s_right_count[tid] = 0;

  __syncthreads();

  // set up initial configuration
  if (0 == tid)
  {
    s_left[0] = lg;
    s_right[0] = ug;
    s_left_count[0] = lg_eig_count;
    s_right_count[0] = ug_eig_count;

    compact_second_chunk = 0;
    num_threads_active = 1;

    num_threads_compaction = 1;

    all_threads_converged = 1;
  }

  __syncthreads();

  // for all active threads read intervals from the last level;
  // the (worst case) number of active threads per level l is 2^l.
  // Determine coarse intervals: on these intervals the kernel for one or for
  // multiple eigenvalues will be executed in the second step.
  for (unsigned int i = 0; i < 15; ++i)
  {
    s_compaction_list[tid] = 0;
    s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
    s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
    subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
                            num_threads_active,
                            left, right, left_count, right_count,
                            mid, all_threads_converged);

    __syncthreads();

    // check if done
    if (1 == all_threads_converged)
    {
      break;
    }

    // compute number of eigenvalues smaller than mid
    // use all threads for reading the necessary matrix data from global
    // memory
    // use s_left and s_right as scratch space for diagonal and
    // superdiagonal of matrix
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                mid, threadIdx.x,
                                                num_threads_active,
                                                s_left, s_right,
                                                (left == right));

    __syncthreads();

    // store intervals
    // for all threads store the first child interval in a continuous chunk of
    // memory, and the second child interval -- if it exists -- in a second
    // chunk; it is likely that all threads reach convergence up to
    // \a epsilon at the same level; furthermore, for higher levels most / all
    // threads will have only one child, so storing the first child compactly
    // (first) avoids a compaction step on the first chunk and (second) makes
    // a compaction of the second chunk unnecessary for higher levels (when
    // all threads / intervals have exactly one child)
    if (tid < num_threads_active)
    {
      if (left != right)
      {
        // store intervals
        storeNonEmptyIntervalsLarge(tid, num_threads_active,
                                    s_left, s_right,
                                    s_left_count, s_right_count,
                                    left, mid, right,
                                    left_count, mid_count, right_count,
                                    epsilon, compact_second_chunk,
                                    s_compaction_list_exc,
                                    is_active_second);
      }
      else
      {
        // re-write converged interval (has to be stored again because s_left
        // and s_right are used as scratch space for
        // computeNumSmallerEigenvalsLarge())
        s_left[tid] = left;
        s_right[tid] = left;
        s_left_count[tid] = left_count;
        s_right_count[tid] = right_count;

        is_active_second = 0;
      }
    }

    // necessary so that compact_second_chunk is up-to-date
    __syncthreads();

    // perform compaction of chunk where second children are stored
    // scan of (num_threads_active / 2) elements, thus at most
    // (num_threads_active / 4) threads are needed
    if (compact_second_chunk > 0)
    {
      // create indices for compaction
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
    }
    __syncthreads();

    if (compact_second_chunk > 0)
    {
      compactIntervals(s_left, s_right, s_left_count, s_right_count,
                       mid, right, mid_count, right_count,
                       s_compaction_list, num_threads_active,
                       is_active_second);
    }

    __syncthreads();

    // update state variables
    if (0 == tid)
    {
      // update number of active threads with result of reduction
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }

    __syncthreads();

    if (num_threads_compaction > blockDim.x)
    {
      break;
    }
  }

  __syncthreads();

  // generate two lists of intervals; one with intervals that contain one
  // eigenvalue (or are converged), and one with intervals that need further
  // subdivision

  // perform two scans in parallel

  unsigned int left_count_2;
  unsigned int right_count_2;

  unsigned int tid_2 = tid + blockDim.x;

  // cache in per-thread registers so that s_left_count and s_right_count
  // can be used for scans
  left_count = s_left_count[tid];
  right_count = s_right_count[tid];

  // some threads have to cache data for two intervals
  if (tid_2 < num_threads_active)
  {
    left_count_2 = s_left_count[tid_2];
    right_count_2 = s_right_count[tid_2];
  }

  // compaction list for intervals containing one and multiple eigenvalues;
  // do not affect first element for exclusive scan
  unsigned short *s_cl_one = s_left_count + 1;
  unsigned short *s_cl_mult = s_right_count + 1;

  // compaction list for generating blocks of intervals containing multiple
  // eigenvalues
  unsigned short *s_cl_blocking = s_compaction_list_exc;
  // helper compaction list for generating blocks of intervals
  __shared__ unsigned short s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  if (0 == tid)
  {
    // set to 0 for exclusive scan
    s_left_count[0] = 0;
    s_right_count[0] = 0;
  }

  __syncthreads();

  // flag if interval contains one or multiple eigenvalues
  unsigned int is_one_lambda = 0;
  unsigned int is_one_lambda_2 = 0;

  // number of eigenvalues in the interval
  unsigned int multiplicity = right_count - left_count;
  is_one_lambda = (1 == multiplicity);

  s_cl_one[tid] = is_one_lambda;
  s_cl_mult[tid] = (! is_one_lambda);

  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
  s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
  s_cl_helper[tid] = 0;

  if (tid_2 < num_threads_active)
  {
    unsigned int multiplicity = right_count_2 - left_count_2;
    is_one_lambda_2 = (1 == multiplicity);

    s_cl_one[tid_2] = is_one_lambda_2;
    s_cl_mult[tid_2] = (! is_one_lambda_2);

    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
    s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
    s_cl_helper[tid_2] = 0;
  }
  else if (tid_2 < (2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1))
  {
    // clear
    s_cl_blocking[tid_2] = 0;
    s_cl_helper[tid_2] = 0;
  }


  scanInitial(tid, tid_2, num_threads_active, num_threads_compaction,
              s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);

  __syncthreads();

  scanSumBlocks(tid, tid_2, num_threads_active,
                num_threads_compaction, s_cl_blocking, s_cl_helper);

  // end down sweep of scan
  __syncthreads();

  unsigned int c_block_iend = 0;
  unsigned int c_block_iend_2 = 0;
  unsigned int c_sum_block = 0;
  unsigned int c_sum_block_2 = 0;

  // for each thread / interval that corresponds to the root node of an
  // interval block, store the start address of the block and the total
  // number of eigenvalues in all blocks before this block (which particular
  // thread handles this is irrelevant; the constraint is to have a subset of
  // threads such that exactly one of them is in each interval)
  if (1 == s_cl_helper[tid])
  {
    c_block_iend = s_cl_mult[tid] + 1;
    c_sum_block = s_cl_blocking[tid];
  }

  if (1 == s_cl_helper[tid_2])
  {
    c_block_iend_2 = s_cl_mult[tid_2] + 1;
    c_sum_block_2 = s_cl_blocking[tid_2];
  }

  scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
                                s_cl_blocking, s_cl_helper);


  // finished second scan for s_cl_blocking
  __syncthreads();

  // determine the global results
  __shared__ unsigned int num_blocks_mult;
  __shared__ unsigned int num_mult;
  __shared__ unsigned int offset_mult_lambda;

  if (0 == tid)
  {
    num_blocks_mult = s_cl_blocking[num_threads_active - 1];
    offset_mult_lambda = s_cl_one[num_threads_active - 1];
    num_mult = s_cl_mult[num_threads_active - 1];

    *g_num_one = offset_mult_lambda;
    *g_num_blocks_mult = num_blocks_mult;
  }

  __syncthreads();

  NumericT left_2, right_2;
  --s_cl_one;
  --s_cl_mult;
  --s_cl_blocking;

  __syncthreads();
  compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
                      s_left, s_right, s_left_count, s_right_count,
                      s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
                      is_one_lambda, is_one_lambda_2,
                      left, right, left_2, right_2,
                      left_count, right_count, left_count_2, right_count_2,
                      c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2
                      );

  __syncthreads();

  // final adjustment before writing out data to global memory
  if (0 == tid)
  {
    s_cl_blocking[num_blocks_mult] = num_mult;
    s_cl_helper[0] = 0;
  }

  __syncthreads();

  // write to global memory
  writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
              g_left_one, g_right_one, g_pos_one,
              g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
              s_left, s_right, s_left_count, s_right_count,
              g_blocks_mult, g_blocks_mult_sum,
              s_compaction_list, s_cl_helper, offset_mult_lambda);
}
} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif // VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
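
For orientation, the following host-side sketch shows how a kernel with the signature above might be launched. It is a minimal illustration under stated assumptions, not ViennaCL's actual driver code: the test matrix, the single-block launch configuration, the generous `n + 1` buffer sizes, and the Gerschgorin-type bounds `lg`/`ug` (with `lg_eig_count = 0` and `ug_eig_count = n`, since the bounds enclose the whole spectrum) are all assumptions for the sketch; ViennaCL's eigenvalue routines drive this kernel internally.

// bisect_large_demo.cu -- hypothetical usage sketch (illustration only)
#include <cstdio>
#include <cmath>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "viennacl/linalg/cuda/bisect_kernel_large.hpp"

int main()
{
  const unsigned int n = 1024;

  // example tridiagonal matrix: 2 on the diagonal, -1 on the superdiagonal
  std::vector<float> diag(n, 2.0f), super(n, -1.0f);
  super[0] = 0.0f;   // assumption: leading superdiagonal entry unused/zero

  // Gerschgorin-type enclosure of the spectrum: every eigenvalue lies in
  // [lg, ug], so 0 eigenvalues are below lg and all n are below ug
  float lg = diag[0], ug = diag[0];
  for (unsigned int i = 0; i < n; ++i)
  {
    float r = std::fabs(super[i]) + ((i + 1 < n) ? std::fabs(super[i + 1]) : 0.0f);
    lg = std::min(lg, diag[i] - r);
    ug = std::max(ug, diag[i] + r);
  }

  float *d_diag, *d_super;
  cudaMalloc(&d_diag,  n * sizeof(float));
  cudaMalloc(&d_super, n * sizeof(float));
  cudaMemcpy(d_diag,  diag.data(),  n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_super, super.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  // output buffers; n + 1 entries each is a generous upper bound (assumption)
  unsigned int *d_num_one, *d_num_blocks_mult, *d_pos_one;
  unsigned int *d_left_count_mult, *d_right_count_mult;
  unsigned int *d_blocks_mult, *d_blocks_mult_sum;
  float *d_left_one, *d_right_one, *d_left_mult, *d_right_mult;
  cudaMalloc(&d_num_one,          sizeof(unsigned int));
  cudaMalloc(&d_num_blocks_mult,  sizeof(unsigned int));
  cudaMalloc(&d_pos_one,          (n + 1) * sizeof(unsigned int));
  cudaMalloc(&d_left_count_mult,  (n + 1) * sizeof(unsigned int));
  cudaMalloc(&d_right_count_mult, (n + 1) * sizeof(unsigned int));
  cudaMalloc(&d_blocks_mult,      (n + 1) * sizeof(unsigned int));
  cudaMalloc(&d_blocks_mult_sum,  (n + 1) * sizeof(unsigned int));
  cudaMalloc(&d_left_one,         (n + 1) * sizeof(float));
  cudaMalloc(&d_right_one,        (n + 1) * sizeof(float));
  cudaMalloc(&d_left_mult,        (n + 1) * sizeof(float));
  cudaMalloc(&d_right_mult,       (n + 1) * sizeof(float));

  // first step: one thread block classifies the spectrum into intervals
  // holding a single eigenvalue and blocks of intervals holding several
  viennacl::linalg::cuda::bisectKernelLarge<<<1, VIENNACL_BISECT_MAX_THREADS_BLOCK>>>(
      d_diag, d_super, n, lg, ug, 0u, n, 1e-6f,
      d_num_one, d_num_blocks_mult,
      d_left_one, d_right_one, d_pos_one,
      d_left_mult, d_right_mult, d_left_count_mult, d_right_count_mult,
      d_blocks_mult, d_blocks_mult_sum);
  cudaDeviceSynchronize();

  unsigned int num_one = 0, num_blocks_mult = 0;
  cudaMemcpy(&num_one, d_num_one, sizeof(unsigned int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&num_blocks_mult, d_num_blocks_mult, sizeof(unsigned int), cudaMemcpyDeviceToHost);
  std::printf("intervals with one eigenvalue: %u, blocks with multiple: %u\n",
              num_one, num_blocks_mult);
  return 0;   // device buffer cleanup omitted in this short sketch
}

The intervals and blocks written out here are then refined in the second step of the computation, as noted in the file-level comment at the top of this header.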