ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
bisect.hpp
Go to the documentation of this file.
1 /* =========================================================================
2  Copyright (c) 2010-2014, Institute for Microelectronics,
3  Institute for Analysis and Scientific Computing,
4  TU Wien.
5  Portions of this software are copyright by UChicago Argonne, LLC.
6 
7  -----------------
8  ViennaCL - The Vienna Computing Library
9  -----------------
10 
11  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
12 
13  (A list of authors and contributors can be found in the PDF manual)
14 
15  License: MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17 
18 
28 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
29 #define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
30 
31 
32 // declaration, forward
33 
34 namespace viennacl
35 {
36 namespace linalg
37 {
38 namespace opencl
39 {
40 namespace kernels
41 {
/** @brief Appends the preprocessor configuration shared by the bisection kernels.
 *
 * The emitted text defines MAX_THREADS_BLOCK, MAX_SMALL_MATRIX,
 * MAX_THREADS_BLOCK_SMALL_MATRIX and MIN_ABS_INTERVAL.
 *
 * @param source  String-like object providing append(); the OpenCL text is appended to it.
 */
template <typename StringType>
void generate_bisect_kernel_config(StringType & source)
{
  // All four macros are emitted in one go; adjacent literals are joined by the compiler.
  source.append(" #define MAX_THREADS_BLOCK 128\n"
                " #define MAX_SMALL_MATRIX 512\n"
                " #define MAX_THREADS_BLOCK_SMALL_MATRIX 512\n"
                " #define MIN_ABS_INTERVAL 5.0e-37\n");
}
52 
/** @brief Appends the OpenCL helper function floorPow2(int n) to the kernel source.
 *
 * The generated function returns n unchanged if (n & (n-1)) is zero. Otherwise it calls
 * frexp() on n cast to the given numeric type and returns 1 << (exp - 1).
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for the cast inside frexp().
 */
template <typename StringType>
void generate_bisect_kernel_floorPow2(StringType & source, std::string const & numeric_string)
{
  // Function head, work-item ids, early exit for powers of two, start of the frexp call
  source.append(" \n"
                " inline int \n"
                " floorPow2(int n) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " if (0 == (n & (n-1))) \n"
                " { \n"
                " return n; \n"
                " } \n"
                " int exp; \n"
                " frexp(( ");
  source.append(numeric_string);
  // Remainder of the frexp call and the result
  source.append(" )n, &exp); \n"
                " return (1 << (exp - 1)); \n"
                " } \n");
}
82 
83 
/** @brief Appends the OpenCL helper function ceilPow2(int n) to the kernel source.
 *
 * The generated function returns n unchanged if (n & (n-1)) is zero. Otherwise it calls
 * frexp() on n cast to the given numeric type and returns 1 << exp.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for the cast inside frexp().
 */
template <typename StringType>
void generate_bisect_kernel_ceilPow2(StringType & source, std::string const & numeric_string)
{
  // Function head, work-item ids, early exit for powers of two, start of the frexp call
  source.append(" \n"
                " inline int \n"
                " ceilPow2(int n) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " if (0 == (n & (n-1))) \n"
                " { \n"
                " return n; \n"
                " } \n"
                " int exp; \n"
                " frexp(( ");
  source.append(numeric_string);
  // Remainder of the frexp call and the result
  source.append(" )n, &exp); \n"
                " return (1 << exp); \n"
                " } \n");
}
113 
114 
/** @brief Appends the OpenCL helper function computeMidpoint(left, right) to the kernel source.
 *
 * The generated function returns left + (right - left) * 0.5f when sign(left) == sign(right),
 * and (left + right) * 0.5f otherwise.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for arguments, local and return value.
 */
template <typename StringType>
void generate_bisect_kernel_computeMidpoint(StringType & source, std::string const & numeric_string)
{
  // Return type
  source.append(" \n"
                " inline ");
  source.append(numeric_string);

  // Parameter list
  source.append(" \n"
                " computeMidpoint(const ");
  source.append(numeric_string);
  source.append(" left,\n"
                " const ");
  source.append(numeric_string);

  // Work-item ids and declaration of the result variable
  source.append(" right) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " ");
  source.append(numeric_string);

  // Two midpoint formulas, selected by comparing the signs of both bounds
  source.append(" mid; \n"
                " if (sign(left) == sign(right)) \n"
                " { \n"
                " mid = left + (right - left) * 0.5f; \n"
                " } \n"
                " else \n"
                " { \n"
                " mid = (left + right) * 0.5f; \n"
                " } \n"
                " return mid; \n"
                " } \n");
}
148 
149 
/** @brief Appends the OpenCL helper function storeInterval(...) to the kernel source.
 *
 * The generated function writes left_count/right_count to s_left_count[addr]/s_right_count[addr].
 * If fabs(right - left) <= max(MIN_ABS_INTERVAL, max(fabs(left), fabs(right)) * precision), both
 * s_left[addr] and s_right[addr] are set to computeMidpoint(left, right); otherwise the limits
 * left and right are stored unchanged. The emitted code relies on computeMidpoint and
 * MIN_ABS_INTERVAL being available in the same OpenCL program.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for interval limits and precision.
 */
template<typename StringType>
void generate_bisect_kernel_storeInterval(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " void \n"
                " storeInterval(unsigned int addr, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" * s_left, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" * s_right, \n"
                " __local unsigned int * s_left_count, \n"
                " __local unsigned int * s_right_count, \n"
                " ");
  source.append(numeric_string);
  source.append(" left, \n"
                " ");
  source.append(numeric_string);
  source.append(" right, \n"
                " unsigned int left_count, \n"
                " unsigned int right_count, \n"
                " ");
  source.append(numeric_string);

  // ---- work-item ids, eigenvalue counts ----
  source.append(" precision) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " s_left_count[addr] = left_count; \n"
                " s_right_count[addr] = right_count; \n"
                " ");

  // ---- convergence test ----
  source.append(numeric_string);
  source.append(" t0 = fabs(right - left); \n"
                " ");
  source.append(numeric_string);
  source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n"
                " if (t0 <= max(( ");
  source.append(numeric_string);
  source.append(" )MIN_ABS_INTERVAL, t1)) \n"
                " { \n"
                " ");

  // ---- converged: collapse interval onto its midpoint; otherwise keep the limits ----
  source.append(numeric_string);
  source.append(" lambda = computeMidpoint(left, right); \n"
                " s_left[addr] = lambda; \n"
                " s_right[addr] = lambda; \n"
                " } \n"
                " else \n"
                " { \n"
                " s_left[addr] = left; \n"
                " s_right[addr] = right; \n"
                " } \n"
                " } \n");
}
212 
/** @brief Appends the OpenCL helper function storeIntervalShort(...) to the kernel source.
 *
 * Same generated logic as storeInterval, but the count arrays s_left_count and s_right_count
 * are declared as __local unsigned short pointers. The emitted code relies on computeMidpoint
 * and MIN_ABS_INTERVAL being available in the same OpenCL program.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for interval limits and precision.
 */
template<typename StringType>
void generate_bisect_kernel_storeIntervalShort(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " void \n"
                " storeIntervalShort(unsigned int addr, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" * s_left, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" * s_right, \n"
                " __local unsigned short * s_left_count, \n"
                " __local unsigned short * s_right_count, \n"
                " ");
  source.append(numeric_string);
  source.append(" left, \n"
                " ");
  source.append(numeric_string);
  source.append(" right, \n"
                " unsigned int left_count, \n"
                " unsigned int right_count, \n"
                " ");
  source.append(numeric_string);

  // ---- work-item ids, eigenvalue counts ----
  source.append(" precision) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " s_left_count[addr] = left_count; \n"
                " s_right_count[addr] = right_count; \n"
                " ");

  // ---- convergence test ----
  source.append(numeric_string);
  source.append(" t0 = fabs(right - left); \n"
                " ");
  source.append(numeric_string);
  source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n"
                " if (t0 <= max(( ");
  source.append(numeric_string);
  source.append(" )MIN_ABS_INTERVAL, t1)) \n"
                " { \n"
                " ");

  // ---- converged: collapse interval onto its midpoint; otherwise keep the limits ----
  source.append(numeric_string);
  source.append(" lambda = computeMidpoint(left, right); \n"
                " s_left[addr] = lambda; \n"
                " s_right[addr] = lambda; \n"
                " } \n"
                " else \n"
                " { \n"
                " s_left[addr] = left; \n"
                " s_right[addr] = right; \n"
                " } \n"
                " } \n");
}
263 
264 
/** @brief Appends the OpenCL helper function computeNumSmallerEigenvals(...) to the kernel source.
 *
 * Generated logic: between two local-memory barriers, every work-item with lcl_id < n copies
 * g_d[lcl_id] to s_d[lcl_id] and g_s[lcl_id - 1] to s_s[lcl_id]. Work-items with
 * tid < num_intervals_active and converged == 0 then run the recurrence
 * delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta for k = 0 .. n-1 (delta starts at 1.0f)
 * and count how often delta is negative. The count is returned (0 for inactive work-items).
 *
 * NOTE(review): the read g_s[lcl_id - 1] touches the element in front of g_s for lcl_id == 0;
 * presumably callers pass a suitably offset pointer - confirm against the call sites.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for matrix entries, x and delta.
 */
template <typename StringType>
void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " inline unsigned int \n"
                " computeNumSmallerEigenvals(__global ");
  source.append(numeric_string);
  source.append(" *g_d, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" *g_s, \n"
                " const unsigned int n, \n"
                " const ");
  source.append(numeric_string);
  source.append(" x, \n"
                " const unsigned int tid, \n"
                " const unsigned int num_intervals_active, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_d, \n"
                " __local ");
  source.append(numeric_string);

  // ---- work-item ids ----
  source.append(" *s_s, \n"
                " unsigned int converged \n"
                " ) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " ");
  source.append(numeric_string);

  // ---- load into local memory, then count negative pivots on active work-items ----
  source.append(" delta = 1.0f; \n"
                " unsigned int count = 0; \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " if (lcl_id < n) \n"
                " { \n"
                " s_d[lcl_id] = *(g_d + lcl_id); \n"
                " s_s[lcl_id] = *(g_s + lcl_id - 1); \n"
                " } \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " if ((tid < num_intervals_active) && (0 == converged)) \n"
                " { \n"
                " for (unsigned int k = 0; k < n; ++k) \n"
                " { \n"
                " delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n"
                " count += (delta < 0) ? 1 : 0; \n"
                " } \n"
                " } \n"
                " return count; \n"
                " } \n");
}
332 
333 
335 
/** @brief Appends the OpenCL helper function computeNumSmallerEigenvalsLarge(...) to the kernel source.
 *
 * Generated logic: the entries are processed in chunks of lcl_sz. For each chunk start i, the
 * work-items with i + lcl_id < n copy g_d[i + lcl_id] and g_s[i + lcl_id - 1] into s_d/s_s
 * (guarded by local-memory barriers). Work-items with tid < num_intervals_active then continue
 * the recurrence delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta for min(rem, lcl_sz) entries,
 * counting negative delta. rem starts at n and is reduced by lcl_sz per chunk. The count is
 * returned. The parameter 'converged' is part of the generated signature but is not read.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for matrix entries, x and delta.
 */
template <typename StringType>
void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " inline unsigned int \n"
                " computeNumSmallerEigenvalsLarge(__global ");
  source.append(numeric_string);
  source.append(" *g_d, \n"
                " __global ");
  source.append(numeric_string);
  source.append(" *g_s, \n"
                " const unsigned int n, \n"
                " const ");
  source.append(numeric_string);
  source.append(" x, \n"
                " const unsigned int tid, \n"
                " const unsigned int num_intervals_active, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_d, \n"
                " __local ");
  source.append(numeric_string);

  // ---- work-item ids ----
  source.append(" *s_s, \n"
                " unsigned int converged \n"
                " ) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " ");
  source.append(numeric_string);

  // ---- chunked load into local memory followed by the pivot-counting recurrence ----
  source.append(" delta = 1.0f; \n"
                " unsigned int count = 0; \n"
                " unsigned int rem = n; \n"
                " for (unsigned int i = 0; i < n; i += lcl_sz) \n"
                " { \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " if ((i + lcl_id) < n) \n"
                " { \n"
                " s_d[lcl_id] = *(g_d + i + lcl_id); \n"
                " s_s[lcl_id] = *(g_s + i + lcl_id - 1); \n"
                " } \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " if (tid < num_intervals_active) \n"
                " { \n"
                " for (unsigned int k = 0; k < min(rem,lcl_sz); ++k) \n"
                " { \n"
                " delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n"
                " count += (delta < 0) ? 1 : 0; \n"
                " } \n"
                " } \n"
                " rem -= lcl_sz; \n"
                " } \n"
                " return count; \n"
                " } \n");
}
413 
415 
/** @brief Appends the OpenCL helper function storeNonEmptyIntervals(...) to the kernel source.
 *
 * Generated logic: if left_count != mid_count and mid_count != right_count, the interval
 * (left, mid) is stored via storeInterval() and *is_active_second, s_compaction_list_exc[lcl_id]
 * and *compact_second_chunk are set to 1. Otherwise *is_active_second and
 * s_compaction_list_exc[lcl_id] are set to 0 and exactly one interval is stored:
 * (left, mid) if left_count != mid_count, else (mid, right).
 * The emitted code relies on storeInterval being available in the same OpenCL program.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for interval limits and precision.
 */
template<typename StringType>
void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " void \n"
                " storeNonEmptyIntervals(unsigned int addr, \n"
                " const unsigned int num_threads_active, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_left, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_right, \n"
                " __local unsigned int *s_left_count, \n"
                " __local unsigned int *s_right_count, \n"
                " ");
  source.append(numeric_string);
  // The blank after the line break is part of the emitted text and is kept as is.
  source.append(" left, \n "
                " ");
  source.append(numeric_string);
  source.append(" mid, \n"
                " ");
  source.append(numeric_string);
  source.append(" right,\n"
                " const unsigned int left_count, \n"
                " const unsigned int mid_count, \n"
                " const unsigned int right_count, \n"
                " ");
  source.append(numeric_string);

  // ---- body: store one child interval here, flag a possible second one for compaction ----
  source.append(" precision, \n"
                " __local unsigned int *compact_second_chunk, \n"
                " __local unsigned int *s_compaction_list_exc, \n"
                " unsigned int *is_active_second) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " \n"
                " if ((left_count != mid_count) && (mid_count != right_count)) \n"
                " { \n"
                " storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " left, mid, left_count, mid_count, precision); \n"
                " *is_active_second = 1; \n"
                " s_compaction_list_exc[lcl_id] = 1; \n"
                " *compact_second_chunk = 1; \n"
                " } \n"
                " else \n"
                " { \n"
                " *is_active_second = 0; \n"
                " s_compaction_list_exc[lcl_id] = 0; \n"
                " if (left_count != mid_count) \n"
                " { \n"
                " storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " left, mid, left_count, mid_count, precision); \n"
                " } \n"
                " else \n"
                " { \n"
                " storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " mid, right, mid_count, right_count, precision); \n"
                " } \n"
                " } \n"
                " } \n");
}
502 
503 
/** @brief Appends the OpenCL helper function storeNonEmptyIntervalsLarge(...) to the kernel source.
 *
 * Generated logic mirrors storeNonEmptyIntervals, but uses __local unsigned short arrays for the
 * counts and the compaction list, names the tolerance 'epsilon', and stores intervals through
 * storeIntervalShort(): if left_count != mid_count and mid_count != right_count, (left, mid) is
 * stored and the second-child flags are set to 1; otherwise the flags are cleared and either
 * (left, mid) or (mid, right) is stored, depending on whether left_count != mid_count.
 * The emitted code relies on storeIntervalShort being available in the same OpenCL program.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for interval limits and epsilon.
 */
template <typename StringType>
void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " void \n"
                " storeNonEmptyIntervalsLarge(unsigned int addr, \n"
                " const unsigned int num_threads_active, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_left, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_right, \n"
                " __local unsigned short *s_left_count, \n"
                " __local unsigned short *s_right_count, \n"
                " ");
  source.append(numeric_string);
  // The blank after the line break is part of the emitted text and is kept as is.
  source.append(" left, \n "
                " ");
  source.append(numeric_string);
  source.append(" mid, \n"
                " ");
  source.append(numeric_string);
  source.append(" right,\n"
                " const unsigned int left_count, \n"
                " const unsigned int mid_count, \n"
                " const unsigned int right_count, \n"
                " ");
  source.append(numeric_string);

  // ---- body: store one child interval here, flag a possible second one for compaction ----
  source.append(" epsilon, \n"
                " __local unsigned int *compact_second_chunk, \n"
                " __local unsigned short *s_compaction_list, \n"
                " unsigned int *is_active_second) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " if ((left_count != mid_count) && (mid_count != right_count)) \n"
                " { \n"
                " storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " left, mid, left_count, mid_count, epsilon); \n"
                " *is_active_second = 1; \n"
                " s_compaction_list[lcl_id] = 1; \n"
                " *compact_second_chunk = 1; \n"
                " } \n"
                " else \n"
                " { \n"
                " *is_active_second = 0; \n"
                " s_compaction_list[lcl_id] = 0; \n"
                " if (left_count != mid_count) \n"
                " { \n"
                " storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " left, mid, left_count, mid_count, epsilon); \n"
                " } \n"
                " else \n"
                " { \n"
                " storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " mid, right, mid_count, right_count, epsilon); \n"
                " } \n"
                " } \n"
                " } \n");
}
567 
/** @brief Appends the OpenCL helper function createIndicesCompaction(...) to the kernel source.
 *
 * The generated function updates s_compaction_list_exc in place in two loops. The first loop
 * runs d from num_threads_compaction/2 down to 1 while doubling 'offset', adding element ai to
 * element bi for tid < d. The second loop runs d = 2, 4, ... while d < num_threads_compaction
 * and halves 'offset', again adding element ai to element bi for tid < d-1. Each step is
 * preceded by a local-memory barrier, and a final barrier closes the function.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Unused; present so that all generators share the same signature shape.
 */
template<typename StringType>
void generate_bisect_kernel_createIndicesCompaction(StringType & source, std::string const & numeric_string)
{
  (void)numeric_string;   // no floating point types occur in the generated code

  source.append(" \n"
                " void \n"
                " createIndicesCompaction(__local unsigned int *s_compaction_list_exc, \n"
                " unsigned int num_threads_compaction) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " unsigned int offset = 1; \n"
                " const unsigned int tid = lcl_id; \n"
                // first loop: d shrinks, offset grows
                " for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n"
                " { \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " if (tid < d) \n"
                " { \n"
                " unsigned int ai = offset*(2*tid+1)-1; \n"
                " unsigned int bi = offset*(2*tid+2)-1; \n"
                " \n"
                " s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"
                " + s_compaction_list_exc[ai]; \n"
                " } \n"
                " offset <<= 1; \n"
                " } \n"
                // second loop: d grows, offset shrinks
                " for (int d = 2; d < num_threads_compaction; d <<= 1) \n"
                " { \n"
                " offset >>= 1; \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " if (tid < (d-1)) \n"
                " { \n"
                " unsigned int ai = offset*(tid+1) - 1; \n"
                " unsigned int bi = ai + (offset >> 1); \n"
                " s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"
                " + s_compaction_list_exc[ai]; \n"
                " } \n"
                " } \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " } \n");
}
640 
641 
/** @brief Appends the OpenCL helper function createIndicesCompactionShort(...) to the kernel source.
 *
 * Same generated logic as createIndicesCompaction, operating on a __local unsigned short list:
 * a first loop with shrinking d and doubling 'offset', a second loop with growing d and halving
 * 'offset', each step adding element ai to element bi behind a local-memory barrier, and a
 * final barrier before returning.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Unused; present so that all generators share the same signature shape.
 */
template<typename StringType>
void generate_bisect_kernel_createIndicesCompactionShort(StringType & source, std::string const & numeric_string)
{
  (void)numeric_string;   // no floating point types occur in the generated code

  source.append(" \n"
                " void \n"
                " createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc, \n"
                " unsigned int num_threads_compaction) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " unsigned int offset = 1; \n"
                " const unsigned int tid = lcl_id; \n"
                // first loop: d shrinks, offset grows
                " for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n"
                " { \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " if (tid < d) \n"
                " { \n"
                " unsigned int ai = offset*(2*tid+1)-1; \n"
                " unsigned int bi = offset*(2*tid+2)-1; \n"
                " \n"
                " s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"
                " + s_compaction_list_exc[ai]; \n"
                " } \n"
                " offset <<= 1; \n"
                " } \n"
                // second loop: d grows, offset shrinks
                " for (int d = 2; d < num_threads_compaction; d <<= 1) \n"
                " { \n"
                " offset >>= 1; \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " if (tid < (d-1)) \n"
                " { \n"
                " unsigned int ai = offset*(tid+1) - 1; \n"
                " unsigned int bi = ai + (offset >> 1); \n"
                " s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"
                " + s_compaction_list_exc[ai]; \n"
                " } \n"
                " } \n"
                " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
                " } \n");
}
702 
704 
/** @brief Appends the OpenCL helper function compactIntervals(...) to the kernel source.
 *
 * Generated logic: a work-item with tid < num_threads_active and is_active_second == 1 writes
 * its second child interval (mid, right, mid_count, right_count) to position
 * num_threads_active + s_compaction_list[tid] of s_left, s_right, s_left_count and s_right_count.
 * All other work-items do nothing.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for the interval limits.
 */
template<typename StringType>
void generate_bisect_kernel_compactIntervals(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " void \n"
                " compactIntervals(__local ");
  source.append(numeric_string);
  source.append(" *s_left, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_right, \n"
                " __local unsigned int *s_left_count, \n"
                " __local unsigned int *s_right_count, \n"
                " ");
  source.append(numeric_string);
  source.append(" mid, \n"
                " ");
  source.append(numeric_string);

  // ---- body: scatter the second child interval to its compacted slot ----
  source.append(" right, \n"
                " unsigned int mid_count, unsigned int right_count, \n"
                " __local unsigned int *s_compaction_list, \n"
                " unsigned int num_threads_active, \n"
                " unsigned int is_active_second) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " const unsigned int tid = lcl_id; \n"
                " if ((tid < num_threads_active) && (1 == is_active_second)) \n"
                " { \n"
                " unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n"
                " s_left[addr_w] = mid; \n"
                " s_right[addr_w] = right; \n"
                " s_left_count[addr_w] = mid_count; \n"
                " s_right_count[addr_w] = right_count; \n"
                " } \n"
                " } \n");
}
754 
755 
756 
757 
/** @brief Appends the OpenCL helper function compactIntervalsShort(...) to the kernel source.
 *
 * Same generated logic as compactIntervals, with the count arrays and the compaction list
 * declared as __local unsigned short pointers: a work-item with tid < num_threads_active and
 * is_active_second == 1 writes (mid, right, mid_count, right_count) to position
 * num_threads_active + s_compaction_list[tid].
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for the interval limits.
 */
template<typename StringType>
void generate_bisect_kernel_compactIntervalsShort(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " void \n"
                " compactIntervalsShort(__local ");
  source.append(numeric_string);
  source.append(" *s_left, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_right, \n"
                " __local unsigned short *s_left_count, \n"
                " __local unsigned short *s_right_count, \n"
                " ");
  source.append(numeric_string);
  source.append(" mid, \n"
                " ");
  source.append(numeric_string);

  // ---- body: scatter the second child interval to its compacted slot ----
  source.append(" right, \n"
                " unsigned int mid_count, unsigned int right_count, \n"
                " __local unsigned short *s_compaction_list, \n"
                " unsigned int num_threads_active, \n"
                " unsigned int is_active_second) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " const unsigned int tid = lcl_id; \n"
                " if ((tid < num_threads_active) && (1 == is_active_second)) \n"
                " { \n"
                " unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n"
                " s_left[addr_w] = mid; \n"
                " s_right[addr_w] = right; \n"
                " s_left_count[addr_w] = mid_count; \n"
                " s_right_count[addr_w] = right_count; \n"
                " } \n"
                " } \n");
}
794 
795 
796 
/** @brief Appends the OpenCL helper function storeIntervalConverged(...) to the kernel source.
 *
 * Generated logic: multiplicity is *right_count - *left_count.
 *  - multiplicity == 1: the interval (*left, *right) with its counts is written back to slot tid,
 *    *is_active_second and s_compaction_list_exc[tid] are set to 0.
 *  - otherwise: *mid_count becomes *left_count + multiplicity/2; slot tid receives
 *    (*left, *right) with counts (*left_count, *mid_count); *mid is set to *left; and
 *    *is_active_second, s_compaction_list_exc[tid] and *compact_second_chunk are set to 1.
 *
 * @param source          String-like object providing append(); receives the OpenCL text.
 * @param numeric_string  Name of the floating point type used for the interval limits.
 */
template<typename StringType>
void generate_bisect_kernel_storeIntervalConverged(StringType & source, std::string const & numeric_string)
{
  // ---- signature ----
  source.append(" \n"
                " void \n"
                " storeIntervalConverged( __local ");
  source.append(numeric_string);
  source.append(" *s_left, \n"
                " __local ");
  source.append(numeric_string);
  source.append(" *s_right, \n"
                " __local unsigned int *s_left_count, \n"
                " __local unsigned int *s_right_count, \n"
                " ");
  source.append(numeric_string);
  source.append(" *left, \n"
                " ");
  source.append(numeric_string);
  source.append(" *mid, \n"
                " ");
  source.append(numeric_string);

  // ---- body: single eigenvalue is re-stored, multiple eigenvalue is split by count ----
  source.append(" *right, \n"
                " unsigned int *left_count, \n"
                " unsigned int *mid_count, \n"
                " unsigned int *right_count, \n"
                " __local unsigned int *s_compaction_list_exc, \n"
                " __local unsigned int *compact_second_chunk, \n"
                " const unsigned int num_threads_active, \n"
                " unsigned int *is_active_second) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n"
                " const unsigned int tid = lcl_id; \n"
                " const unsigned int multiplicity = *right_count - *left_count; \n"
                " if (1 == multiplicity) \n"
                " { \n"
                " s_left[tid] = *left; \n"
                " s_right[tid] = *right; \n"
                " s_left_count[tid] = *left_count; \n"
                " s_right_count[tid] = *right_count; \n"
                " \n"
                " *is_active_second = 0; \n"
                " s_compaction_list_exc[tid] = 0; \n"
                " } \n"
                " else \n"
                " { \n"
                " *mid_count = *left_count + (multiplicity >> 1); \n"
                " s_left[tid] = *left; \n"
                " s_right[tid] = *right; \n"
                " s_left_count[tid] = *left_count; \n"
                " s_right_count[tid] = *mid_count; \n"
                " *mid = *left; \n"
                " *is_active_second = 1; \n"
                " s_compaction_list_exc[tid] = 1; \n"
                " *compact_second_chunk = 1; \n"
                " } \n"
                " } \n");
}
860 
861 
862 
863 
864 
865  template<typename StringType>
866  void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, std::string const & numeric_string)
867  {
868  source.append(" \n");
869  source.append(" void \n");
870  source.append(" storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(" *s_left, \n");
871  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
872  source.append(" __local unsigned short *s_left_count, \n");
873  source.append(" __local unsigned short *s_right_count, \n");
874  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
875  source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
876  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
877  source.append(" unsigned int *left_count, \n");
878  source.append(" unsigned int *mid_count, \n");
879  source.append(" unsigned int *right_count, \n");
880  source.append(" __local unsigned short *s_compaction_list_exc, \n");
881  source.append(" __local unsigned int *compact_second_chunk, \n");
882  source.append(" const unsigned int num_threads_active, \n");
883  source.append(" unsigned int *is_active_second) \n");
884  source.append(" { \n");
885  source.append(" uint glb_id = get_global_id(0); \n");
886  source.append(" uint grp_id = get_group_id(0); \n");
887  source.append(" uint grp_nm = get_num_groups(0); \n");
888  source.append(" uint lcl_id = get_local_id(0); \n");
889  source.append(" uint lcl_sz = get_local_size(0); \n");
890 
891  source.append(" const unsigned int tid = lcl_id; \n");
892  source.append(" const unsigned int multiplicity = *right_count - *left_count; \n");
893  // check multiplicity of eigenvalue
894  source.append(" if (1 == multiplicity) \n");
895  source.append(" { \n");
896 
897  // just re-store intervals, simple eigenvalue
898  source.append(" s_left[tid] = *left; \n");
899  source.append(" s_right[tid] = *right; \n");
900  source.append(" s_left_count[tid] = *left_count; \n");
901  source.append(" s_right_count[tid] = *right_count; \n");
902  source.append(" \n");
903 
904  // mark that no second child / clear
905  source.append(" *is_active_second = 0; \n");
906  source.append(" s_compaction_list_exc[tid] = 0; \n");
907  source.append(" } \n");
908  source.append(" else \n");
909  source.append(" { \n");
910 
911  // number of eigenvalues after the split less than mid
912  source.append(" *mid_count = *left_count + (multiplicity >> 1); \n");
913 
914  // store left interval
915  source.append(" s_left[tid] = *left; \n");
916  source.append(" s_right[tid] = *right; \n");
917  source.append(" s_left_count[tid] = *left_count; \n");
918  source.append(" s_right_count[tid] = *mid_count; \n");
919  source.append(" *mid = *left; \n");
920 
921  // mark that second child interval exists
922  source.append(" *is_active_second = 1; \n");
923  source.append(" s_compaction_list_exc[tid] = 1; \n");
924  source.append(" *compact_second_chunk = 1; \n");
925  source.append(" } \n");
926  source.append(" } \n");
927  }
928 
/**
 * Shared generator behind generate_bisect_kernel_subdivideActiveInterval() and
 * generate_bisect_kernel_subdivideActiveIntervalShort(). Both emit the same OpenCL C
 * helper; only the function name and the element type of the __local count arrays differ.
 *
 * The emitted function, for tid < num_threads_active, loads the interval stored in slot
 * tid into *left, *right, *left_count and *right_count. If the limits differ it sets
 * *mid = computeMidpoint(*left, *right) and clears *all_threads_converged; if they
 * coincide but more than one eigenvalue is enclosed it only clears
 * *all_threads_converged. computeMidpoint is referenced but not defined here.
 *
 * The locals glb_id, grp_id, grp_nm, lcl_id and lcl_sz are emitted although the emitted
 * body does not reference them; they are kept so the generated text stays identical to
 * what was emitted before.
 *
 * \param source          String-like object (only append() is used) receiving the text.
 * \param numeric_string  Spliced in verbatim as the scalar type of the interval limits.
 * \param function_name   Name of the emitted OpenCL function.
 * \param count_type      Element type of s_left_count and s_right_count
 *                        ("unsigned int" or "unsigned short").
 */
template<typename StringType>
void generate_bisect_kernel_subdivideActiveInterval_impl(StringType & source,
                                                         std::string const & numeric_string,
                                                         char const * function_name,
                                                         char const * count_type)
{
  // signature
  source.append(" \n");
  source.append(" void \n");
  source.append(" "); source.append(function_name); source.append("(const unsigned int tid, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
  source.append(" __local "); source.append(count_type); source.append(" *s_left_count, \n");
  source.append(" __local "); source.append(count_type); source.append(" *s_right_count, \n");
  source.append(" const unsigned int num_threads_active, \n");
  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
  source.append(" unsigned int *left_count, unsigned int *right_count, \n");
  source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
  source.append(" __local unsigned int *all_threads_converged) \n");

  // work-item ids
  source.append(" { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");

  // for all active threads
  source.append(" if (tid < num_threads_active) \n"
                " { \n"
                " *left = s_left[tid]; \n"
                " *right = s_right[tid]; \n"
                " *left_count = s_left_count[tid]; \n"
                " *right_count = s_right_count[tid]; \n");

  // check if thread already converged
  source.append(" if (*left != *right) \n"
                " { \n"
                " *mid = computeMidpoint(*left, *right); \n"
                " *all_threads_converged = 0; \n"
                " } \n"
                " else if ((*right_count - *left_count) > 1) \n"
                " { \n"
                // mark as not converged if multiple eigenvalues are enclosed;
                // the interval is duplicated later by storeIntervalConverged()
                " *all_threads_converged = 0; \n"
                " } \n");

  // end for all active threads, end of function
  source.append(" } \n"
                " } \n");
}

/**
 * Appends the OpenCL C helper `subdivideActiveInterval` (count arrays of type
 * unsigned int) to \a source.
 *
 * \param source          String-like object receiving the generated text.
 * \param numeric_string  Scalar type name spliced into the signature.
 */
template<typename StringType>
void generate_bisect_kernel_subdivideActiveInterval(StringType & source, std::string const & numeric_string)
{
  generate_bisect_kernel_subdivideActiveInterval_impl(source, numeric_string,
                                                      "subdivideActiveInterval",
                                                      "unsigned int");
}

/**
 * Appends the OpenCL C helper `subdivideActiveIntervalShort` (count arrays of type
 * unsigned short) to \a source.
 *
 * \param source          String-like object receiving the generated text.
 * \param numeric_string  Scalar type name spliced into the signature.
 */
template<typename StringType>
void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & source, std::string const & numeric_string)
{
  generate_bisect_kernel_subdivideActiveInterval_impl(source, numeric_string,
                                                      "subdivideActiveIntervalShort",
                                                      "unsigned short");
}
1045 
1046  // end of utilities
1047  // start of kernels
1048 
1049 
/**
 * Appends the OpenCL C source of the __kernel function `bisectKernel` to \a source.
 *
 * \param source          String-like object (only append() is used) receiving the kernel text.
 * \param numeric_string  Spliced in verbatim as the scalar type of g_d, g_s, g_left, g_right,
 *                        lg, ug, epsilon and of the per-work-item interval limits.
 *
 * The emitted kernel starts from the single interval [lg, ug] with eigenvalue counts
 * lg_eig_count / ug_eig_count, stored in slot 0 of its __local arrays, and loops until a
 * pass of subdivideActiveInterval() leaves all_threads_converged at 1. Afterwards the
 * work-items with lcl_id < n copy s_left / s_left_count to g_left / g_left_count.
 * g_right and g_right_count appear in the signature but are not referenced in the
 * emitted body.
 *
 * The emitted text uses MAX_THREADS_BLOCK_SMALL_MATRIX, subdivideActiveInterval,
 * computeNumSmallerEigenvals, storeNonEmptyIntervals, storeIntervalConverged,
 * createIndicesCompaction, compactIntervals and ceilPow2 without defining them here.
 */
1061  template <typename StringType>
1062  void generate_bisect_kernel_bisectKernel(StringType & source, std::string const & numeric_string)
1063  {
1064  source.append(" __kernel \n");
1065  source.append(" void \n");
1066  source.append(" bisectKernel(__global "); source.append(numeric_string); source.append(" *g_d, \n");
1067  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
1068  source.append(" const unsigned int n, \n");
1069  source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
1070  source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
1071  source.append(" __global unsigned int *g_left_count, __global unsigned int *g_right_count, \n");
1072  source.append(" const "); source.append(numeric_string); source.append(" lg, \n");
1073  source.append(" const "); source.append(numeric_string); source.append(" ug, \n");
1074  source.append(" const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
1075  source.append(" "); source.append(numeric_string); source.append(" epsilon \n");
1076  source.append(" ) \n");
1077  source.append(" { \n");
      // the emitted kernel advances g_s by one element before any use
      // NOTE(review): presumably the caller's buffer holds one leading element that must be skipped - confirm
1078  source.append(" g_s = g_s + 1; \n");
1079  source.append(" uint glb_id = get_global_id(0); \n");
1080  source.append(" uint grp_id = get_group_id(0); \n");
1081  source.append(" uint grp_nm = get_num_groups(0); \n");
1082  source.append(" uint lcl_id = get_local_id(0); \n");
1083  source.append(" uint lcl_sz = get_local_size(0); \n");
1084 
1085  // intervals (store left and right because the subdivision tree is in general
1086  // not dense)
1087  source.append(" __local "); source.append(numeric_string); source.append(" s_left[MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1088  source.append(" __local "); source.append(numeric_string); source.append(" s_right[MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1089 
1090  // number of eigenvalues that are smaller than s_left / s_right
1091  // (correspondence is realized via indices)
1092  source.append(" __local unsigned int s_left_count[MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1093  source.append(" __local unsigned int s_right_count[MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1094 
1095  // helper for stream compaction (one entry more than the interval arrays)
1096  source.append(" __local unsigned int \n");
1097  source.append(" s_compaction_list[MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; \n");
1098 
1099  // state variables for whole block
1100  // if 0 then compaction of second chunk of child intervals is not necessary
1101  // (because all intervals had exactly one non-dead child)
1102  source.append(" __local unsigned int compact_second_chunk; \n");
1103  source.append(" __local unsigned int all_threads_converged; \n");
1104 
1105  // number of currently active threads
1106  source.append(" __local unsigned int num_threads_active; \n");
1107 
1108  // number of threads to use for stream compaction
1109  source.append(" __local unsigned int num_threads_compaction; \n");
1110 
1111  // helper for exclusive scan: view of s_compaction_list shifted by one element
1112  source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
1113 
1114 
1115  // variables for currently processed interval
1116  // left and right limit of active interval
      // (the initializer literal is emitted as 0.0f independent of numeric_string)
1117  source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
1118  source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
1119  source.append(" unsigned int left_count = 0; \n");
1120  source.append(" unsigned int right_count = 0; \n");
1121  // midpoint of active interval
1122  source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
1123  // number of eigenvalues smaller than mid
1124  source.append(" unsigned int mid_count = 0; \n");
1125  // set when this work-item produced a second child interval; consumed by the compaction step
1126  source.append(" unsigned int is_active_second = 0; \n");
1127 
      // every work-item clears its own slot of the __local arrays
1128  source.append(" s_compaction_list[lcl_id] = 0; \n");
1129  source.append(" s_left[lcl_id] = 0.0; \n");
1130  source.append(" s_right[lcl_id] = 0.0; \n");
1131  source.append(" s_left_count[lcl_id] = 0; \n");
1132  source.append(" s_right_count[lcl_id] = 0; \n");
1133 
1134  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1135 
1136  // set up initial configuration: work-item 0 seeds slot 0 with the input interval
1137  source.append(" if (0 == lcl_id) \n");
1138  source.append(" { \n");
1139  source.append(" s_left[0] = lg; \n");
1140  source.append(" s_right[0] = ug; \n");
1141  source.append(" s_left_count[0] = lg_eig_count; \n");
1142  source.append(" s_right_count[0] = ug_eig_count; \n");
1143 
1144  source.append(" compact_second_chunk = 0; \n");
1145  source.append(" num_threads_active = 1; \n");
1146 
1147  source.append(" num_threads_compaction = 1; \n");
1148  source.append(" } \n");
1149 
1150  // for all active threads read intervals from the last level
1151  // the number of (worst case) active threads per level l is 2^l
1152 
1153  source.append(" while (true) \n");
1154  source.append(" { \n");
1155 
1156  source.append(" all_threads_converged = 1; \n");
1157  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1158 
1159  source.append(" is_active_second = 0; \n");
1160  source.append(" subdivideActiveInterval(lcl_id, \n");
1161  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
1162  source.append(" num_threads_active, \n");
1163  source.append(" &left, &right, &left_count, &right_count, \n");
1164  source.append(" &mid, &all_threads_converged); \n");
1165  // source.append(" output[lcl_id] = s_left; \n");
1166  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1167 
1168  // check if done
1169  source.append(" if (1 == all_threads_converged) \n");
1170  source.append(" { \n");
1171  source.append(" break; \n");
1172  source.append(" } \n");
1173 
1174  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1175 
1176  // compute number of eigenvalues smaller than mid
1177  // use all threads for reading the necessary matrix data from global
1178  // memory
1179  // use s_left and s_right as scratch space for diagonal and
1180  // superdiagonal of matrix
1181  source.append(" mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, \n");
1182  source.append(" lcl_id, num_threads_active, \n");
1183  source.append(" s_left, s_right, \n");
1184  source.append(" (left == right)); \n");
1185 
1186  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1187 
1188  // store intervals
1189  // for all threads store the first child interval in a continuous chunk of
1190  // memory, and the second child interval -- if it exists -- in a second
1191  // chunk; it is likely that all threads reach convergence up to
1192  // \a epsilon at the same level; furthermore, for higher level most / all
1193  // threads will have only one child, storing the first child compactly will
1194  // (first) avoid to perform a compaction step on the first chunk, (second)
1195  // make it for higher levels (when all threads / intervals have
1196  // exactly one child) unnecessary to perform a compaction of the second
1197  // chunk
1198  source.append(" if (lcl_id < num_threads_active) \n");
1199  source.append(" { \n");
1200 
1201  source.append(" if (left != right) \n");
1202  source.append(" { \n");
1203 
1204  // store intervals
1205  source.append(" storeNonEmptyIntervals(lcl_id, num_threads_active, \n");
1206  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
1207  source.append(" left, mid, right, \n");
1208  source.append(" left_count, mid_count, right_count, \n");
1209  source.append(" epsilon, &compact_second_chunk, \n");
1210  source.append(" s_compaction_list_exc, \n");
1211  source.append(" &is_active_second); \n");
1212  source.append(" } \n");
1213  source.append(" else \n");
1214  source.append(" { \n");
1215 
      // limits already coincide: delegate to storeIntervalConverged
1216  source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
1217  source.append(" &left, &mid, &right, \n");
1218  source.append(" &left_count, &mid_count, &right_count, \n");
1219  source.append(" s_compaction_list_exc, &compact_second_chunk, \n");
1220  source.append(" num_threads_active, \n");
1221  source.append(" &is_active_second); \n");
1222  source.append(" } \n");
1223  source.append(" } \n");
1224 
1225  // necessary so that compact_second_chunk is up-to-date
1226  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1227 
1228  // perform compaction of chunk where second children are stored
1229  // scan of (num_threads_active / 2) elements, thus at most
1230  // (num_threads_active / 4) threads are needed
1231  source.append(" if (compact_second_chunk > 0) \n");
1232  source.append(" { \n");
1233 
1234  source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
1235 
1236  source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
1237  source.append(" mid, right, mid_count, right_count, \n");
1238  source.append(" s_compaction_list, num_threads_active, \n");
1239  source.append(" is_active_second); \n");
1240  source.append(" } \n");
1241 
1242  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1243 
1244  source.append(" if (0 == lcl_id) \n");
1245  source.append(" { \n");
1246 
1247  // update number of active threads with result of reduction
1248  source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
1249 
1250  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
1251 
1252  source.append(" compact_second_chunk = 0; \n");
1253  source.append(" } \n");
1254 
1255  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1256 
1257  source.append(" } \n");
1258 
1259  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1260 
1261  // write resulting intervals to global mem
1262  // for all threads write if they have been converged to an eigenvalue to
1263  // a separate array
1264 
1265  // at most n valid intervals
1266  source.append(" if (lcl_id < n) \n");
1267  source.append(" { \n");
1268  // intervals converged so left and right limit are identical
1269  source.append(" g_left[lcl_id] = s_left[lcl_id]; \n");
1270  // left count is sufficient to have global order
1271  source.append(" g_left_count[lcl_id] = s_left_count[lcl_id]; \n");
1272  source.append(" } \n");
1273  source.append(" } \n");
1274  }
1275 
/**
 * Appends the OpenCL C source of the __kernel function `bisectKernelLarge_MultIntervals`
 * to \a source.
 *
 * \param source          String-like object (only append() is used) receiving the kernel text.
 * \param numeric_string  Spliced in verbatim as the scalar type of g_d, g_s, g_left, g_right,
 *                        g_lambda, precision and of the per-work-item interval limits.
 *
 * In the emitted kernel each work-group reads its range of input intervals from
 * blocks_mult[grp_id] .. blocks_mult[grp_id + 1] and its output offset from
 * blocks_mult_sum[grp_id], loads those intervals from g_left / g_right / g_left_count /
 * g_right_count into __local arrays of 2 * MAX_THREADS_BLOCK entries, and then repeats
 * subdivide / count / store / compact passes. The loop is left when a pass of
 * subdivideActiveInterval() leaves all_threads_converged at 1, or when
 * num_threads_compaction exceeds the work-group size. Finally the work-items with
 * tid < num_threads_active write s_left[tid] to g_lambda and s_right_count[tid] to g_pos
 * at index c_block_offset_output + tid.
 *
 * The emitted text uses MAX_THREADS_BLOCK, ceilPow2, subdivideActiveInterval,
 * computeNumSmallerEigenvalsLarge, storeNonEmptyIntervals, storeIntervalConverged,
 * createIndicesCompaction and compactIntervals without defining them here.
 */
1291  template <typename StringType>
1292  void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & source, std::string const & numeric_string)
1293  {
1294  source.append(" __kernel \n");
1295  source.append(" void \n");
1296  source.append(" bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
1297  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
1298  source.append(" const unsigned int n, \n");
1299  source.append(" __global unsigned int *blocks_mult, \n");
1300  source.append(" __global unsigned int *blocks_mult_sum, \n");
1301  source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
1302  source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
1303  source.append(" __global unsigned int *g_left_count, \n");
1304  source.append(" __global unsigned int *g_right_count, \n");
1305  source.append(" __global "); source.append(numeric_string); source.append(" *g_lambda, \n");
1306  source.append(" __global unsigned int *g_pos, \n");
1307  source.append(" "); source.append(numeric_string); source.append(" precision \n");
1308  source.append(" ) \n");
1309  source.append(" { \n");
      // the emitted kernel advances g_s by one element before any use
      // NOTE(review): presumably the caller's buffer holds one leading element that must be skipped - confirm
1310  source.append(" g_s = g_s + 1; \n");
1311  source.append(" uint glb_id = get_global_id(0); \n");
1312  source.append(" uint grp_id = get_group_id(0); \n");
1313  source.append(" uint grp_nm = get_num_groups(0); \n");
1314  source.append(" uint lcl_id = get_local_id(0); \n");
1315  source.append(" uint lcl_sz = get_local_size(0); \n");
1316 
      // tid is an alias of lcl_id; the emitted body uses both names
1317  source.append(" const unsigned int tid = lcl_id; \n");
1318 
1319  // left and right limits of interval
1320  source.append(" __local "); source.append(numeric_string); source.append(" s_left[2 * MAX_THREADS_BLOCK]; \n");
1321  source.append(" __local "); source.append(numeric_string); source.append(" s_right[2 * MAX_THREADS_BLOCK]; \n");
1322 
1323  // number of eigenvalues smaller than interval limits
1324  source.append(" __local unsigned int s_left_count[2 * MAX_THREADS_BLOCK]; \n");
1325  source.append(" __local unsigned int s_right_count[2 * MAX_THREADS_BLOCK]; \n");
1326 
1327  // helper array for chunk compaction of second chunk
1328  source.append(" __local unsigned int s_compaction_list[2 * MAX_THREADS_BLOCK + 1]; \n");
1329  // compaction list helper for exclusive scan: view shifted by one element
1330  source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
1331 
1332  // flag if all threads are converged
1333  source.append(" __local unsigned int all_threads_converged; \n");
1334  // number of active threads
1335  source.append(" __local unsigned int num_threads_active; \n");
1336  // number of threads to employ for compaction
1337  source.append(" __local unsigned int num_threads_compaction; \n");
1338  // flag if second chunk has to be compacted
1339  source.append(" __local unsigned int compact_second_chunk; \n");
1340 
1341  // parameters of block of intervals processed by this block of threads
1342  source.append(" __local unsigned int c_block_start; \n");
1343  source.append(" __local unsigned int c_block_end; \n");
1344  source.append(" __local unsigned int c_block_offset_output; \n");
1345 
1346  // midpoint of currently active interval of the thread
      // (the initializer literal is emitted as 0.0f independent of numeric_string)
1347  source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
1348  // number of eigenvalues smaller than \a mid
1349  source.append(" unsigned int mid_count = 0; \n");
1350  // current interval parameter
1351  source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
1352  source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
1353  source.append(" unsigned int left_count = 0; \n");
1354  source.append(" unsigned int right_count = 0; \n");
1355  // helper for compaction, keep track which threads have a second child
1356  source.append(" unsigned int is_active_second = 0; \n");
1357 
1358  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1359 
1360  // initialize common start conditions (done by work-item 0 only)
1361  source.append(" if (0 == tid) \n");
1362  source.append(" { \n");
1363 
1364  source.append(" c_block_start = blocks_mult[grp_id]; \n");
1365  source.append(" c_block_end = blocks_mult[grp_id + 1]; \n");
1366  source.append(" c_block_offset_output = blocks_mult_sum[grp_id]; \n");
1367  source.append(" \n");
1368 
1369  source.append(" num_threads_active = c_block_end - c_block_start; \n");
1370  source.append(" s_compaction_list[0] = 0; \n");
1371  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
1372 
1373  source.append(" all_threads_converged = 1; \n");
1374  source.append(" compact_second_chunk = 0; \n");
1375  source.append(" } \n");
      // NOTE(review): 42 looks like a leftover debug fill value; slots below num_threads_active
      // are overwritten from g_left_count / g_right_count right below - confirm the remaining
      // slots are never read before being written
1376  source.append(" s_left_count [tid] = 42; \n");
1377  source.append(" s_right_count[tid] = 42; \n");
1378  source.append(" s_left_count [tid + MAX_THREADS_BLOCK] = 0; \n");
1379  source.append(" s_right_count[tid + MAX_THREADS_BLOCK] = 0; \n");
1380  source.append(" \n");
1381  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1382  source.append(" \n");
1383 
1384  // read data into shared memory
1385  source.append(" if (tid < num_threads_active) \n");
1386  source.append(" { \n");
1387 
1388  source.append(" s_left[tid] = g_left[c_block_start + tid]; \n");
1389  source.append(" s_right[tid] = g_right[c_block_start + tid]; \n");
1390  source.append(" s_left_count[tid] = g_left_count[c_block_start + tid]; \n");
1391  source.append(" s_right_count[tid] = g_right_count[c_block_start + tid]; \n");
1392  source.append(" \n");
1393  source.append(" } \n");
1394  source.append(" \n");
1395  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
      // iter is incremented once per pass but not read anywhere else in the emitted kernel
1396  source.append(" unsigned int iter = 0; \n");
1397  // do until all threads converged
1398  source.append(" while (true) \n");
1399  source.append(" { \n");
1400  source.append(" iter++; \n");
1401  //for (int iter=0; iter < 0; iter++) {
      // clear the compaction list (both halves plus the extra last entry)
1402  source.append(" s_compaction_list[lcl_id] = 0; \n");
1403  source.append(" s_compaction_list[lcl_id + lcl_sz] = 0; \n");
1404  source.append(" s_compaction_list[2 * MAX_THREADS_BLOCK] = 0; \n");
1405 
1406  // subdivide interval if currently active and not already converged
1407  source.append(" subdivideActiveInterval(tid, s_left, s_right, \n");
1408  source.append(" s_left_count, s_right_count, \n");
1409  source.append(" num_threads_active, \n");
1410  source.append(" &left, &right, &left_count, &right_count, \n");
1411  source.append(" &mid, &all_threads_converged); \n");
1412  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1413 
1414  // stop if all eigenvalues have been found
1415  source.append(" if (1 == all_threads_converged) \n");
1416  source.append(" { \n");
1417  source.append(" \n");
1418  source.append(" break; \n");
1419  source.append(" } \n");
1420 
1421  // compute number of eigenvalues smaller than mid for active and not
1422  // converged intervals, use all threads for loading data from gmem and
1423  // s_left and s_right as scratch space to store the data load from gmem
1424  // in shared memory
1425  source.append(" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
1426  source.append(" mid, tid, num_threads_active, \n");
1427  source.append(" s_left, s_right, \n");
1428  source.append(" (left == right)); \n");
1429  source.append(" \n");
1430  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1431 
1432  source.append(" if (tid < num_threads_active) \n");
1433  source.append(" { \n");
1434  source.append(" \n");
1435  // store intervals
1436  source.append(" if (left != right) \n");
1437  source.append(" { \n");
1438 
1439  source.append(" storeNonEmptyIntervals(tid, num_threads_active, \n");
1440  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
1441  source.append(" left, mid, right, \n");
1442  source.append(" left_count, mid_count, right_count, \n");
1443  source.append(" precision, &compact_second_chunk, \n");
1444  source.append(" s_compaction_list_exc, \n");
1445  source.append(" &is_active_second); \n");
1446  source.append(" \n");
1447  source.append(" } \n");
1448  source.append(" else \n");
1449  source.append(" { \n");
1450 
      // limits already coincide: delegate to storeIntervalConverged
1451  source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
1452  source.append(" &left, &mid, &right, \n");
1453  source.append(" &left_count, &mid_count, &right_count, \n");
1454  source.append(" s_compaction_list_exc, &compact_second_chunk, \n");
1455  source.append(" num_threads_active, \n");
1456  source.append(" &is_active_second); \n");
1457  source.append(" \n");
1458  source.append(" } \n");
1459  source.append(" } \n");
1460 
1461  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1462 
1463  // compact second chunk of intervals if any of the threads generated
1464  // two child intervals
1465  source.append(" if (1 == compact_second_chunk) \n");
1466  source.append(" { \n");
1467 
1468  source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
1469  source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
1470  source.append(" mid, right, mid_count, right_count, \n");
1471  source.append(" s_compaction_list, num_threads_active, \n");
1472  source.append(" is_active_second); \n");
1473  source.append(" } \n");
1474 
1475  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1476 
1477  // update state variables (work-item 0 only)
1478  source.append(" if (0 == tid) \n");
1479  source.append(" { \n");
1480  source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
1481  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
1482 
1483  source.append(" compact_second_chunk = 0; \n");
1484  source.append(" all_threads_converged = 1; \n");
1485  source.append(" } \n");
1486 
1487  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1488 
1489  // clear
1490  source.append(" s_compaction_list_exc[lcl_id] = 0; \n");
1491  source.append(" s_compaction_list_exc[lcl_id + lcl_sz] = 0; \n");
1492  source.append(" \n");
      // leave the loop early once the compaction would need more threads than the work-group has
1493  source.append(" if (num_threads_compaction > lcl_sz) \n");
1494  source.append(" { \n");
1495  source.append(" break; \n");
1496  source.append(" } \n");
1497 
1498 
1499  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1500 
1501  source.append(" } \n"); // end until all threads converged
1502 
1503  // write data back to global memory
1504  source.append(" if (tid < num_threads_active) \n");
1505  source.append(" { \n");
1506 
1507  source.append(" unsigned int addr = c_block_offset_output + tid; \n");
1508  source.append(" \n");
1509  source.append(" g_lambda[addr] = s_left[tid]; \n");
1510  source.append(" g_pos[addr] = s_right_count[tid]; \n");
1511  source.append(" } \n");
1512  source.append(" } \n");
1513  }
1514 
1515 
/** @brief Appends the OpenCL source of the kernel bisectKernelLarge_OneIntervals to a string.
 *
 * In the generated kernel each work-item with gtid < num_intervals loads one interval
 * (g_left, g_right, g_pos), bisects it until the convergence test passes and finally
 * stores the resulting left limit back to g_left. The loop is left once the work-group
 * flag converged_all_threads is still 1 after an iteration.
 *
 * @param source          String the generated kernel source is appended to
 * @param numeric_string  Scalar type name, inserted verbatim into the generated code
 */
template <typename StringType>
void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & source, std::string const & numeric_string)
{
  // ---- kernel signature ----
  source.append(
    " __kernel \n"
    " void \n"
    " bisectKernelLarge_OneIntervals(__global ");
  source.append(numeric_string);
  source.append(
    " *g_d, \n"
    " __global ");
  source.append(numeric_string);
  source.append(
    " *g_s, \n"
    " const unsigned int n, \n"
    " unsigned int num_intervals, \n"
    " __global ");
  source.append(numeric_string);
  source.append(
    " *g_left, \n"
    " __global ");
  source.append(numeric_string);
  source.append(
    " *g_right, \n"
    " __global unsigned int *g_pos, \n"
    " ");
  source.append(numeric_string);

  // ---- work-item ids and local scratch arrays ----
  source.append(
    " precision) \n"
    " { \n"
    " g_s = g_s + 1; \n"
    " uint glb_id = get_global_id(0); \n"
    " uint grp_id = get_group_id(0); \n"
    " uint grp_nm = get_num_groups(0); \n"
    " uint lcl_id = get_local_id(0); \n"
    " uint lcl_sz = get_local_size(0); \n"
    " const unsigned int gtid = (lcl_sz * grp_id) + lcl_id; \n"
    " __local ");
  source.append(numeric_string);
  source.append(
    " s_left_scratch[MAX_THREADS_BLOCK]; \n"
    " __local ");
  source.append(numeric_string);

  // ---- per-work-item state ----
  source.append(
    " s_right_scratch[MAX_THREADS_BLOCK]; \n"
    // left and right limit of the interval handled by this work-item
    " ");
  source.append(numeric_string);
  source.append(
    " left, right; \n"
    // count belonging to the right limit (loaded from g_pos below)
    " unsigned int right_count; \n"
    // set to 1 once this work-item has converged
    " unsigned int converged = 0; \n"
    // midpoint used when the interval is subdivided
    " ");
  source.append(numeric_string);
  source.append(
    " mid = 0.0f; \n"
    // number of eigenvalues less than mid
    " unsigned int mid_count = 0; \n"

    // load the interval from global memory
    " if (gtid < num_intervals) \n"
    " { \n"
    " left = g_left[gtid]; \n"
    " right = g_right[gtid]; \n"
    " right_count = g_pos[gtid]; \n"
    " } \n"
    // work-group flag: have all work-items converged to an eigenvalue?
    " __local unsigned int converged_all_threads; \n"
    // the first work-item of the group initializes the flag
    " if (0 == lcl_id) \n"
    " { \n"
    " converged_all_threads = 0; \n"
    " } \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    // ---- iterate until every work-item has converged ----
    " while (true) \n"
    " { \n"
    " converged_all_threads = 1; \n"
    // new midpoint for all active, not yet converged work-items
    " if ((gtid < num_intervals) && (0 == converged)) \n"
    " { \n"
    " mid = computeMidpoint(left, right); \n"
    " } \n"
    // number of eigenvalues smaller than the midpoint
    " mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n"
    " mid, gtid, num_intervals, \n"
    " s_left_scratch, \n"
    " s_right_scratch, \n"
    " converged); \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
    // active work-items only
    " if ((gtid < num_intervals) && (0 == converged)) \n"
    " { \n"
    // shrink the interval -- exactly one child interval survives
    " if (right_count == mid_count) \n"
    " { \n"
    " right = mid; \n"
    " } \n"
    " else \n"
    " { \n"
    " left = mid; \n"
    " } \n"
    // convergence test
    " ");
  source.append(numeric_string);
  source.append(
    " t0 = right - left; \n"
    " ");
  source.append(numeric_string);
  source.append(
    " t1 = max(fabs(right), fabs(left)) * precision; \n"

    " if (t0 < min(precision, t1)) \n"
    " { \n"
    " ");
  source.append(numeric_string);
  source.append(
    " lambda = computeMidpoint(left, right); \n"
    " left = lambda; \n"
    " right = lambda; \n"

    " converged = 1; \n"
    " } \n"
    " else \n"
    " { \n"
    " converged_all_threads = 0; \n"
    " } \n"
    " } \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
    " if (1 == converged_all_threads) \n"
    " { \n"
    " break; \n"
    " } \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
    " } \n"

    // ---- write the result back to global memory ----
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
    " if (gtid < num_intervals) \n"
    " { \n"
    // after convergence left and right are identical, so storing left is sufficient
    " g_left[gtid] = left; \n"
    " } \n"
    " } \n");
}
1640 
/** @brief Appends the OpenCL source of the helper function writeToGmem to a string.
 *
 * The generated function copies interval data from local arrays to global arrays:
 * entries with an index below offset_mult_lambda go to the "*_one" arrays, all other
 * entries go to the "*_mult" arrays (index shifted by offset_mult_lambda). After a
 * barrier the block lists s_compaction_list / s_cl_helper are written to
 * g_blocks_mult / g_blocks_mult_sum.
 *
 * @param source          String the generated function source is appended to
 * @param numeric_string  Scalar type name, inserted verbatim into the generated code
 */
template <typename StringType>
void generate_bisect_kernel_writeToGmem(StringType & source, std::string const & numeric_string)
{
  // ---- function signature ----
  source.append(
    " \n"
    " void writeToGmem(const unsigned int tid, const unsigned int tid_2, \n"
    " const unsigned int num_threads_active, \n"
    " const unsigned int num_blocks_mult, \n"
    " __global ");
  source.append(numeric_string);
  source.append(
    " *g_left_one, \n"
    " __global ");
  source.append(numeric_string);
  source.append(
    " *g_right_one, \n"
    " __global unsigned int *g_pos_one, \n"
    " __global ");
  source.append(numeric_string);
  source.append(
    " *g_left_mult, \n"
    " __global ");
  source.append(numeric_string);
  source.append(
    " *g_right_mult, \n"
    " __global unsigned int *g_left_count_mult, \n"
    " __global unsigned int *g_right_count_mult, \n"
    " __local ");
  source.append(numeric_string);
  source.append(
    " *s_left, \n"
    " __local ");
  source.append(numeric_string);

  // ---- remaining parameters and function body ----
  source.append(
    " *s_right, \n"
    " __local unsigned short *s_left_count, __local unsigned short *s_right_count, \n"
    " __global unsigned int *g_blocks_mult, \n"
    " __global unsigned int *g_blocks_mult_sum, \n"
    " __local unsigned short *s_compaction_list, \n"
    " __local unsigned short *s_cl_helper, \n"
    " unsigned int offset_mult_lambda \n"
    " ) \n"
    " { \n"
    " uint glb_id = get_global_id(0); \n"
    " uint grp_id = get_group_id(0); \n"
    " uint grp_nm = get_num_groups(0); \n"
    " uint lcl_id = get_local_id(0); \n"
    " uint lcl_sz = get_local_size(0); \n"

    // first element handled by this work-item (index tid)
    " if (tid < offset_mult_lambda) \n"
    " { \n"
    " g_left_one[tid] = s_left[tid]; \n"
    " g_right_one[tid] = s_right[tid]; \n"
    // the right count allows ordering the eigenvalues without sorting
    " g_pos_one[tid] = s_right_count[tid]; \n"
    " } \n"
    " else \n"
    " { \n"
    " \n"
    " g_left_mult[tid - offset_mult_lambda] = s_left[tid]; \n"
    " g_right_mult[tid - offset_mult_lambda] = s_right[tid]; \n"
    " g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid]; \n"
    " g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid]; \n"
    " } \n"

    // second element handled by this work-item (index tid_2), if active
    " if (tid_2 < num_threads_active) \n"
    " { \n"
    " if (tid_2 < offset_mult_lambda) \n"
    " { \n"
    " g_left_one[tid_2] = s_left[tid_2]; \n"
    " g_right_one[tid_2] = s_right[tid_2]; \n"
    // the right count allows ordering the eigenvalues without sorting
    " g_pos_one[tid_2] = s_right_count[tid_2]; \n"
    " } \n"
    " else \n"
    " { \n"
    " g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2]; \n"
    " g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2]; \n"
    " g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2]; \n"
    " g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2]; \n"
    " } \n"
    " } \n"   // end of writing out interval data

    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    // Original note: s_cl_blocking = s_compaction_list + 1, i.e. writing out
    // s_compaction_list yields the exclusive scan result.
    " if (tid <= num_blocks_mult) \n"
    " { \n"
    " g_blocks_mult[tid] = s_compaction_list[tid]; \n"
    " g_blocks_mult_sum[tid] = s_cl_helper[tid]; \n"
    " } \n"
    " if (tid_2 <= num_blocks_mult) \n"
    " { \n"
    " g_blocks_mult[tid_2] = s_compaction_list[tid_2]; \n"
    " g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2]; \n"
    " } \n"
    " } \n");
}
1729 
/** @brief Appends the OpenCL source of the helper function compactStreamsFinal to a string.
 *
 * The generated function caches the interval limits of the calling work-item, looks up
 * the write positions in the lists s_cl_one / s_cl_mult / s_cl_blocking and then stores
 * the interval data (and, where c_block_iend is non-zero, the block information) at
 * those positions in local memory.
 *
 * @param source          String the generated function source is appended to
 * @param numeric_string  Scalar type name, inserted verbatim into the generated code
 */
template <typename StringType>
void generate_bisect_kernel_compactStreamsFinal(StringType & source, std::string const & numeric_string)
{
  // ---- function signature ----
  source.append(
    " \n"
    " void \n"
    " compactStreamsFinal(const unsigned int tid, const unsigned int tid_2, \n"
    " const unsigned int num_threads_active, \n"
    " __local unsigned int *offset_mult_lambda, \n"
    " __local ");
  source.append(numeric_string);
  source.append(
    " *s_left, \n"
    " __local ");
  source.append(numeric_string);
  source.append(
    " *s_right, \n"
    " __local unsigned short *s_left_count, __local unsigned short *s_right_count, \n"
    " __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult, \n"
    " __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper, \n"
    " unsigned int is_one_lambda, unsigned int is_one_lambda_2, \n"
    " ");
  source.append(numeric_string);
  source.append(
    " *left, \n"
    " ");
  source.append(numeric_string);
  source.append(
    " *right, \n"
    " ");
  source.append(numeric_string);
  source.append(
    " *left_2, \n"
    " ");
  source.append(numeric_string);

  // ---- remaining parameters and function body ----
  source.append(
    " *right_2, \n"
    " unsigned int *left_count, unsigned int *right_count, \n"
    " unsigned int *left_count_2, unsigned int *right_count_2, \n"
    " unsigned int c_block_iend, unsigned int c_sum_block, \n"
    " unsigned int c_block_iend_2, unsigned int c_sum_block_2 \n"
    " ) \n"
    " { \n"
    " uint glb_id = get_global_id(0); \n"
    " uint grp_id = get_group_id(0); \n"
    " uint grp_nm = get_num_groups(0); \n"
    " uint lcl_id = get_local_id(0); \n"
    " uint lcl_sz = get_local_size(0); \n"

    // keep a private copy of the limits before the arrays are overwritten
    " *left = s_left[tid]; \n"
    " *right = s_right[tid]; \n"

    " if (tid_2 < num_threads_active) \n"
    " { \n"
    " *left_2 = s_left[tid_2]; \n"
    " *right_2 = s_right[tid_2]; \n"
    " } \n"

    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    // write positions for the intervals and for the blocks of intervals
    " unsigned int ptr_w = 0; \n"
    " unsigned int ptr_w_2 = 0; \n"
    " unsigned int ptr_blocking_w = 0; \n"
    " unsigned int ptr_blocking_w_2 = 0; \n"
    " \n"
    " \n"

    " ptr_w = (1 == is_one_lambda) ? s_cl_one[tid] \n"
    " : s_cl_mult[tid] + *offset_mult_lambda; \n"

    " if (0 != c_block_iend) \n"
    " { \n"
    " ptr_blocking_w = s_cl_blocking[tid]; \n"
    " } \n"

    " if (tid_2 < num_threads_active) \n"
    " { \n"
    " ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2] \n"
    " : s_cl_mult[tid_2] + *offset_mult_lambda; \n"

    " if (0 != c_block_iend_2) \n"
    " { \n"
    " ptr_blocking_w_2 = s_cl_blocking[tid_2]; \n"
    " } \n"
    " } \n"
    " \n"
    " \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
    " \n"

    // compact store of the first element into local memory
    " s_left[ptr_w] = *left; \n"
    " s_right[ptr_w] = *right; \n"
    " s_left_count[ptr_w] = *left_count; \n"
    " s_right_count[ptr_w] = *right_count; \n"
    " \n"
    " \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"
    // the work-item with tid == 1 repeats its store after the barrier
    // (the motivation is not evident from this code)
    " if(tid == 1) \n"
    " { \n"
    " s_left[ptr_w] = *left; \n"
    " s_right[ptr_w] = *right; \n"
    " s_left_count[ptr_w] = *left_count; \n"
    " s_right_count[ptr_w] = *right_count; \n"
    " \n"
    " } \n"
    " if (0 != c_block_iend) \n"
    " { \n"
    " s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1; \n"
    " s_cl_helper[ptr_blocking_w + 1] = c_sum_block; \n"
    " } \n"
    " \n"

    " if (tid_2 < num_threads_active) \n"
    " { \n"
    // compact store of the second element into local memory
    " s_left[ptr_w_2] = *left_2; \n"
    " s_right[ptr_w_2] = *right_2; \n"
    " s_left_count[ptr_w_2] = *left_count_2; \n"
    " s_right_count[ptr_w_2] = *right_count_2; \n"

    " if (0 != c_block_iend_2) \n"
    " { \n"
    " s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1; \n"
    " s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2; \n"
    " } \n"
    " } \n"

    " } \n");
}
1843 
1844 
1845 
/** @brief Appends the OpenCL source of the helper function scanCompactBlocksStartAddress to a string.
 *
 * The generated function copies s_cl_helper into s_cl_blocking and then runs a
 * two-pass (tree build-up, tree traversal) scan over s_cl_blocking.
 *
 * @param source          String the generated function source is appended to
 * @param numeric_string  Not used; the generated code contains no scalar type name
 */
template <typename StringType>
void generate_bisect_kernel_scanCompactBlocksStartAddress(StringType & source, std::string const & numeric_string)
{
  (void)numeric_string;

  source.append(
    // ---- function signature ----
    " \n"
    " void \n"
    " scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2, \n"
    " const unsigned int num_threads_compaction, \n"
    " __local unsigned short *s_cl_blocking, \n"
    " __local unsigned short *s_cl_helper \n"
    " ) \n"
    " { \n"
    " uint glb_id = get_global_id(0); \n"
    " uint grp_id = get_group_id(0); \n"
    " uint grp_nm = get_num_groups(0); \n"
    " uint lcl_id = get_local_id(0); \n"
    " uint lcl_sz = get_local_size(0); \n"

    // second step of block generation: prepare compaction of the block list
    // itself so that the blocks can be written out efficiently
    " s_cl_blocking[tid] = s_cl_helper[tid]; \n"

    " if (tid_2 < num_threads_compaction) \n"
    " { \n"
    " s_cl_blocking[tid_2] = s_cl_helper[tid_2]; \n"
    " } \n"

    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    // Additional scan that compacts s_cl_blocking. It yields a compact list of
    // eigenvalue blocks, each holding about MAX_THREADS_BLOCK eigenvalues, so that
    // a single thread block can process one of them in a later step.
    " unsigned int offset = 1; \n"

    // ---- build the scan tree ----
    " for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n"
    " { \n"

    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    " if (tid < d) \n"
    " { \n"

    " unsigned int ai = offset*(2*tid+1)-1; \n"
    " unsigned int bi = offset*(2*tid+2)-1; \n"
    " s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; \n"
    " } \n"

    " offset <<= 1; \n"
    " } \n"

    // ---- traverse down the tree: first down to level 2 across ----
    " for (int d = 2; d < num_threads_compaction; d <<= 1) \n"
    " { \n"

    " offset >>= 1; \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    " if (tid < (d-1)) \n"
    " { \n"

    " unsigned int ai = offset*(tid+1) - 1; \n"
    " unsigned int bi = ai + (offset >> 1); \n"
    " s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; \n"
    " } \n"
    " } \n"

    " } \n");
}
1921 
1922 
/** @brief Appends the OpenCL source of the helper function scanSumBlocks to a string.
 *
 * The generated function runs a two-pass (tree build-up, tree traversal) scan over
 * s_cl_blocking. Afterwards the work-item with tid 0 copies the entry at index
 * num_threads_compaction - 1 of s_cl_helper and s_cl_blocking to index
 * num_threads_active - 1.
 *
 * @param source          String the generated function source is appended to
 * @param numeric_string  Not used; the generated code contains no scalar type name
 */
template <typename StringType>
void generate_bisect_kernel_scanSumBlocks(StringType & source, std::string const & numeric_string)
{
  (void)numeric_string;

  source.append(
    // ---- function signature ----
    " \n"
    " void \n"
    " scanSumBlocks(const unsigned int tid, const unsigned int tid_2, \n"
    " const unsigned int num_threads_active, \n"
    " const unsigned int num_threads_compaction, \n"
    " __local unsigned short *s_cl_blocking, \n"
    " __local unsigned short *s_cl_helper) \n"
    " { \n"
    " uint glb_id = get_global_id(0); \n"
    " uint grp_id = get_group_id(0); \n"
    " uint grp_nm = get_num_groups(0); \n"
    " uint lcl_id = get_local_id(0); \n"
    " uint lcl_sz = get_local_size(0); \n"

    " unsigned int offset = 1; \n"

    // ---- scan, pass 1: build up the tree (sum of elements within each block) ----
    " for (int d = num_threads_compaction >> 1; d > 0; d >>= 1) \n"
    " { \n"

    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    " if (tid < d) \n"
    " { \n"

    " unsigned int ai = offset*(2*tid+1)-1; \n"
    " unsigned int bi = offset*(2*tid+2)-1; \n"

    " s_cl_blocking[bi] += s_cl_blocking[ai]; \n"
    " } \n"

    " offset *= 2; \n"
    " } \n"

    // ---- scan, pass 2: traverse down the tree ----
    " for (int d = 2; d < (num_threads_compaction - 1); d <<= 1) \n"
    " { \n"

    " offset >>= 1; \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    " if (tid < (d-1)) \n"
    " { \n"
    " unsigned int ai = offset*(tid+1) - 1; \n"
    " unsigned int bi = ai + (offset >> 1); \n"
    " s_cl_blocking[bi] += s_cl_blocking[ai]; \n"
    " } \n"
    " } \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    " if (0 == tid) \n"
    " { \n"

    // Move the last element of the scan to the last valid element. This is needed
    // because the number of threads used for the scan is a power of two and not
    // necessarily the number of active threads.
    " s_cl_helper[num_threads_active - 1] = \n"
    " s_cl_helper[num_threads_compaction - 1]; \n"
    " s_cl_blocking[num_threads_active - 1] = \n"
    " s_cl_blocking[num_threads_compaction - 1]; \n"
    " } \n"
    " } \n");
}
1995 
/** @brief Appends the OpenCL source of the helper function scanInitial to a string.
 *
 * The generated function runs a scan over s_cl_one and s_cl_mult. During the tree
 * build-up pass it additionally updates s_cl_helper and s_cl_blocking, joining two
 * subtrees only while their combined s_cl_blocking value does not exceed
 * MAX_THREADS_BLOCK.
 *
 * @param source          String the generated function source is appended to
 * @param numeric_string  Not used; the generated code contains no scalar type name
 */
template <typename StringType>
void generate_bisect_kernel_scanInitial(StringType & source, std::string const & numeric_string)
{
  (void)numeric_string;

  source.append(
    // ---- function signature ----
    " \n"
    " void \n"
    " scanInitial(const unsigned int tid, const unsigned int tid_2, \n"
    " const unsigned int num_threads_active, \n"
    " const unsigned int num_threads_compaction, \n"
    " __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult, \n"
    " __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper \n"
    " ) \n"
    " { \n"
    " uint glb_id = get_global_id(0); \n"
    " uint grp_id = get_group_id(0); \n"
    " uint grp_nm = get_num_groups(0); \n"
    " uint lcl_id = get_local_id(0); \n"
    " uint lcl_sz = get_local_size(0); \n"

    // Scan used to compactly write out the intervals containing one and multiple
    // eigenvalues; it also generates the tree for blocking the intervals that
    // contain multiple eigenvalues.
    " unsigned int offset = 1; \n"

    // ---- build the scan tree ----
    " for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n"
    " { \n"

    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    " if (tid < d) \n"
    " { \n"

    " unsigned int ai = offset*(2*tid+1); \n"
    " unsigned int bi = offset*(2*tid+2)-1; \n"

    " s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1]; \n"
    " s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1]; \n"

    // s_cl_helper is binary: 0 for an internal node, 1 for the root node of a tree
    // that corresponds to a block. s_cl_blocking holds the number of nodes in each
    // sub-tree at each iteration; it must be kept in order to compute the total
    // number of eigenvalues per block, which is needed to write out the data
    // efficiently in the second step.
    " if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1)) \n"
    " { \n"

    // determine how many children are not yet terminated
    " if (s_cl_helper[ai - 1] == 1) \n"
    " { \n"
    // mark as terminated
    " s_cl_helper[bi] = 1; \n"
    " } \n"
    " else if (s_cl_helper[bi] == 1) \n"
    " { \n"
    // mark as terminated
    " s_cl_helper[ai - 1] = 1; \n"
    " } \n"
    " else \n"   // both children are non-terminated
    " { \n"

    " unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1]; \n"

    " if (temp > MAX_THREADS_BLOCK) \n"
    " { \n"

    // the two child trees must form separate blocks: terminate both trees
    " s_cl_helper[ai - 1] = 1; \n"
    " s_cl_helper[bi] = 1; \n"
    " } \n"
    " else \n"
    " { \n"
    // join the two subtrees
    " s_cl_blocking[bi] = temp; \n"
    " s_cl_blocking[ai - 1] = 0; \n"
    " } \n"
    " } \n"
    " } \n"   // end of s_cl_helper update
    " } \n"
    " offset <<= 1; \n"
    " } \n"

    // ---- traverse down the tree (stream compaction only, not block construction) ----
    " for (int d = 2; d < num_threads_compaction; d <<= 1) \n"
    " { \n"
    " offset >>= 1; \n"
    " barrier(CLK_LOCAL_MEM_FENCE) ; \n"

    " if (tid < (d-1)) \n"
    " { \n"
    " unsigned int ai = offset*(tid+1) - 1; \n"
    " unsigned int bi = ai + (offset >> 1); \n"
    " s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai]; \n"
    " s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai]; \n"
    " } \n"
    " } \n"
    " } \n");
}
2101 
2113  template <typename StringType>
2114  void generate_bisect_kernel_bisectKernelLarge(StringType & source, std::string const & numeric_string)
2115  {
2116  source.append(" __kernel \n");
2117  source.append(" void \n");
2118  source.append(" bisectKernelLarge(__global "); source.append(numeric_string); source.append(" *g_d, \n");
2119  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
2120  source.append(" const unsigned int n, \n");
2121  source.append(" const "); source.append(numeric_string); source.append(" lg, \n");
2122  source.append(" const "); source.append(numeric_string); source.append(" ug, \n");
2123  source.append(" const unsigned int lg_eig_count, \n");
2124  source.append(" const unsigned int ug_eig_count, \n");
2125  source.append(" "); source.append(numeric_string); source.append(" epsilon, \n");
2126  source.append(" __global unsigned int *g_num_one, \n");
2127  source.append(" __global unsigned int *g_num_blocks_mult, \n");
2128  source.append(" __global "); source.append(numeric_string); source.append(" *g_left_one, \n");
2129  source.append(" __global "); source.append(numeric_string); source.append(" *g_right_one, \n");
2130  source.append(" __global unsigned int *g_pos_one, \n");
2131  source.append(" __global "); source.append(numeric_string); source.append(" *g_left_mult, \n");
2132  source.append(" __global "); source.append(numeric_string); source.append(" *g_right_mult, \n");
2133  source.append(" __global unsigned int *g_left_count_mult, \n");
2134  source.append(" __global unsigned int *g_right_count_mult, \n");
2135  source.append(" __global unsigned int *g_blocks_mult, \n");
2136  source.append(" __global unsigned int *g_blocks_mult_sum \n");
2137  source.append(" ) \n");
2138  source.append(" { \n");
2139  source.append(" g_s = g_s + 1; \n");
2140  source.append(" uint glb_id = get_global_id(0); \n");
2141  source.append(" uint grp_id = get_group_id(0); \n");
2142  source.append(" uint grp_nm = get_num_groups(0); \n");
2143  source.append(" uint lcl_id = get_local_id(0); \n");
2144  source.append(" uint lcl_sz = get_local_size(0); \n");
2145 
2146  source.append(" const unsigned int tid = lcl_id; \n");
2147 
2148  // intervals (store left and right because the subdivision tree is in general
2149  // not dense
2150  source.append(" __local "); source.append(numeric_string); source.append(" s_left[2 * MAX_THREADS_BLOCK + 1]; \n");
2151  source.append(" __local "); source.append(numeric_string); source.append(" s_right[2 * MAX_THREADS_BLOCK + 1]; \n");
2152 
2153  // number of eigenvalues that are smaller than s_left / s_right
2154  // (correspondence is realized via indices)
2155  source.append(" __local unsigned short s_left_count[2 * MAX_THREADS_BLOCK + 1]; \n");
2156  source.append(" __local unsigned short s_right_count[2 * MAX_THREADS_BLOCK + 1]; \n");
2157 
2158  // helper for stream compaction
2159  source.append(" __local unsigned short s_compaction_list[2 * MAX_THREADS_BLOCK + 1]; \n");
2160 
2161  // state variables for whole block
2162  // if 0 then compaction of second chunk of child intervals is not necessary
2163  // (because all intervals had exactly one non-dead child)
2164  source.append(" __local unsigned int compact_second_chunk; \n");
2165  // if 1 then all threads are converged
2166  source.append(" __local unsigned int all_threads_converged; \n");
2167 
2168  // number of currently active threads
2169  source.append(" __local unsigned int num_threads_active; \n");
2170 
2171  // number of threads to use for stream compaction
2172  source.append(" __local unsigned int num_threads_compaction; \n");
2173 
2174  // helper for exclusive scan
2175  source.append(" __local unsigned short *s_compaction_list_exc = s_compaction_list + 1; \n");
2176 
2177  // variables for currently processed interval
2178  // left and right limit of active interval
2179  source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
2180  source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
2181  source.append(" unsigned int left_count = 0; \n");
2182  source.append(" unsigned int right_count = 0; \n");
2183  // midpoint of active interval
2184  source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
2185  // number of eigenvalues smaller then mid
2186  source.append(" unsigned int mid_count = 0; \n");
2187  // helper for stream compaction (tracking of threads generating second child)
2188  source.append(" unsigned int is_active_second = 0; \n");
2189 
2190  // initialize lists
2191  source.append(" s_compaction_list[tid] = 0; \n");
2192  source.append(" s_left[tid] = 0; \n");
2193  source.append(" s_right[tid] = 0; \n");
2194  source.append(" s_left_count[tid] = 0; \n");
2195  source.append(" s_right_count[tid] = 0; \n");
2196 
2197  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2198 
2199  // set up initial configuration
2200  source.append(" if (0 == tid) \n");
2201  source.append(" { \n");
2202 
2203  source.append(" s_left[0] = lg; \n");
2204  source.append(" s_right[0] = ug; \n");
2205  source.append(" s_left_count[0] = lg_eig_count; \n");
2206  source.append(" s_right_count[0] = ug_eig_count; \n");
2207 
2208  source.append(" compact_second_chunk = 0; \n");
2209  source.append(" num_threads_active = 1; \n");
2210 
2211  source.append(" num_threads_compaction = 1; \n");
2212 
2213  source.append(" all_threads_converged = 1; \n");
2214  source.append(" } \n");
2215 
2216  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2217 
2218  // for all active threads read intervals from the last level
2219  // the number of (worst case) active threads per level l is 2^l
2220  // determine coarse intervals. On these intervals the kernel for one or for multiple eigenvalues
2221  // will be executed in the second step
2222  source.append(" for( unsigned int i = 0; i < 15; ++i ) \n");
2223  source.append(" { \n");
2224  source.append(" s_compaction_list[tid] = 0; \n");
2225  source.append(" s_compaction_list[tid + MAX_THREADS_BLOCK] = 0; \n");
2226  source.append(" s_compaction_list[2 * MAX_THREADS_BLOCK] = 0; \n");
2227  source.append(" subdivideActiveIntervalShort(tid, s_left, s_right, s_left_count, s_right_count, \n");
2228  source.append(" num_threads_active, \n");
2229  source.append(" &left, &right, &left_count, &right_count, \n");
2230  source.append(" &mid, &all_threads_converged); \n");
2231 
2232  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2233 
2234  // check if done
2235  source.append(" if (1 == all_threads_converged) \n");
2236  source.append(" { \n");
2237  source.append(" break; \n");
2238  source.append(" } \n");
2239 
2240  // compute number of eigenvalues smaller than mid
2241  // use all threads for reading the necessary matrix data from global
2242  // memory
2243  // use s_left and s_right as scratch space for diagonal and
2244  // superdiagonal of matrix
2245  source.append(" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
2246  source.append(" mid, lcl_id, \n");
2247  source.append(" num_threads_active, \n");
2248  source.append(" s_left, s_right, \n");
2249  source.append(" (left == right)); \n");
2250 
2251  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2252 
2253  // store intervals
2254  // for all threads store the first child interval in a continuous chunk of
2255  // memory, and the second child interval -- if it exists -- in a second
2256  // chunk; it is likely that all threads reach convergence up to
2257  // \a epsilon at the same level; furthermore, for higher level most / all
2258  // threads will have only one child, storing the first child compactly will
2259  // (first) avoid to perform a compaction step on the first chunk, (second)
2260  // make it for higher levels (when all threads / intervals have
2261  // exactly one child) unnecessary to perform a compaction of the second
2262  // chunk
2263  source.append(" if (tid < num_threads_active) \n");
2264  source.append(" { \n");
2265 
2266  source.append(" if (left != right) \n");
2267  source.append(" { \n");
2268 
2269  // store intervals
2270  source.append(" storeNonEmptyIntervalsLarge(tid, num_threads_active, \n");
2271  source.append(" s_left, s_right, \n");
2272  source.append(" s_left_count, s_right_count, \n");
2273  source.append(" left, mid, right, \n");
2274  source.append(" left_count, mid_count, right_count, \n");
2275  source.append(" epsilon, &compact_second_chunk, \n");
2276  source.append(" s_compaction_list_exc, \n");
2277  source.append(" &is_active_second); \n");
2278  source.append(" } \n");
2279  source.append(" else \n");
2280  source.append(" { \n");
2281 
2282  // re-write converged interval (has to be stored again because s_left
2283  // and s_right are used as scratch space for
2284  // computeNumSmallerEigenvalsLarge()
2285  source.append(" s_left[tid] = left; \n");
2286  source.append(" s_right[tid] = left; \n");
2287  source.append(" s_left_count[tid] = left_count; \n");
2288  source.append(" s_right_count[tid] = right_count; \n");
2289 
2290  source.append(" is_active_second = 0; \n");
2291  source.append(" } \n");
2292  source.append(" } \n");
2293 
2294  // necessary so that compact_second_chunk is up-to-date
2295  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2296 
2297  // perform compaction of chunk where second children are stored
2298  // scan of (num_threads_active / 2) elements, thus at most
2299  // (num_threads_active / 4) threads are needed
2300  source.append(" if (compact_second_chunk > 0) \n");
2301  source.append(" { \n");
2302 
2303  // create indices for compaction
2304  source.append(" createIndicesCompactionShort(s_compaction_list_exc, num_threads_compaction); \n");
2305  source.append(" } \n");
2306  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2307  source.append(" \n");
2308  source.append(" if (compact_second_chunk > 0) \n");
2309  source.append(" { \n");
2310  source.append(" compactIntervalsShort(s_left, s_right, s_left_count, s_right_count, \n");
2311  source.append(" mid, right, mid_count, right_count, \n");
2312  source.append(" s_compaction_list, num_threads_active, \n");
2313  source.append(" is_active_second); \n");
2314  source.append(" } \n");
2315 
2316  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2317 
2318  // update state variables
2319  source.append(" if (0 == tid) \n");
2320  source.append(" { \n");
2321 
2322  // update number of active threads with result of reduction
2323  source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
2324  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
2325 
2326  source.append(" compact_second_chunk = 0; \n");
2327  source.append(" all_threads_converged = 1; \n");
2328  source.append(" } \n");
2329  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2330  source.append(" if (num_threads_compaction > lcl_sz) \n");
2331  source.append(" { \n");
2332  source.append(" break; \n");
2333  source.append(" } \n");
2334  source.append(" } \n");
2335  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2336 
2337  // generate two lists of intervals; one with intervals that contain one
2338  // eigenvalue (or are converged), and one with intervals that need further
2339  // subdivision
2340 
2341  // perform two scans in parallel
2342 
2343  source.append(" unsigned int left_count_2; \n");
2344  source.append(" unsigned int right_count_2; \n");
2345 
2346  source.append(" unsigned int tid_2 = tid + lcl_sz; \n");
2347 
2348  // cache in per thread registers so that s_left_count and s_right_count
2349  // can be used for scans
2350  source.append(" left_count = s_left_count[tid]; \n");
2351  source.append(" right_count = s_right_count[tid]; \n");
2352 
2353  // some threads have to cache data for two intervals
2354  source.append(" if (tid_2 < num_threads_active) \n");
2355  source.append(" { \n");
2356  source.append(" left_count_2 = s_left_count[tid_2]; \n");
2357  source.append(" right_count_2 = s_right_count[tid_2]; \n");
2358  source.append(" } \n");
2359 
2360  // compaction list for intervals containing one and multiple eigenvalues
2361  // do not affect first element for exclusive scan
2362  source.append(" __local unsigned short *s_cl_one = s_left_count + 1; \n");
2363  source.append(" __local unsigned short *s_cl_mult = s_right_count + 1; \n");
2364 
2365  // compaction list for generating blocks of intervals containing multiple
2366  // eigenvalues
2367  source.append(" __local unsigned short *s_cl_blocking = s_compaction_list_exc; \n");
2368  // helper compaction list for generating blocks of intervals
2369  source.append(" __local unsigned short s_cl_helper[2 * MAX_THREADS_BLOCK + 1]; \n");
2370 
2371  source.append(" if (0 == tid) \n");
2372  source.append(" { \n");
2373  // set to 0 for exclusive scan
2374  source.append(" s_left_count[0] = 0; \n");
2375  source.append(" s_right_count[0] = 0; \n");
2376  source.append(" \n");
2377  source.append(" } \n");
2378 
2379  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2380 
2381  // flag if interval contains one or multiple eigenvalues
2382  source.append(" unsigned int is_one_lambda = 0; \n");
2383  source.append(" unsigned int is_one_lambda_2 = 0; \n");
2384 
2385  // number of eigenvalues in the interval
2386  source.append(" unsigned int multiplicity = right_count - left_count; \n");
2387  source.append(" is_one_lambda = (1 == multiplicity); \n");
2388 
2389  source.append(" s_cl_one[tid] = is_one_lambda; \n");
2390  source.append(" s_cl_mult[tid] = (! is_one_lambda); \n");
2391 
2392  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
2393  source.append(" s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity; \n");
2394  source.append(" s_cl_helper[tid] = 0; \n");
2395 
2396  source.append(" if (tid_2 < num_threads_active) \n");
2397  source.append(" { \n");
2398 
2399  source.append(" unsigned int multiplicity = right_count_2 - left_count_2; \n");
2400  source.append(" is_one_lambda_2 = (1 == multiplicity); \n");
2401 
2402  source.append(" s_cl_one[tid_2] = is_one_lambda_2; \n");
2403  source.append(" s_cl_mult[tid_2] = (! is_one_lambda_2); \n");
2404 
2405  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
2406  source.append(" s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity; \n");
2407  source.append(" s_cl_helper[tid_2] = 0; \n");
2408  source.append(" } \n");
2409  source.append(" else if (tid_2 < (2 * MAX_THREADS_BLOCK + 1)) \n");
2410  source.append(" { \n");
2411 
2412  // clear
2413  source.append(" s_cl_blocking[tid_2] = 0; \n");
2414  source.append(" s_cl_helper[tid_2] = 0; \n");
2415  source.append(" } \n");
2416 
2417 
2418  source.append(" scanInitial(tid, tid_2, num_threads_active, num_threads_compaction, \n");
2419  source.append(" s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper); \n");
2420  source.append(" \n");
2421  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2422 
2423  source.append(" scanSumBlocks(tid, tid_2, num_threads_active, \n");
2424  source.append(" num_threads_compaction, s_cl_blocking, s_cl_helper); \n");
2425 
2426  // end down sweep of scan
2427  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2428 
2429  source.append(" unsigned int c_block_iend = 0; \n");
2430  source.append(" unsigned int c_block_iend_2 = 0; \n");
2431  source.append(" unsigned int c_sum_block = 0; \n");
2432  source.append(" unsigned int c_sum_block_2 = 0; \n");
2433 
2434  // for each thread / interval that corresponds to root node of interval block
2435  // store start address of block and total number of eigenvalues in all blocks
2436  // before this block (particular thread is irrelevant, constraint is to
2437  // have a subset of threads so that one and only one of them is in each
2438  // interval)
2439  source.append(" if (1 == s_cl_helper[tid]) \n");
2440  source.append(" { \n");
2441 
2442  source.append(" c_block_iend = s_cl_mult[tid] + 1; \n");
2443  source.append(" c_sum_block = s_cl_blocking[tid]; \n");
2444  source.append(" } \n");
2445 
2446  source.append(" if (1 == s_cl_helper[tid_2]) \n");
2447  source.append(" { \n");
2448 
2449  source.append(" c_block_iend_2 = s_cl_mult[tid_2] + 1; \n");
2450  source.append(" c_sum_block_2 = s_cl_blocking[tid_2]; \n");
2451  source.append(" } \n");
2452 
2453  source.append(" scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction, \n");
2454  source.append(" s_cl_blocking, s_cl_helper); \n");
2455 
2456 
2457  // finished second scan for s_cl_blocking
2458  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2459 
2460  // determine the global results
2461  source.append(" __local unsigned int num_blocks_mult; \n");
2462  source.append(" __local unsigned int num_mult; \n");
2463  source.append(" __local unsigned int offset_mult_lambda; \n");
2464 
2465  source.append(" if (0 == tid) \n");
2466  source.append(" { \n");
2467 
2468  source.append(" num_blocks_mult = s_cl_blocking[num_threads_active - 1]; \n");
2469  source.append(" offset_mult_lambda = s_cl_one[num_threads_active - 1]; \n");
2470  source.append(" num_mult = s_cl_mult[num_threads_active - 1]; \n");
2471 
2472  source.append(" *g_num_one = offset_mult_lambda; \n");
2473  source.append(" *g_num_blocks_mult = num_blocks_mult; \n");
2474  source.append(" } \n");
2475 
2476  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2477 
2478  source.append(" "); source.append(numeric_string); source.append(" left_2, right_2; \n");
2479  source.append(" --s_cl_one; \n");
2480  source.append(" --s_cl_mult; \n");
2481  source.append(" --s_cl_blocking; \n");
2482  source.append(" \n");
2483  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2484  source.append(" compactStreamsFinal(tid, tid_2, num_threads_active, &offset_mult_lambda, \n");
2485  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
2486  source.append(" s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper, \n");
2487  source.append(" is_one_lambda, is_one_lambda_2, \n");
2488  source.append(" &left, &right, &left_2, &right_2, \n");
2489  source.append(" &left_count, &right_count, &left_count_2, &right_count_2, \n");
2490  source.append(" c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2 \n");
2491  source.append(" ); \n");
2492 
2493  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2494 
2495  // final adjustment before writing out data to global memory
2496  source.append(" if (0 == tid) \n");
2497  source.append(" { \n");
2498  source.append(" s_cl_blocking[num_blocks_mult] = num_mult; \n");
2499  source.append(" s_cl_helper[0] = 0; \n");
2500  source.append(" } \n");
2501 
2502  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2503 
2504  // write to global memory
2505  source.append(" writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult, \n");
2506  source.append(" g_left_one, g_right_one, g_pos_one, \n");
2507  source.append(" g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult, \n");
2508  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
2509  source.append(" g_blocks_mult, g_blocks_mult_sum, \n");
2510  source.append(" s_compaction_list, s_cl_helper, offset_mult_lambda); \n");
2511  source.append(" \n");
2512 
2513  source.append(" } \n");
2514  }
2515 
2516  // main kernel class
2520  template <class NumericT>
2522  {
2523  static std::string program_name()
2524  {
2525  return viennacl::ocl::type_to_string<NumericT>::apply() + "_bisect_kernel";
2526  }
2527 
2528  static void init(viennacl::ocl::context & ctx)
2529  {
2531  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
2532 
2533  static std::map<cl_context, bool> init_done;
2534  if (!init_done[ctx.handle().get()])
2535  {
2536  std::string source;
2537  source.reserve(8192);
2538 
2539  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
2540 
2541  // only generate for floating points (forces error for integers)
2542  if (numeric_string == "float" || numeric_string == "double")
2543  {
2544  //functions used from bisect_util.cpp
2546  generate_bisect_kernel_floorPow2(source, numeric_string);
2547  generate_bisect_kernel_ceilPow2(source, numeric_string);
2548  generate_bisect_kernel_computeMidpoint(source, numeric_string);
2549 
2550  generate_bisect_kernel_storeInterval(source, numeric_string);
2551  generate_bisect_kernel_storeIntervalShort(source, numeric_string);
2552 
2553  generate_bisect_kernel_computeNumSmallerEigenvals(source, numeric_string);
2555 
2556  generate_bisect_kernel_storeNonEmptyIntervals(source, numeric_string);
2558 
2559  generate_bisect_kernel_createIndicesCompaction(source, numeric_string);
2561 
2562  generate_bisect_kernel_compactIntervals(source, numeric_string);
2563  generate_bisect_kernel_compactIntervalsShort(source, numeric_string);
2564 
2565  generate_bisect_kernel_storeIntervalConverged(source, numeric_string);
2567 
2568  generate_bisect_kernel_subdivideActiveInterval(source, numeric_string);
2570 
2571  generate_bisect_kernel_bisectKernel(source, numeric_string);
2574 
2575 
2576  generate_bisect_kernel_writeToGmem(source, numeric_string);
2577  generate_bisect_kernel_compactStreamsFinal(source, numeric_string);
2579  generate_bisect_kernel_scanSumBlocks(source, numeric_string);
2580  generate_bisect_kernel_scanInitial(source, numeric_string);
2581  generate_bisect_kernel_bisectKernelLarge(source, numeric_string);
2582 
2583 
2584  }
2585 
2586  std::string prog_name = program_name();
2587  #ifdef VIENNACL_BUILD_INFO
2588  std::cout << "Creating program " << prog_name << std::endl;
2589  #endif
2590  ctx.add_program(source, prog_name);
2591  init_done[ctx.handle().get()] = true;
2592  } //if
2593  } //init
2594  };
2595 }
2596 }
2597 }
2598 }
2599 
2600 #endif // VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType &source, std::string const &numeric_string)
Perform second step of bisection algorithm for large matrices for intervals that after the first step...
Definition: bisect.hpp:1292
void generate_bisect_kernel_writeToGmem(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:1643
static void init(viennacl::ocl::context &ctx)
Definition: bisect.hpp:2528
void generate_bisect_kernel_bisectKernel(StringType &source, std::string const &numeric_string)
OpenCL kernel for bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix...
Definition: bisect.hpp:1062
void generate_bisect_kernel_subdivideActiveInterval(StringType &source, std::string const &numeric_string)
Subdivide interval if active and not already converged.
Definition: bisect.hpp:944
void generate_bisect_kernel_floorPow2(StringType &source, std::string const &numeric_string)
OpenCL function for computing the next lower power of two of n.
Definition: bisect.hpp:58
void generate_bisect_kernel_bisectKernelLarge(StringType &source, std::string const &numeric_string)
OpenCL kernel for bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix...
Definition: bisect.hpp:2114
void generate_bisect_kernel_compactStreamsFinal(StringType &source, std::string const &numeric_string)
OpenCL function for performing final stream compaction before writing data to global memory...
Definition: bisect.hpp:1732
void generate_bisect_kernel_scanCompactBlocksStartAddress(StringType &source, std::string const &numeric_string)
OpenCL function for computing addresses to obtain compact list of block start addresses.
Definition: bisect.hpp:1848
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:54
void generate_bisect_kernel_compactIntervals(StringType &source, std::string const &numeric_string)
OpenCL function for performing stream compaction for second child intervals.
Definition: bisect.hpp:719
void generate_bisect_kernel_createIndicesCompaction(StringType &source, std::string const &numeric_string)
OpenCL function for creating indices for compaction.
Definition: bisect.hpp:579
void generate_bisect_kernel_compactIntervalsShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:759
void generate_bisect_kernel_storeIntervalConvergedShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:866
void generate_bisect_kernel_scanSumBlocks(StringType &source, std::string const &numeric_string)
OpenCL function for performing scan to obtain number of eigenvalues before a specific block...
Definition: bisect.hpp:1925
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:613
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:189
void generate_bisect_kernel_ceilPow2(StringType &source, std::string const &numeric_string)
OpenCL function for computing the next higher power of two of n.
Definition: bisect.hpp:89
void generate_bisect_kernel_createIndicesCompactionShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:643
void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:348
void generate_bisect_kernel_computeMidpoint(StringType &source, std::string const &numeric_string)
OpenCL function for computing the midpoint of an interval [left, right] avoiding overflow if possible...
Definition: bisect.hpp:121
void generate_bisect_kernel_storeNonEmptyIntervals(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:435
void generate_bisect_kernel_config(StringType &source)
Definition: bisect.hpp:43
void generate_bisect_kernel_subdivideActiveIntervalShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:996
void generate_bisect_kernel_computeNumSmallerEigenvals(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:278
void generate_bisect_kernel_storeIntervalShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:214
void generate_bisect_kernel_scanInitial(StringType &source, std::string const &numeric_string)
Perform initial scan for compaction of intervals containing one and multiple eigenvalues; also do ini...
Definition: bisect.hpp:1998
void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType &source, std::string const &numeric_string)
OpenCL function for storing all non-empty intervals resulting from the subdivision of the interval cu...
Definition: bisect.hpp:506
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
void generate_bisect_kernel_storeInterval(StringType &source, std::string const &numeric_string)
OpenCL function for checking if interval converged and store appropriately.
Definition: bisect.hpp:164
Main kernel class for the generation of the bisection kernels and utilities.
Definition: bisect.hpp:2521
void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType &source, std::string const &numeric_string)
OpenCL kernel for determining eigenvalues for large matrices for intervals that after the first step ...
Definition: bisect.hpp:1528
void generate_bisect_kernel_storeIntervalConverged(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:798