28 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
29 #define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
// Generator fragment: appends the global configuration macros used by the
// bisection kernels to `source` as OpenCL preprocessor #define lines.
// NOTE(review): the generator's own signature line and braces are not among the
// visible lines; only the template header and the append statements are shown.
42 template <
typename StringType>
// MAX_THREADS_BLOCK = 128
46 source.append(
" #define MAX_THREADS_BLOCK 128\n");
// MAX_SMALL_MATRIX = 512
47 source.append(
" #define MAX_SMALL_MATRIX 512\n");
// MAX_THREADS_BLOCK_SMALL_MATRIX = 512
48 source.append(
" #define MAX_THREADS_BLOCK_SMALL_MATRIX 512\n");
// MIN_ABS_INTERVAL = 5.0e-37; the emitted storeInterval helpers use it as the
// lower bound of their interval-width threshold.
49 source.append(
" #define MIN_ABS_INTERVAL 5.0e-37\n");
// Generator fragment: appends the OpenCL helper `inline int floorPow2(int n)`.
// The emitted helper returns n unchanged when (n & (n-1)) == 0, otherwise
// 1 << (exp - 1), where exp is the binary exponent reported by frexp for n.
// `numeric_string` (declared outside the visible lines) is spliced in as the
// floating-point type n is cast to before calling frexp.
// NOTE(review): if numeric_string is "float", integers that are not exactly
// representable in single precision are rounded by the cast - confirm callers
// only pass small values.
57 template <
typename StringType>
61 source.append(
" inline int \n");
62 source.append(
" floorPow2(int n) \n");
63 source.append(
" { \n");
// NOTE(review): the five work-item id/size locals emitted below are not
// referenced anywhere else in the emitted floorPow2 body.
64 source.append(
" uint glb_id = get_global_id(0); \n");
65 source.append(
" uint grp_id = get_group_id(0); \n");
66 source.append(
" uint grp_nm = get_num_groups(0); \n");
67 source.append(
" uint lcl_id = get_local_id(0); \n");
68 source.append(
" uint lcl_sz = get_local_size(0); \n");
// Early return when n has at most one bit set (this test is also true for n == 0).
71 source.append(
" if (0 == (n & (n-1))) \n");
72 source.append(
" { \n");
73 source.append(
" return n; \n");
74 source.append(
" } \n");
// Only the exponent output of frexp is used; the mantissa return value is discarded.
76 source.append(
" int exp; \n");
77 source.append(
" frexp(( "); source.append(numeric_string); source.append(
" )n, &exp); \n");
// frexp yields a mantissa in [0.5, 1), so 2^(exp-1) <= n < 2^exp.
78 source.append(
" return (1 << (exp - 1)); \n");
79 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `inline int ceilPow2(int n)`.
// The emitted helper returns n unchanged when (n & (n-1)) == 0, otherwise
// 1 << exp, where exp is the binary exponent reported by frexp for n.
// `numeric_string` (declared outside the visible lines) is spliced in as the
// floating-point type n is cast to before calling frexp.
88 template <
typename StringType>
92 source.append(
" inline int \n");
93 source.append(
" ceilPow2(int n) \n");
94 source.append(
" { \n");
// NOTE(review): the five work-item id/size locals emitted below are not
// referenced anywhere else in the emitted ceilPow2 body.
95 source.append(
" uint glb_id = get_global_id(0); \n");
96 source.append(
" uint grp_id = get_group_id(0); \n");
97 source.append(
" uint grp_nm = get_num_groups(0); \n");
98 source.append(
" uint lcl_id = get_local_id(0); \n");
99 source.append(
" uint lcl_sz = get_local_size(0); \n");
// Early return when n has at most one bit set (this test is also true for n == 0).
103 source.append(
" if (0 == (n & (n-1))) \n");
104 source.append(
" { \n");
105 source.append(
" return n; \n");
106 source.append(
" } \n");
// Only the exponent output of frexp is used; the mantissa return value is discarded.
108 source.append(
" int exp; \n");
109 source.append(
" frexp(( "); source.append(numeric_string); source.append(
" )n, &exp); \n");
// frexp yields a mantissa in [0.5, 1), so n < 2^exp; 1 << exp is the result.
110 source.append(
" return (1 << exp); \n");
111 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper
//   inline T computeMidpoint(const T left, const T right)
// where T is the text of `numeric_string` (declared outside the visible lines).
// The emitted helper returns left + (right - left) * 0.5f when sign(left) equals
// sign(right), and (left + right) * 0.5f otherwise.
// NOTE(review): the two formulas presumably guard against overflow of
// (left + right) when both bounds have the same sign - confirm.
120 template <
typename StringType>
123 source.append(
" \n");
124 source.append(
" inline "); source.append(numeric_string); source.append(
" \n");
125 source.append(
" computeMidpoint(const "); source.append(numeric_string); source.append(
" left,\n");
126 source.append(
" const "); source.append(numeric_string); source.append(
" right) \n");
127 source.append(
" { \n");
// NOTE(review): the five work-item id/size locals emitted below are not
// referenced anywhere else in the emitted computeMidpoint body.
128 source.append(
" uint glb_id = get_global_id(0); \n");
129 source.append(
" uint grp_id = get_group_id(0); \n");
130 source.append(
" uint grp_nm = get_num_groups(0); \n");
131 source.append(
" uint lcl_id = get_local_id(0); \n");
132 source.append(
" uint lcl_sz = get_local_size(0); \n");
133 source.append(
" "); source.append(numeric_string); source.append(
" mid; \n");
// Same-sign case: offset from the left bound by half the interval width.
135 source.append(
" if (sign(left) == sign(right)) \n");
136 source.append(
" { \n");
137 source.append(
" mid = left + (right - left) * 0.5f; \n");
138 source.append(
" } \n");
// Differing-sign case: plain average of the two bounds.
139 source.append(
" else \n");
140 source.append(
" { \n");
141 source.append(
" mid = (left + right) * 0.5f; \n");
142 source.append(
" } \n");
144 source.append(
" return mid; \n");
145 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void storeInterval(...)`.
// The emitted helper writes one interval into slot `addr` of four __local arrays:
//   - s_left_count[addr] / s_right_count[addr] always receive left_count / right_count;
//   - if |right - left| <= max(MIN_ABS_INTERVAL, max(|left|, |right|) * precision),
//     both s_left[addr] and s_right[addr] receive computeMidpoint(left, right);
//   - otherwise s_left[addr] = left and s_right[addr] = right.
// `numeric_string` (declared outside the visible lines) supplies the scalar type.
// The emitted code relies on MIN_ABS_INTERVAL and computeMidpoint being defined
// earlier in the same OpenCL program.
163 template<
typename StringType>
166 source.append(
" \n");
167 source.append(
" void \n");
168 source.append(
" storeInterval(unsigned int addr, \n");
169 source.append(
" __local "); source.append(numeric_string); source.append(
" * s_left, \n");
170 source.append(
" __local "); source.append(numeric_string); source.append(
" * s_right, \n");
171 source.append(
" __local unsigned int * s_left_count, \n");
172 source.append(
" __local unsigned int * s_right_count, \n");
173 source.append(
" "); source.append(numeric_string); source.append(
" left, \n");
174 source.append(
" "); source.append(numeric_string); source.append(
" right, \n");
175 source.append(
" unsigned int left_count, \n");
176 source.append(
" unsigned int right_count, \n");
177 source.append(
" "); source.append(numeric_string); source.append(
" precision) \n");
178 source.append(
" { \n");
// NOTE(review): the five work-item id/size locals emitted below are not
// referenced anywhere else in the emitted storeInterval body.
179 source.append(
" uint glb_id = get_global_id(0); \n");
180 source.append(
" uint grp_id = get_group_id(0); \n");
181 source.append(
" uint grp_nm = get_num_groups(0); \n");
182 source.append(
" uint lcl_id = get_local_id(0); \n");
183 source.append(
" uint lcl_sz = get_local_size(0); \n");
// The counts are stored unconditionally, before the width test.
185 source.append(
" s_left_count[addr] = left_count; \n");
186 source.append(
" s_right_count[addr] = right_count; \n");
// t0: absolute interval width; t1: width threshold scaled by the larger bound magnitude.
189 source.append(
" "); source.append(numeric_string); source.append(
" t0 = fabs(right - left); \n");
190 source.append(
" "); source.append(numeric_string); source.append(
" t1 = max(fabs(left), fabs(right)) * precision; \n");
// The threshold never drops below MIN_ABS_INTERVAL.
192 source.append(
" if (t0 <= max(( "); source.append(numeric_string); source.append(
" )MIN_ABS_INTERVAL, t1)) \n");
193 source.append(
" { \n");
// Narrow enough: collapse the interval to a single value (left == right afterwards).
195 source.append(
" "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right); \n");
198 source.append(
" s_left[addr] = lambda; \n");
199 source.append(
" s_right[addr] = lambda; \n");
200 source.append(
" } \n");
201 source.append(
" else \n");
202 source.append(
" { \n");
// Still too wide: keep the bounds as given.
205 source.append(
" s_left[addr] = left; \n");
206 source.append(
" s_right[addr] = right; \n");
207 source.append(
" } \n");
209 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void storeIntervalShort(...)`.
// Same emitted logic as storeInterval, except that the count arrays are
// `__local unsigned short *`:
//   - s_left_count[addr] / s_right_count[addr] always receive left_count / right_count;
//   - if |right - left| <= max(MIN_ABS_INTERVAL, max(|left|, |right|) * precision),
//     both s_left[addr] and s_right[addr] receive computeMidpoint(left, right);
//   - otherwise s_left[addr] = left and s_right[addr] = right.
// NOTE(review): left_count / right_count are `unsigned int` parameters stored into
// `unsigned short` slots, so values above 65535 would be truncated - confirm the
// callers' value range.
213 template<
typename StringType>
216 source.append(
" \n");
217 source.append(
" void \n");
218 source.append(
" storeIntervalShort(unsigned int addr, \n");
219 source.append(
" __local "); source.append(numeric_string); source.append(
" * s_left, \n");
220 source.append(
" __local "); source.append(numeric_string); source.append(
" * s_right, \n");
221 source.append(
" __local unsigned short * s_left_count, \n");
222 source.append(
" __local unsigned short * s_right_count, \n");
223 source.append(
" "); source.append(numeric_string); source.append(
" left, \n");
224 source.append(
" "); source.append(numeric_string); source.append(
" right, \n");
225 source.append(
" unsigned int left_count, \n");
226 source.append(
" unsigned int right_count, \n");
227 source.append(
" "); source.append(numeric_string); source.append(
" precision) \n");
228 source.append(
" { \n");
// NOTE(review): the five work-item id/size locals emitted below are not
// referenced anywhere else in the emitted storeIntervalShort body.
229 source.append(
" uint glb_id = get_global_id(0); \n");
230 source.append(
" uint grp_id = get_group_id(0); \n");
231 source.append(
" uint grp_nm = get_num_groups(0); \n");
232 source.append(
" uint lcl_id = get_local_id(0); \n");
233 source.append(
" uint lcl_sz = get_local_size(0); \n");
// The counts are stored unconditionally (implicit unsigned int -> unsigned short conversion).
235 source.append(
" s_left_count[addr] = left_count; \n");
236 source.append(
" s_right_count[addr] = right_count; \n");
// t0: absolute interval width; t1: width threshold scaled by the larger bound magnitude.
239 source.append(
" "); source.append(numeric_string); source.append(
" t0 = fabs(right - left); \n");
240 source.append(
" "); source.append(numeric_string); source.append(
" t1 = max(fabs(left), fabs(right)) * precision; \n");
// The threshold never drops below MIN_ABS_INTERVAL.
242 source.append(
" if (t0 <= max(( "); source.append(numeric_string); source.append(
" )MIN_ABS_INTERVAL, t1)) \n");
243 source.append(
" { \n");
// Narrow enough: collapse the interval to a single value.
245 source.append(
" "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right); \n");
248 source.append(
" s_left[addr] = lambda; \n");
249 source.append(
" s_right[addr] = lambda; \n");
250 source.append(
" } \n");
251 source.append(
" else \n");
252 source.append(
" { \n");
// Still too wide: keep the bounds as given.
255 source.append(
" s_left[addr] = left; \n");
256 source.append(
" s_right[addr] = right; \n");
257 source.append(
" } \n");
259 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper
//   inline unsigned int computeNumSmallerEigenvals(g_d, g_s, n, x, tid,
//                                                  num_intervals_active, s_d, s_s, converged)
// The emitted helper
//   1. copies g_d[lcl_id] and g_s[lcl_id - 1] into the __local arrays s_d / s_s for
//      every work-item with lcl_id < n, bracketed by two local-memory barriers;
//   2. for work-items with tid < num_intervals_active and converged == 0, iterates
//      delta = s_d[k] - x - s_s[k]^2 / delta for k = 0..n-1 (delta starts at 1.0f)
//      and counts how many times delta < 0;
//   3. returns that count (0 for work-items that skipped step 2).
// NOTE(review): presumably g_d / g_s are the diagonal / off-diagonal of a symmetric
// tridiagonal matrix and the count is the number of eigenvalues below x - the
// helper name suggests this; confirm against the caller.
// NOTE(review): the emitted body calls barrier() unconditionally, so every
// work-item of the work-group has to reach this helper.
277 template <
typename StringType>
280 source.append(
" \n");
281 source.append(
" inline unsigned int \n");
282 source.append(
" computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
283 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
284 source.append(
" const unsigned int n, \n");
285 source.append(
" const "); source.append(numeric_string); source.append(
" x, \n");
286 source.append(
" const unsigned int tid, \n");
287 source.append(
" const unsigned int num_intervals_active, \n");
288 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_d, \n");
289 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_s, \n");
290 source.append(
" unsigned int converged \n");
291 source.append(
" ) \n");
292 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
293 source.append(
" uint glb_id = get_global_id(0); \n");
294 source.append(
" uint grp_id = get_group_id(0); \n");
295 source.append(
" uint grp_nm = get_num_groups(0); \n");
296 source.append(
" uint lcl_id = get_local_id(0); \n");
297 source.append(
" uint lcl_sz = get_local_size(0); \n");
300 source.append(
" "); source.append(numeric_string); source.append(
" delta = 1.0f; \n");
301 source.append(
" unsigned int count = 0; \n");
// Barrier before s_d / s_s are overwritten.
303 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// One element per work-item is staged into local memory.
306 source.append(
" if (lcl_id < n) \n");
307 source.append(
" { \n");
308 source.append(
" s_d[lcl_id] = *(g_d + lcl_id); \n");
// NOTE(review): for lcl_id == 0 this reads g_s[-1]; presumably the caller passes a
// pointer offset by one element so the access stays in bounds - confirm.
309 source.append(
" s_s[lcl_id] = *(g_s + lcl_id - 1); \n");
310 source.append(
" } \n");
// Barrier so the staged data is visible to all work-items before it is read.
312 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
315 source.append(
" if ((tid < num_intervals_active) && (0 == converged)) \n");
316 source.append(
" { \n");
// Sequential recurrence over all n staged entries; each step depends on the previous delta.
320 source.append(
" for (unsigned int k = 0; k < n; ++k) \n");
321 source.append(
" { \n");
322 source.append(
" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
323 source.append(
" count += (delta < 0) ? 1 : 0; \n");
324 source.append(
" } \n");
326 source.append(
" } \n");
328 source.append(
" return count; \n");
329 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper
//   inline unsigned int computeNumSmallerEigenvalsLarge(g_d, g_s, n, x, tid,
//                                                       num_intervals_active, s_d, s_s, converged)
// The emitted helper processes the n input entries in chunks of lcl_sz
// (= get_local_size(0)): each chunk is staged into the __local arrays s_d / s_s
// between two barriers, then work-items with tid < num_intervals_active continue the
// recurrence delta = s_d[k] - x - s_s[k]^2 / delta over the chunk and count how
// often delta < 0. The accumulated count is returned.
// NOTE(review): the `converged` parameter is not referenced in the emitted body.
// NOTE(review): barrier() is called inside the chunk loop, whose trip count depends
// only on n and lcl_sz, so every work-item of the work-group has to reach this helper.
347 template <
typename StringType>
350 source.append(
" \n");
351 source.append(
" inline unsigned int \n");
352 source.append(
" computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
353 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
354 source.append(
" const unsigned int n, \n");
355 source.append(
" const "); source.append(numeric_string); source.append(
" x, \n");
356 source.append(
" const unsigned int tid, \n");
357 source.append(
" const unsigned int num_intervals_active, \n");
358 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_d, \n");
359 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_s, \n");
360 source.append(
" unsigned int converged \n");
361 source.append(
" ) \n");
362 source.append(
" { \n");
// Of the five locals emitted below, lcl_id and lcl_sz are used later in this helper.
363 source.append(
" uint glb_id = get_global_id(0); \n");
364 source.append(
" uint grp_id = get_group_id(0); \n");
365 source.append(
" uint grp_nm = get_num_groups(0); \n");
366 source.append(
" uint lcl_id = get_local_id(0); \n");
367 source.append(
" uint lcl_sz = get_local_size(0); \n");
369 source.append(
" "); source.append(numeric_string); source.append(
" delta = 1.0f; \n");
370 source.append(
" unsigned int count = 0; \n");
// rem: number of entries not yet consumed by the recurrence.
372 source.append(
" unsigned int rem = n; \n");
// One iteration per chunk of lcl_sz entries.
375 source.append(
" for (unsigned int i = 0; i < n; i += lcl_sz) \n");
376 source.append(
" { \n");
// Barrier before the previous chunk in s_d / s_s is overwritten.
378 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
381 source.append(
" if ((i + lcl_id) < n) \n");
382 source.append(
" { \n");
384 source.append(
" s_d[lcl_id] = *(g_d + i + lcl_id); \n");
// NOTE(review): for i == 0 and lcl_id == 0 this reads g_s[-1]; presumably the caller
// passes a pointer offset by one element - confirm.
385 source.append(
" s_s[lcl_id] = *(g_s + i + lcl_id - 1); \n");
386 source.append(
" } \n");
// Barrier so the staged chunk is visible to all work-items before it is read.
388 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
391 source.append(
" if (tid < num_intervals_active) \n");
392 source.append(
" { \n");
// Only the valid part of the chunk is consumed: min(rem, lcl_sz) entries.
396 source.append(
" for (unsigned int k = 0; k < min(rem,lcl_sz); ++k) \n");
397 source.append(
" { \n");
398 source.append(
" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
400 source.append(
" count += (delta < 0) ? 1 : 0; \n");
401 source.append(
" } \n");
403 source.append(
" } \n");
// On the last chunk rem may be smaller than lcl_sz, in which case this unsigned
// subtraction wraps; the loop condition (i < n) then ends the loop before rem is read again.
405 source.append(
" rem -= lcl_sz; \n");
406 source.append(
" } \n");
408 source.append(
" return count; \n");
409 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void storeNonEmptyIntervals(...)`.
// Given an interval [left, right] split at mid, with counts left_count, mid_count and
// right_count, the emitted helper
//   - when left_count != mid_count AND mid_count != right_count: stores [left, mid]
//     at `addr` via storeInterval, sets *is_active_second = 1,
//     s_compaction_list_exc[lcl_id] = 1 and *compact_second_chunk = 1 (the second
//     half [mid, right] is not stored by this helper);
//   - otherwise: sets *is_active_second = 0 and s_compaction_list_exc[lcl_id] = 0,
//     then stores [left, mid] if left_count != mid_count, else [mid, right].
// NOTE(review): the `num_threads_active` parameter is not referenced in the emitted body.
434 template<
typename StringType>
437 source.append(
" \n");
438 source.append(
" void \n");
439 source.append(
" storeNonEmptyIntervals(unsigned int addr, \n");
440 source.append(
" const unsigned int num_threads_active, \n");
441 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
442 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
443 source.append(
" __local unsigned int *s_left_count, \n");
444 source.append(
" __local unsigned int *s_right_count, \n");
445 source.append(
" "); source.append(numeric_string); source.append(
" left, \n ");
446 source.append(
" "); source.append(numeric_string); source.append(
" mid, \n");
447 source.append(
" "); source.append(numeric_string); source.append(
" right,\n");
448 source.append(
" const unsigned int left_count, \n");
449 source.append(
" const unsigned int mid_count, \n");
450 source.append(
" const unsigned int right_count, \n");
451 source.append(
" "); source.append(numeric_string); source.append(
" precision, \n");
452 source.append(
" __local unsigned int *compact_second_chunk, \n");
453 source.append(
" __local unsigned int *s_compaction_list_exc, \n");
454 source.append(
" unsigned int *is_active_second) \n");
455 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
456 source.append(
" uint glb_id = get_global_id(0); \n");
457 source.append(
" uint grp_id = get_group_id(0); \n");
458 source.append(
" uint grp_nm = get_num_groups(0); \n");
459 source.append(
" uint lcl_id = get_local_id(0); \n");
460 source.append(
" uint lcl_sz = get_local_size(0); \n");
463 source.append(
" \n");
// Both halves have differing end counts: keep the first half here and flag the second.
464 source.append(
" if ((left_count != mid_count) && (mid_count != right_count)) \n");
465 source.append(
" { \n");
468 source.append(
" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
469 source.append(
" left, mid, left_count, mid_count, precision); \n");
// compact_second_chunk is a __local flag; any work-item taking this branch writes 1 to it.
473 source.append(
" *is_active_second = 1; \n");
474 source.append(
" s_compaction_list_exc[lcl_id] = 1; \n");
475 source.append(
" *compact_second_chunk = 1; \n");
476 source.append(
" } \n");
477 source.append(
" else \n");
478 source.append(
" { \n");
// At most one half has differing end counts: no second interval for this work-item.
483 source.append(
" *is_active_second = 0; \n");
484 source.append(
" s_compaction_list_exc[lcl_id] = 0; \n");
487 source.append(
" if (left_count != mid_count) \n");
488 source.append(
" { \n");
489 source.append(
" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
490 source.append(
" left, mid, left_count, mid_count, precision); \n");
491 source.append(
" } \n");
492 source.append(
" else \n");
493 source.append(
" { \n");
494 source.append(
" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
495 source.append(
" mid, right, mid_count, right_count, precision); \n");
496 source.append(
" } \n");
498 source.append(
" } \n");
499 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void storeNonEmptyIntervalsLarge(...)`.
// Same emitted decision logic as storeNonEmptyIntervals, but the count arrays and
// the compaction list are `__local unsigned short *`, intervals are written through
// storeIntervalShort, and the tolerance parameter is named `epsilon`:
//   - when left_count != mid_count AND mid_count != right_count: stores [left, mid]
//     at `addr`, sets *is_active_second = 1, s_compaction_list[lcl_id] = 1 and
//     *compact_second_chunk = 1;
//   - otherwise: sets *is_active_second = 0 and s_compaction_list[lcl_id] = 0, then
//     stores [left, mid] if left_count != mid_count, else [mid, right].
// NOTE(review): the `num_threads_active` parameter is not referenced in the emitted body.
505 template <
typename StringType>
508 source.append(
" \n");
509 source.append(
" void \n");
510 source.append(
" storeNonEmptyIntervalsLarge(unsigned int addr, \n");
511 source.append(
" const unsigned int num_threads_active, \n");
512 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
513 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
514 source.append(
" __local unsigned short *s_left_count, \n");
515 source.append(
" __local unsigned short *s_right_count, \n");
516 source.append(
" "); source.append(numeric_string); source.append(
" left, \n ");
517 source.append(
" "); source.append(numeric_string); source.append(
" mid, \n");
518 source.append(
" "); source.append(numeric_string); source.append(
" right,\n");
519 source.append(
" const unsigned int left_count, \n");
520 source.append(
" const unsigned int mid_count, \n");
521 source.append(
" const unsigned int right_count, \n");
522 source.append(
" "); source.append(numeric_string); source.append(
" epsilon, \n");
523 source.append(
" __local unsigned int *compact_second_chunk, \n");
524 source.append(
" __local unsigned short *s_compaction_list, \n");
525 source.append(
" unsigned int *is_active_second) \n");
526 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
527 source.append(
" uint glb_id = get_global_id(0); \n");
528 source.append(
" uint grp_id = get_group_id(0); \n");
529 source.append(
" uint grp_nm = get_num_groups(0); \n");
530 source.append(
" uint lcl_id = get_local_id(0); \n");
531 source.append(
" uint lcl_sz = get_local_size(0); \n");
// Both halves have differing end counts: keep the first half here and flag the second.
534 source.append(
" if ((left_count != mid_count) && (mid_count != right_count)) \n");
535 source.append(
" { \n");
537 source.append(
" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
538 source.append(
" left, mid, left_count, mid_count, epsilon); \n");
540 source.append(
" *is_active_second = 1; \n");
541 source.append(
" s_compaction_list[lcl_id] = 1; \n");
542 source.append(
" *compact_second_chunk = 1; \n");
543 source.append(
" } \n");
544 source.append(
" else \n");
545 source.append(
" { \n");
// At most one half has differing end counts: no second interval for this work-item.
550 source.append(
" *is_active_second = 0; \n");
551 source.append(
" s_compaction_list[lcl_id] = 0; \n");
554 source.append(
" if (left_count != mid_count) \n");
555 source.append(
" { \n");
556 source.append(
" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
557 source.append(
" left, mid, left_count, mid_count, epsilon); \n");
558 source.append(
" } \n");
559 source.append(
" else \n");
560 source.append(
" { \n");
561 source.append(
" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
562 source.append(
" mid, right, mid_count, right_count, epsilon); \n");
563 source.append(
" } \n");
564 source.append(
" } \n");
565 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper
//   void createIndicesCompaction(__local unsigned int *s_compaction_list_exc,
//                                unsigned int num_threads_compaction)
// The emitted helper updates s_compaction_list_exc in place in two barrier-separated
// phases: a first loop that halves d from num_threads_compaction/2 down to 1 while
// doubling `offset`, and a second loop that doubles d from 2 while halving `offset`;
// in both, active work-items add element [ai] into element [bi].
// NOTE(review): this looks like the two sweeps of a work-group-wide prefix sum over
// the 0/1 flags written by the store* helpers - confirm against the calling kernel.
// NOTE(review): barrier() is called inside both loops, so every work-item of the
// work-group has to reach this helper with the same num_threads_compaction.
578 template<
typename StringType>
// numeric_string is not needed for this helper; the cast only marks it as used.
581 (void)numeric_string;
582 source.append(
" \n");
583 source.append(
" void \n");
584 source.append(
" createIndicesCompaction(__local unsigned int *s_compaction_list_exc, \n");
585 source.append(
" unsigned int num_threads_compaction) \n");
586 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
587 source.append(
" uint glb_id = get_global_id(0); \n");
588 source.append(
" uint grp_id = get_group_id(0); \n");
589 source.append(
" uint grp_nm = get_num_groups(0); \n");
590 source.append(
" uint lcl_id = get_local_id(0); \n");
591 source.append(
" uint lcl_sz = get_local_size(0); \n");
594 source.append(
" unsigned int offset = 1; \n");
595 source.append(
" const unsigned int tid = lcl_id; \n");
// Phase 1: d work-items are active per step; the stride `offset` doubles each step.
600 source.append(
" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
601 source.append(
" { \n");
603 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
605 source.append(
" if (tid < d) \n");
606 source.append(
" { \n");
// ai / bi: last elements of two adjacent spans of length `offset`.
608 source.append(
" unsigned int ai = offset*(2*tid+1)-1; \n");
609 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
610 source.append(
" \n");
611 source.append(
" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
612 source.append(
" + s_compaction_list_exc[ai]; \n");
613 source.append(
" } \n");
615 source.append(
" offset <<= 1; \n");
616 source.append(
" } \n");
// Phase 2: d - 1 work-items are active per step; `offset` is halved before each step.
619 source.append(
" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
620 source.append(
" { \n");
622 source.append(
" offset >>= 1; \n");
623 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
625 source.append(
" if (tid < (d-1)) \n");
626 source.append(
" { \n");
// bi lies half a span (offset / 2) past ai.
628 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
629 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
631 source.append(
" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
632 source.append(
" + s_compaction_list_exc[ai]; \n");
633 source.append(
" } \n");
634 source.append(
" } \n");
// Final barrier so the finished list is visible to all work-items on return.
636 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
638 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper
//   void createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc,
//                                     unsigned int num_threads_compaction)
// Same emitted two-phase, barrier-separated in-place summation as
// createIndicesCompaction, but operating on an `unsigned short` list.
// NOTE(review): this looks like the two sweeps of a work-group-wide prefix sum -
// confirm against the calling kernel.
// NOTE(review): barrier() is called inside both loops, so every work-item of the
// work-group has to reach this helper with the same num_threads_compaction.
642 template<
typename StringType>
// numeric_string is not needed for this helper; the cast only marks it as used.
645 (void)numeric_string;
646 source.append(
" \n");
647 source.append(
" void \n");
648 source.append(
" createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc, \n");
649 source.append(
" unsigned int num_threads_compaction) \n");
650 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
651 source.append(
" uint glb_id = get_global_id(0); \n");
652 source.append(
" uint grp_id = get_group_id(0); \n");
653 source.append(
" uint grp_nm = get_num_groups(0); \n");
654 source.append(
" uint lcl_id = get_local_id(0); \n");
655 source.append(
" uint lcl_sz = get_local_size(0); \n");
658 source.append(
" unsigned int offset = 1; \n");
659 source.append(
" const unsigned int tid = lcl_id; \n");
// Phase 1: d work-items are active per step; the stride `offset` doubles each step.
662 source.append(
" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
663 source.append(
" { \n");
665 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
667 source.append(
" if (tid < d) \n");
668 source.append(
" { \n");
// ai / bi: last elements of two adjacent spans of length `offset`.
670 source.append(
" unsigned int ai = offset*(2*tid+1)-1; \n");
671 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
672 source.append(
" \n");
673 source.append(
" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
674 source.append(
" + s_compaction_list_exc[ai]; \n");
675 source.append(
" } \n");
677 source.append(
" offset <<= 1; \n");
678 source.append(
" } \n");
// Phase 2: d - 1 work-items are active per step; `offset` is halved before each step.
681 source.append(
" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
682 source.append(
" { \n");
684 source.append(
" offset >>= 1; \n");
685 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
687 source.append(
" if (tid < (d-1)) \n");
688 source.append(
" { \n");
// bi lies half a span (offset / 2) past ai.
690 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
691 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
693 source.append(
" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
694 source.append(
" + s_compaction_list_exc[ai]; \n");
695 source.append(
" } \n");
696 source.append(
" } \n");
// Final barrier so the finished list is visible to all work-items on return.
698 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
700 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void compactIntervals(...)`.
// For a work-item with lcl_id < num_threads_active and is_active_second == 1, the
// emitted helper writes the interval [mid, right] with counts
// (mid_count, right_count) into the __local arrays at index
//   addr_w = num_threads_active + s_compaction_list[lcl_id].
// Other work-items do nothing.
// NOTE(review): s_compaction_list[tid] is presumably the slot offset produced by
// createIndicesCompaction - confirm against the calling kernel.
718 template<
typename StringType>
721 source.append(
" \n");
722 source.append(
" void \n");
723 source.append(
" compactIntervals(__local "); source.append(numeric_string); source.append(
" *s_left, \n");
724 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
725 source.append(
" __local unsigned int *s_left_count, \n");
726 source.append(
" __local unsigned int *s_right_count, \n");
727 source.append(
" "); source.append(numeric_string); source.append(
" mid, \n");
728 source.append(
" "); source.append(numeric_string); source.append(
" right, \n");
729 source.append(
" unsigned int mid_count, unsigned int right_count, \n");
730 source.append(
" __local unsigned int *s_compaction_list, \n");
731 source.append(
" unsigned int num_threads_active, \n");
732 source.append(
" unsigned int is_active_second) \n");
733 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
734 source.append(
" uint glb_id = get_global_id(0); \n");
735 source.append(
" uint grp_id = get_group_id(0); \n");
736 source.append(
" uint grp_nm = get_num_groups(0); \n");
737 source.append(
" uint lcl_id = get_local_id(0); \n");
738 source.append(
" uint lcl_sz = get_local_size(0); \n");
740 source.append(
" const unsigned int tid = lcl_id; \n");
744 source.append(
" if ((tid < num_threads_active) && (1 == is_active_second)) \n");
745 source.append(
" { \n");
// Second-half intervals are placed after the first num_threads_active slots.
746 source.append(
" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
747 source.append(
" s_left[addr_w] = mid; \n");
748 source.append(
" s_right[addr_w] = right; \n");
749 source.append(
" s_left_count[addr_w] = mid_count; \n");
750 source.append(
" s_right_count[addr_w] = right_count; \n");
751 source.append(
" } \n");
752 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void compactIntervalsShort(...)`.
// Same emitted logic as compactIntervals, with `unsigned short` count arrays and
// compaction list: a work-item with lcl_id < num_threads_active and
// is_active_second == 1 writes [mid, right] with counts (mid_count, right_count) at
//   addr_w = num_threads_active + s_compaction_list[lcl_id].
// NOTE(review): mid_count / right_count are `unsigned int` values stored into
// `unsigned short` slots - confirm the callers' value range.
758 template<
typename StringType>
761 source.append(
" \n");
762 source.append(
" void \n");
763 source.append(
" compactIntervalsShort(__local "); source.append(numeric_string); source.append(
" *s_left, \n");
764 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
765 source.append(
" __local unsigned short *s_left_count, \n");
766 source.append(
" __local unsigned short *s_right_count, \n");
767 source.append(
" "); source.append(numeric_string); source.append(
" mid, \n");
768 source.append(
" "); source.append(numeric_string); source.append(
" right, \n");
769 source.append(
" unsigned int mid_count, unsigned int right_count, \n");
770 source.append(
" __local unsigned short *s_compaction_list, \n");
771 source.append(
" unsigned int num_threads_active, \n");
772 source.append(
" unsigned int is_active_second) \n");
773 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
774 source.append(
" uint glb_id = get_global_id(0); \n");
775 source.append(
" uint grp_id = get_group_id(0); \n");
776 source.append(
" uint grp_nm = get_num_groups(0); \n");
777 source.append(
" uint lcl_id = get_local_id(0); \n");
778 source.append(
" uint lcl_sz = get_local_size(0); \n");
780 source.append(
" const unsigned int tid = lcl_id; \n");
784 source.append(
" if ((tid < num_threads_active) && (1 == is_active_second)) \n");
785 source.append(
" { \n");
// Second-half intervals are placed after the first num_threads_active slots.
786 source.append(
" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
787 source.append(
" s_left[addr_w] = mid; \n");
788 source.append(
" s_right[addr_w] = right; \n");
789 source.append(
" s_left_count[addr_w] = mid_count; \n");
790 source.append(
" s_right_count[addr_w] = right_count; \n");
791 source.append(
" } \n");
792 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void storeIntervalConverged(...)`.
// The emitted helper computes multiplicity = *right_count - *left_count and writes
// into slot lcl_id of the __local arrays:
//   - multiplicity == 1: stores (*left, *right, *left_count, *right_count) unchanged
//     and sets *is_active_second = 0, s_compaction_list_exc[lcl_id] = 0;
//   - otherwise: sets *mid_count = *left_count + multiplicity / 2, stores
//     (*left, *right, *left_count, *mid_count), sets *mid = *left, and flags a second
//     interval (*is_active_second = 1, s_compaction_list_exc[lcl_id] = 1,
//     *compact_second_chunk = 1).
// NOTE(review): the `num_threads_active` parameter is not referenced in the emitted
// body, and *mid / *mid_count are only written in the multiplicity != 1 branch.
797 template<
typename StringType>
800 source.append(
" \n");
801 source.append(
" void \n");
802 source.append(
" storeIntervalConverged( __local "); source.append(numeric_string); source.append(
" *s_left, \n");
803 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
804 source.append(
" __local unsigned int *s_left_count, \n");
805 source.append(
" __local unsigned int *s_right_count, \n");
806 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
807 source.append(
" "); source.append(numeric_string); source.append(
" *mid, \n");
808 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
809 source.append(
" unsigned int *left_count, \n");
810 source.append(
" unsigned int *mid_count, \n");
811 source.append(
" unsigned int *right_count, \n");
812 source.append(
" __local unsigned int *s_compaction_list_exc, \n");
813 source.append(
" __local unsigned int *compact_second_chunk, \n");
814 source.append(
" const unsigned int num_threads_active, \n");
815 source.append(
" unsigned int *is_active_second) \n");
816 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
817 source.append(
" uint glb_id = get_global_id(0); \n");
818 source.append(
" uint grp_id = get_group_id(0); \n");
819 source.append(
" uint grp_nm = get_num_groups(0); \n");
820 source.append(
" uint lcl_id = get_local_id(0); \n");
821 source.append(
" uint lcl_sz = get_local_size(0); \n");
823 source.append(
" const unsigned int tid = lcl_id; \n");
// Difference of the two end counts of the interval.
824 source.append(
" const unsigned int multiplicity = *right_count - *left_count; \n");
826 source.append(
" if (1 == multiplicity) \n");
827 source.append(
" { \n");
// Count difference of exactly one: store as-is, no second interval.
830 source.append(
" s_left[tid] = *left; \n");
831 source.append(
" s_right[tid] = *right; \n");
832 source.append(
" s_left_count[tid] = *left_count; \n");
833 source.append(
" s_right_count[tid] = *right_count; \n");
834 source.append(
" \n");
837 source.append(
" *is_active_second = 0; \n");
838 source.append(
" s_compaction_list_exc[tid] = 0; \n");
839 source.append(
" } \n");
840 source.append(
" else \n");
841 source.append(
" { \n");
// Split the count range in half; the first half keeps the same bounds.
844 source.append(
" *mid_count = *left_count + (multiplicity >> 1); \n");
847 source.append(
" s_left[tid] = *left; \n");
848 source.append(
" s_right[tid] = *right; \n");
849 source.append(
" s_left_count[tid] = *left_count; \n");
850 source.append(
" s_right_count[tid] = *mid_count; \n");
851 source.append(
" *mid = *left; \n");
// Flag the second half so a later compaction step can store it.
854 source.append(
" *is_active_second = 1; \n");
855 source.append(
" s_compaction_list_exc[tid] = 1; \n");
856 source.append(
" *compact_second_chunk = 1; \n");
857 source.append(
" } \n");
858 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void storeIntervalConvergedShort(...)`.
// Same emitted logic as storeIntervalConverged, with `unsigned short` count arrays
// and compaction list. With multiplicity = *right_count - *left_count:
//   - multiplicity == 1: stores (*left, *right, *left_count, *right_count) in slot
//     lcl_id and sets *is_active_second = 0, s_compaction_list_exc[lcl_id] = 0;
//   - otherwise: sets *mid_count = *left_count + multiplicity / 2, stores
//     (*left, *right, *left_count, *mid_count), sets *mid = *left, and flags a second
//     interval (*is_active_second = 1, s_compaction_list_exc[lcl_id] = 1,
//     *compact_second_chunk = 1).
// NOTE(review): the `num_threads_active` parameter is not referenced in the emitted
// body; the `unsigned int` counts are narrowed when stored into `unsigned short` slots.
865 template<
typename StringType>
868 source.append(
" \n");
869 source.append(
" void \n");
870 source.append(
" storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(
" *s_left, \n");
871 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
872 source.append(
" __local unsigned short *s_left_count, \n");
873 source.append(
" __local unsigned short *s_right_count, \n");
874 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
875 source.append(
" "); source.append(numeric_string); source.append(
" *mid, \n");
876 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
877 source.append(
" unsigned int *left_count, \n");
878 source.append(
" unsigned int *mid_count, \n");
879 source.append(
" unsigned int *right_count, \n");
880 source.append(
" __local unsigned short *s_compaction_list_exc, \n");
881 source.append(
" __local unsigned int *compact_second_chunk, \n");
882 source.append(
" const unsigned int num_threads_active, \n");
883 source.append(
" unsigned int *is_active_second) \n");
884 source.append(
" { \n");
// Of the five locals emitted below, only lcl_id is used later in this helper.
885 source.append(
" uint glb_id = get_global_id(0); \n");
886 source.append(
" uint grp_id = get_group_id(0); \n");
887 source.append(
" uint grp_nm = get_num_groups(0); \n");
888 source.append(
" uint lcl_id = get_local_id(0); \n");
889 source.append(
" uint lcl_sz = get_local_size(0); \n");
891 source.append(
" const unsigned int tid = lcl_id; \n");
// Difference of the two end counts of the interval.
892 source.append(
" const unsigned int multiplicity = *right_count - *left_count; \n");
894 source.append(
" if (1 == multiplicity) \n");
895 source.append(
" { \n");
// Count difference of exactly one: store as-is, no second interval.
898 source.append(
" s_left[tid] = *left; \n");
899 source.append(
" s_right[tid] = *right; \n");
900 source.append(
" s_left_count[tid] = *left_count; \n");
901 source.append(
" s_right_count[tid] = *right_count; \n");
902 source.append(
" \n");
905 source.append(
" *is_active_second = 0; \n");
906 source.append(
" s_compaction_list_exc[tid] = 0; \n");
907 source.append(
" } \n");
908 source.append(
" else \n");
909 source.append(
" { \n");
// Split the count range in half; the first half keeps the same bounds.
912 source.append(
" *mid_count = *left_count + (multiplicity >> 1); \n");
915 source.append(
" s_left[tid] = *left; \n");
916 source.append(
" s_right[tid] = *right; \n");
917 source.append(
" s_left_count[tid] = *left_count; \n");
918 source.append(
" s_right_count[tid] = *mid_count; \n");
919 source.append(
" *mid = *left; \n");
// Flag the second half so a later compaction step can store it.
922 source.append(
" *is_active_second = 1; \n");
923 source.append(
" s_compaction_list_exc[tid] = 1; \n");
924 source.append(
" *compact_second_chunk = 1; \n");
925 source.append(
" } \n");
926 source.append(
" } \n");
// Generator fragment: appends the OpenCL helper `void subdivideActiveInterval(...)`.
// For a work-item with tid < num_threads_active, the emitted helper loads its
// interval (*left, *right, *left_count, *right_count) from slot `tid` of the __local
// arrays, then:
//   - if *left != *right: sets *mid = computeMidpoint(*left, *right) and clears
//     *all_threads_converged;
//   - else if *right_count - *left_count > 1: clears *all_threads_converged without
//     touching *mid.
// Work-items with tid >= num_threads_active leave all outputs untouched.
// NOTE(review): the helper only ever writes 0 to *all_threads_converged, so the
// caller presumably initializes it to 1 beforehand - confirm.
943 template<
typename StringType>
946 source.append(
" \n");
947 source.append(
" void \n");
948 source.append(
" subdivideActiveInterval(const unsigned int tid, \n");
949 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
950 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
951 source.append(
" __local unsigned int *s_left_count, \n");
952 source.append(
" __local unsigned int *s_right_count, \n");
953 source.append(
" const unsigned int num_threads_active, \n");
954 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
955 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
956 source.append(
" unsigned int *left_count, unsigned int *right_count, \n");
957 source.append(
" "); source.append(numeric_string); source.append(
" *mid, \n");
958 source.append(
" __local unsigned int *all_threads_converged) \n");
959 source.append(
" { \n");
// NOTE(review): the five work-item id/size locals emitted below are not referenced
// anywhere else in the emitted body (tid is a parameter here).
960 source.append(
" uint glb_id = get_global_id(0); \n");
961 source.append(
" uint grp_id = get_group_id(0); \n");
962 source.append(
" uint grp_nm = get_num_groups(0); \n");
963 source.append(
" uint lcl_id = get_local_id(0); \n");
964 source.append(
" uint lcl_sz = get_local_size(0); \n");
967 source.append(
" if (tid < num_threads_active) \n");
968 source.append(
" { \n");
// Load this work-item's interval from local memory into the caller's variables.
970 source.append(
" *left = s_left[tid]; \n");
971 source.append(
" *right = s_right[tid]; \n");
972 source.append(
" *left_count = s_left_count[tid]; \n");
973 source.append(
" *right_count = s_right_count[tid]; \n");
// Bounds still differ: the interval needs another split.
976 source.append(
" if (*left != *right) \n");
977 source.append(
" { \n");
979 source.append(
" *mid = computeMidpoint(*left, *right); \n");
980 source.append(
" *all_threads_converged = 0; \n");
981 source.append(
" } \n");
// Bounds are equal but the count difference exceeds one: more work remains.
982 source.append(
" else if ((*right_count - *left_count) > 1) \n");
983 source.append(
" { \n");
986 source.append(
" *all_threads_converged = 0; \n");
987 source.append(
" } \n");
989 source.append(
" } \n");
991 source.append(
" } \n");
// Appends to `source` the OpenCL text of the helper `subdivideActiveIntervalShort`.
// `numeric_string` is spliced in wherever the emitted code needs its scalar type name.
// NOTE(review): the C++ function header and braces wrapping these statements are not
// visible in this listing, and every statement carries a leading integer that looks like
// a listing line number -- confirm against the real header before building.
//
// The emitted body is the same as `subdivideActiveInterval` except that s_left_count and
// s_right_count are `__local unsigned short *`; the values read from them are stored
// through `unsigned int *` outputs.
// For work-items with tid < num_threads_active it:
//   - copies s_left / s_right / s_left_count / s_right_count [tid] into
//     *left / *right / *left_count / *right_count;
//   - if *left != *right: stores computeMidpoint(*left, *right) in *mid and clears
//     *all_threads_converged;
//   - else if (*right_count - *left_count) > 1: only clears *all_threads_converged.
995 template<
typename StringType>
998 source.append(
" \n");
999 source.append(
" void \n");
1000 source.append(
" subdivideActiveIntervalShort(const unsigned int tid, \n");
1001 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
1002 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
1003 source.append(
" __local unsigned short *s_left_count, \n");
1004 source.append(
" __local unsigned short *s_right_count, \n");
1005 source.append(
" const unsigned int num_threads_active, \n");
1006 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
1007 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
1008 source.append(
" unsigned int *left_count, unsigned int *right_count, \n");
1009 source.append(
" "); source.append(numeric_string); source.append(
" *mid, \n");
1010 source.append(
" __local unsigned int *all_threads_converged) \n");
1011 source.append(
" { \n");
// The five work-item id/size variables below are declared in the emitted helper but are
// not referenced anywhere else in its body.
1012 source.append(
" uint glb_id = get_global_id(0); \n");
1013 source.append(
" uint grp_id = get_group_id(0); \n");
1014 source.append(
" uint grp_nm = get_num_groups(0); \n");
1015 source.append(
" uint lcl_id = get_local_id(0); \n");
1016 source.append(
" uint lcl_sz = get_local_size(0); \n");
// Only the first num_threads_active work-items own an interval.
1019 source.append(
" if (tid < num_threads_active) \n");
1020 source.append(
" { \n");
// Load this work-item's interval; the unsigned short counts widen to unsigned int here.
1022 source.append(
" *left = s_left[tid]; \n");
1023 source.append(
" *right = s_right[tid]; \n");
1024 source.append(
" *left_count = s_left_count[tid]; \n");
1025 source.append(
" *right_count = s_right_count[tid]; \n");
// Bounds differ: compute a midpoint and flag that work remains.
1028 source.append(
" if (*left != *right) \n");
1029 source.append(
" { \n");
1031 source.append(
" *mid = computeMidpoint(*left, *right); \n");
1032 source.append(
" *all_threads_converged = 0; \n");
1033 source.append(
" } \n");
// Bounds equal but the counts differ by more than one: still flag that work remains;
// *mid is left unchanged on this path.
1034 source.append(
" else if ((*right_count - *left_count) > 1) \n");
1035 source.append(
" { \n");
1038 source.append(
" *all_threads_converged = 0; \n");
1039 source.append(
" } \n");
1041 source.append(
" } \n");
1043 source.append(
" } \n");
// Appends to `source` the OpenCL text of the kernel `bisectKernel`.
// `numeric_string` is spliced in wherever the emitted code needs its scalar type name.
// NOTE(review): the C++ function header and braces wrapping these statements are not
// visible in this listing, and every statement carries a leading integer that looks like
// a listing line number -- confirm against the real header before building.
//
// Emitted kernel, in outline:
//   1. zero the per-work-item slots of the __local interval arrays; work-item 0 seeds
//      slot 0 with (lg, ug, lg_eig_count, ug_eig_count) and sets the active count to 1;
//   2. loop: subdivideActiveInterval -> break when all_threads_converged stays 1 ->
//      computeNumSmallerEigenvals at `mid` -> storeNonEmptyIntervals or
//      storeIntervalConverged -> optional compaction of the second chunk ->
//      work-item 0 updates num_threads_active / num_threads_compaction;
//   3. work-items with lcl_id < n copy s_left and s_left_count to g_left / g_left_count.
// g_right and g_right_count are kernel parameters but are not written by the emitted code.
1061 template <
typename StringType>
1064 source.append(
" __kernel \n");
1065 source.append(
" void \n");
1066 source.append(
" bisectKernel(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
1067 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
1068 source.append(
" const unsigned int n, \n");
1069 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left, \n");
1070 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right, \n");
1071 source.append(
" __global unsigned int *g_left_count, __global unsigned int *g_right_count, \n");
1072 source.append(
" const "); source.append(numeric_string); source.append(
" lg, \n");
1073 source.append(
" const "); source.append(numeric_string); source.append(
" ug, \n");
1074 source.append(
" const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
1075 source.append(
" "); source.append(numeric_string); source.append(
" epsilon \n");
1076 source.append(
" ) \n");
1077 source.append(
" { \n");
// Advance g_s by one element before any use.
// NOTE(review): presumably the first entry of the buffer is a placeholder -- confirm with the caller.
1078 source.append(
" g_s = g_s + 1; \n");
// Of the five id/size variables below, only lcl_id is referenced later in this kernel.
1079 source.append(
" uint glb_id = get_global_id(0); \n");
1080 source.append(
" uint grp_id = get_group_id(0); \n");
1081 source.append(
" uint grp_nm = get_num_groups(0); \n");
1082 source.append(
" uint lcl_id = get_local_id(0); \n");
1083 source.append(
" uint lcl_sz = get_local_size(0); \n");
// __local interval bounds, one slot per MAX_THREADS_BLOCK_SMALL_MATRIX entry.
1087 source.append(
" __local "); source.append(numeric_string); source.append(
" s_left[MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1088 source.append(
" __local "); source.append(numeric_string); source.append(
" s_right[MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
// __local counts that accompany each interval bound.
1092 source.append(
" __local unsigned int s_left_count[MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1093 source.append(
" __local unsigned int s_right_count[MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
// Compaction list with one extra leading element; s_compaction_list_exc (below) is the
// same storage viewed from index 1.
1096 source.append(
" __local unsigned int \n");
1097 source.append(
" s_compaction_list[MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; \n");
// Work-group-wide control flags and counters.
1102 source.append(
" __local unsigned int compact_second_chunk; \n");
1103 source.append(
" __local unsigned int all_threads_converged; \n");
1106 source.append(
" __local unsigned int num_threads_active; \n");
1109 source.append(
" __local unsigned int num_threads_compaction; \n");
1112 source.append(
" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
// Per-work-item interval state. The initialisers are float literals regardless of the
// type name spliced in from numeric_string.
1117 source.append(
" "); source.append(numeric_string); source.append(
" left = 0.0f; \n");
1118 source.append(
" "); source.append(numeric_string); source.append(
" right = 0.0f; \n");
1119 source.append(
" unsigned int left_count = 0; \n");
1120 source.append(
" unsigned int right_count = 0; \n");
1122 source.append(
" "); source.append(numeric_string); source.append(
" mid = 0.0f; \n");
1124 source.append(
" unsigned int mid_count = 0; \n");
1126 source.append(
" unsigned int is_active_second = 0; \n");
// Each work-item clears its own slot of the __local arrays.
1128 source.append(
" s_compaction_list[lcl_id] = 0; \n");
1129 source.append(
" s_left[lcl_id] = 0.0; \n");
1130 source.append(
" s_right[lcl_id] = 0.0; \n");
1131 source.append(
" s_left_count[lcl_id] = 0; \n");
1132 source.append(
" s_right_count[lcl_id] = 0; \n");
1134 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Work-item 0 seeds slot 0 with the starting interval [lg, ug] and its counts, and
// initialises the shared counters.
1137 source.append(
" if (0 == lcl_id) \n");
1138 source.append(
" { \n");
1139 source.append(
" s_left[0] = lg; \n");
1140 source.append(
" s_right[0] = ug; \n");
1141 source.append(
" s_left_count[0] = lg_eig_count; \n");
1142 source.append(
" s_right_count[0] = ug_eig_count; \n");
1144 source.append(
" compact_second_chunk = 0; \n");
1145 source.append(
" num_threads_active = 1; \n");
1147 source.append(
" num_threads_compaction = 1; \n");
1148 source.append(
" } \n");
// Main loop; the only exit is the all_threads_converged check below.
1153 source.append(
" while (true) \n");
1154 source.append(
" { \n");
// Every work-item stores 1 here; the barrier that follows also orders the seeding done
// by work-item 0 before the first subdivision.
1156 source.append(
" all_threads_converged = 1; \n");
1157 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1159 source.append(
" is_active_second = 0; \n");
1160 source.append(
" subdivideActiveInterval(lcl_id, \n");
1161 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
1162 source.append(
" num_threads_active, \n");
1163 source.append(
" &left, &right, &left_count, &right_count, \n");
1164 source.append(
" &mid, &all_threads_converged); \n");
1166 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// No work-item cleared the flag during subdivision: leave the loop.
1169 source.append(
" if (1 == all_threads_converged) \n");
1170 source.append(
" { \n");
1171 source.append(
" break; \n");
1172 source.append(
" } \n");
1174 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Count for the midpoint; s_left / s_right are passed to the helper and the last argument
// is 1 when this work-item's interval has equal bounds.
1181 source.append(
" mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, \n");
1182 source.append(
" lcl_id, num_threads_active, \n");
1183 source.append(
" s_left, s_right, \n");
1184 source.append(
" (left == right)); \n");
1186 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Active work-items store their result: bounds that still differ go through
// storeNonEmptyIntervals, equal bounds go through storeIntervalConverged.
1198 source.append(
" if (lcl_id < num_threads_active) \n");
1199 source.append(
" { \n");
1201 source.append(
" if (left != right) \n");
1202 source.append(
" { \n");
1205 source.append(
" storeNonEmptyIntervals(lcl_id, num_threads_active, \n");
1206 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
1207 source.append(
" left, mid, right, \n");
1208 source.append(
" left_count, mid_count, right_count, \n");
1209 source.append(
" epsilon, &compact_second_chunk, \n");
1210 source.append(
" s_compaction_list_exc, \n");
1211 source.append(
" &is_active_second); \n");
1212 source.append(
" } \n");
1213 source.append(
" else \n");
1214 source.append(
" { \n");
1216 source.append(
" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
1217 source.append(
" &left, &mid, &right, \n");
1218 source.append(
" &left_count, &mid_count, &right_count, \n");
1219 source.append(
" s_compaction_list_exc, &compact_second_chunk, \n");
1220 source.append(
" num_threads_active, \n");
1221 source.append(
" &is_active_second); \n");
1222 source.append(
" } \n");
1223 source.append(
" } \n");
1226 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Compaction runs only when some work-item set compact_second_chunk during the store step.
1231 source.append(
" if (compact_second_chunk > 0) \n");
1232 source.append(
" { \n");
1234 source.append(
" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
1236 source.append(
" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
1237 source.append(
" mid, right, mid_count, right_count, \n");
1238 source.append(
" s_compaction_list, num_threads_active, \n");
1239 source.append(
" is_active_second); \n");
1240 source.append(
" } \n");
1242 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Work-item 0 grows the active count by the compaction-list entry at the old count,
// recomputes the compaction width with ceilPow2 and resets the compaction flag.
1244 source.append(
" if (0 == lcl_id) \n");
1245 source.append(
" { \n");
1248 source.append(
" num_threads_active += s_compaction_list[num_threads_active]; \n");
1250 source.append(
" num_threads_compaction = ceilPow2(num_threads_active); \n");
1252 source.append(
" compact_second_chunk = 0; \n");
1253 source.append(
" } \n");
1255 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1257 source.append(
" } \n");
1259 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Write-back: only g_left and g_left_count are stored, for work-items with lcl_id < n.
1266 source.append(
" if (lcl_id < n) \n");
1267 source.append(
" { \n");
1269 source.append(
" g_left[lcl_id] = s_left[lcl_id]; \n");
1271 source.append(
" g_left_count[lcl_id] = s_left_count[lcl_id]; \n");
1272 source.append(
" } \n");
1273 source.append(
" } \n");
// Appends to `source` the OpenCL text of the kernel `bisectKernelLarge_MultIntervals`.
// `numeric_string` is spliced in wherever the emitted code needs its scalar type name.
// NOTE(review): the C++ function header and braces wrapping these statements are not
// visible in this listing, and every statement carries a leading integer that looks like
// a listing line number -- confirm against the real header before building.
//
// Emitted kernel, in outline:
//   1. work-item 0 reads this work-group's range [blocks_mult[grp_id], blocks_mult[grp_id + 1])
//      and its output offset blocks_mult_sum[grp_id];
//   2. the first num_threads_active work-items load their interval from
//      g_left / g_right / g_left_count / g_right_count at c_block_start + tid;
//   3. loop: subdivideActiveInterval -> break when converged ->
//      computeNumSmallerEigenvalsLarge at `mid` -> store step -> optional compaction ->
//      work-item 0 updates the counters; the loop also exits once
//      num_threads_compaction exceeds lcl_sz;
//   4. active work-items write s_left[tid] to g_lambda and s_right_count[tid] to g_pos
//      at c_block_offset_output + tid.
1291 template <
typename StringType>
1294 source.append(
" __kernel \n");
1295 source.append(
" void \n");
1296 source.append(
" bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
1297 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
1298 source.append(
" const unsigned int n, \n");
1299 source.append(
" __global unsigned int *blocks_mult, \n");
1300 source.append(
" __global unsigned int *blocks_mult_sum, \n");
1301 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left, \n");
1302 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right, \n");
1303 source.append(
" __global unsigned int *g_left_count, \n");
1304 source.append(
" __global unsigned int *g_right_count, \n");
1305 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_lambda, \n");
1306 source.append(
" __global unsigned int *g_pos, \n");
1307 source.append(
" "); source.append(numeric_string); source.append(
" precision \n");
1308 source.append(
" ) \n");
1309 source.append(
" { \n");
// Advance g_s by one element before any use.
// NOTE(review): presumably the first entry of the buffer is a placeholder -- confirm with the caller.
1310 source.append(
" g_s = g_s + 1; \n");
// glb_id and grp_nm are declared but not referenced later in this kernel.
1311 source.append(
" uint glb_id = get_global_id(0); \n");
1312 source.append(
" uint grp_id = get_group_id(0); \n");
1313 source.append(
" uint grp_nm = get_num_groups(0); \n");
1314 source.append(
" uint lcl_id = get_local_id(0); \n");
1315 source.append(
" uint lcl_sz = get_local_size(0); \n");
// tid is an alias for the local id; the kernel uses both names.
1317 source.append(
" const unsigned int tid = lcl_id; \n");
// __local interval storage with 2 * MAX_THREADS_BLOCK slots per array.
1320 source.append(
" __local "); source.append(numeric_string); source.append(
" s_left[2 * MAX_THREADS_BLOCK]; \n");
1321 source.append(
" __local "); source.append(numeric_string); source.append(
" s_right[2 * MAX_THREADS_BLOCK]; \n");
1324 source.append(
" __local unsigned int s_left_count[2 * MAX_THREADS_BLOCK]; \n");
1325 source.append(
" __local unsigned int s_right_count[2 * MAX_THREADS_BLOCK]; \n");
// Compaction list with one extra leading element; s_compaction_list_exc is the same
// storage viewed from index 1.
1328 source.append(
" __local unsigned int s_compaction_list[2 * MAX_THREADS_BLOCK + 1]; \n");
1330 source.append(
" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
// Work-group-wide control flags and counters.
1333 source.append(
" __local unsigned int all_threads_converged; \n");
1335 source.append(
" __local unsigned int num_threads_active; \n");
1337 source.append(
" __local unsigned int num_threads_compaction; \n");
1339 source.append(
" __local unsigned int compact_second_chunk; \n");
// Range of input intervals handled by this work-group and its output offset.
1342 source.append(
" __local unsigned int c_block_start; \n");
1343 source.append(
" __local unsigned int c_block_end; \n");
1344 source.append(
" __local unsigned int c_block_offset_output; \n");
// Per-work-item interval state. The initialisers are float literals regardless of the
// type name spliced in from numeric_string.
1347 source.append(
" "); source.append(numeric_string); source.append(
" mid = 0.0f; \n");
1349 source.append(
" unsigned int mid_count = 0; \n");
1351 source.append(
" "); source.append(numeric_string); source.append(
" left = 0.0f; \n");
1352 source.append(
" "); source.append(numeric_string); source.append(
" right = 0.0f; \n");
1353 source.append(
" unsigned int left_count = 0; \n");
1354 source.append(
" unsigned int right_count = 0; \n");
1356 source.append(
" unsigned int is_active_second = 0; \n");
1358 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// Work-item 0 fetches the block range for this work-group and initialises the shared
// counters from it.
1361 source.append(
" if (0 == tid) \n");
1362 source.append(
" { \n");
1364 source.append(
" c_block_start = blocks_mult[grp_id]; \n");
1365 source.append(
" c_block_end = blocks_mult[grp_id + 1]; \n");
1366 source.append(
" c_block_offset_output = blocks_mult_sum[grp_id]; \n");
1367 source.append(
" \n");
1369 source.append(
" num_threads_active = c_block_end - c_block_start; \n");
1370 source.append(
" s_compaction_list[0] = 0; \n");
1371 source.append(
" num_threads_compaction = ceilPow2(num_threads_active); \n");
1373 source.append(
" all_threads_converged = 1; \n");
1374 source.append(
" compact_second_chunk = 0; \n");
1375 source.append(
" } \n");
// Pre-fill the count arrays: 42 in the first MAX_THREADS_BLOCK slots, 0 in the second half.
// NOTE(review): 42 looks like a debugging sentinel; its meaning is not evident from this
// kernel -- active slots are overwritten by the load below.
1376 source.append(
" s_left_count [tid] = 42; \n");
1377 source.append(
" s_right_count[tid] = 42; \n");
1378 source.append(
" s_left_count [tid + MAX_THREADS_BLOCK] = 0; \n");
1379 source.append(
" s_right_count[tid + MAX_THREADS_BLOCK] = 0; \n");
1380 source.append(
" \n");
1381 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1382 source.append(
" \n");
// Active work-items load their interval from global memory.
1385 source.append(
" if (tid < num_threads_active) \n");
1386 source.append(
" { \n");
1388 source.append(
" s_left[tid] = g_left[c_block_start + tid]; \n");
1389 source.append(
" s_right[tid] = g_right[c_block_start + tid]; \n");
1390 source.append(
" s_left_count[tid] = g_left_count[c_block_start + tid]; \n");
1391 source.append(
" s_right_count[tid] = g_right_count[c_block_start + tid]; \n");
1392 source.append(
" \n");
1393 source.append(
" } \n");
1394 source.append(
" \n");
1395 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// `iter` is incremented each pass but is not read anywhere else in this kernel.
1396 source.append(
" unsigned int iter = 0; \n");
1398 source.append(
" while (true) \n");
1399 source.append(
" { \n");
1400 source.append(
" iter++; \n");
// Clear the compaction list: two entries per work-item plus the last element.
1402 source.append(
" s_compaction_list[lcl_id] = 0; \n");
1403 source.append(
" s_compaction_list[lcl_id + lcl_sz] = 0; \n");
1404 source.append(
" s_compaction_list[2 * MAX_THREADS_BLOCK] = 0; \n");
1407 source.append(
" subdivideActiveInterval(tid, s_left, s_right, \n");
1408 source.append(
" s_left_count, s_right_count, \n");
1409 source.append(
" num_threads_active, \n");
1410 source.append(
" &left, &right, &left_count, &right_count, \n");
1411 source.append(
" &mid, &all_threads_converged); \n");
1412 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// No work-item cleared the flag during subdivision: leave the loop.
1415 source.append(
" if (1 == all_threads_converged) \n");
1416 source.append(
" { \n");
1417 source.append(
" \n");
1418 source.append(
" break; \n");
1419 source.append(
" } \n");
// Count for the midpoint; the last argument is 1 when this work-item's interval has
// equal bounds.
1425 source.append(
" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
1426 source.append(
" mid, tid, num_threads_active, \n");
1427 source.append(
" s_left, s_right, \n");
1428 source.append(
" (left == right)); \n");
1429 source.append(
" \n");
1430 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Active work-items store their result: bounds that still differ go through
// storeNonEmptyIntervals, equal bounds go through storeIntervalConverged.
1432 source.append(
" if (tid < num_threads_active) \n");
1433 source.append(
" { \n");
1434 source.append(
" \n");
1436 source.append(
" if (left != right) \n");
1437 source.append(
" { \n");
1439 source.append(
" storeNonEmptyIntervals(tid, num_threads_active, \n");
1440 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
1441 source.append(
" left, mid, right, \n");
1442 source.append(
" left_count, mid_count, right_count, \n");
1443 source.append(
" precision, &compact_second_chunk, \n");
1444 source.append(
" s_compaction_list_exc, \n");
1445 source.append(
" &is_active_second); \n");
1446 source.append(
" \n");
1447 source.append(
" } \n");
1448 source.append(
" else \n");
1449 source.append(
" { \n");
1451 source.append(
" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
1452 source.append(
" &left, &mid, &right, \n");
1453 source.append(
" &left_count, &mid_count, &right_count, \n");
1454 source.append(
" s_compaction_list_exc, &compact_second_chunk, \n");
1455 source.append(
" num_threads_active, \n");
1456 source.append(
" &is_active_second); \n");
1457 source.append(
" \n");
1458 source.append(
" } \n");
1459 source.append(
" } \n");
1461 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Compaction runs only when compact_second_chunk is exactly 1.
1465 source.append(
" if (1 == compact_second_chunk) \n");
1466 source.append(
" { \n");
1468 source.append(
" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
1469 source.append(
" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
1470 source.append(
" mid, right, mid_count, right_count, \n");
1471 source.append(
" s_compaction_list, num_threads_active, \n");
1472 source.append(
" is_active_second); \n");
1473 source.append(
" } \n");
1475 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Work-item 0 grows the active count by the compaction-list entry at the old count,
// recomputes the compaction width and re-arms both flags for the next pass.
1478 source.append(
" if (0 == tid) \n");
1479 source.append(
" { \n");
1480 source.append(
" num_threads_active += s_compaction_list[num_threads_active]; \n");
1481 source.append(
" num_threads_compaction = ceilPow2(num_threads_active); \n");
1483 source.append(
" compact_second_chunk = 0; \n");
1484 source.append(
" all_threads_converged = 1; \n");
1485 source.append(
" } \n");
1487 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Clear the shifted view of the compaction list, two entries per work-item.
1490 source.append(
" s_compaction_list_exc[lcl_id] = 0; \n");
1491 source.append(
" s_compaction_list_exc[lcl_id + lcl_sz] = 0; \n");
1492 source.append(
" \n");
// Second loop exit: the compaction width has outgrown the work-group size.
1493 source.append(
" if (num_threads_compaction > lcl_sz) \n");
1494 source.append(
" { \n");
1495 source.append(
" break; \n");
1496 source.append(
" } \n");
1499 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1501 source.append(
" } \n");
// Write-back for active work-items: left bound to g_lambda, right count to g_pos.
1504 source.append(
" if (tid < num_threads_active) \n");
1505 source.append(
" { \n");
1507 source.append(
" unsigned int addr = c_block_offset_output + tid; \n");
1508 source.append(
" \n");
1509 source.append(
" g_lambda[addr] = s_left[tid]; \n");
1510 source.append(
" g_pos[addr] = s_right_count[tid]; \n");
1511 source.append(
" } \n");
1512 source.append(
" } \n");
// Appends to `source` the OpenCL text of the kernel `bisectKernelLarge_OneIntervals`.
// `numeric_string` is spliced in wherever the emitted code needs its scalar type name.
// NOTE(review): the C++ function header and braces wrapping these statements are not
// visible in this listing, and every statement carries a leading integer that looks like
// a listing line number -- confirm against the real header before building.
//
// Emitted kernel: each work-item with gtid < num_intervals loads one interval
// (g_left[gtid], g_right[gtid]) and its count g_pos[gtid], then repeatedly bisects it:
//   - mid = computeMidpoint(left, right);
//   - mid_count = computeNumSmallerEigenvalsLarge(...);
//   - right = mid when right_count == mid_count, otherwise left = mid;
//   - the interval is treated as converged once (right - left) is below
//     min(precision, max(|right|, |left|) * precision), at which point both bounds are
//     collapsed to their midpoint.
// The loop ends when no work-item in the work-group reports an unconverged interval.
// Only g_left[gtid] is written back; g_right and g_pos are read but not written.
1527 template <
typename StringType>
1530 source.append(
" __kernel \n");
1531 source.append(
" void \n");
1532 source.append(
" bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
1533 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
1534 source.append(
" const unsigned int n, \n");
1535 source.append(
" unsigned int num_intervals, \n");
1536 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left, \n");
1537 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right, \n");
1538 source.append(
" __global unsigned int *g_pos, \n");
1539 source.append(
" "); source.append(numeric_string); source.append(
" precision) \n");
1540 source.append(
" { \n");
// Advance g_s by one element before any use.
// NOTE(review): presumably the first entry of the buffer is a placeholder -- confirm with the caller.
1541 source.append(
" g_s = g_s + 1; \n");
// glb_id and grp_nm are declared but not referenced later in this kernel.
1542 source.append(
" uint glb_id = get_global_id(0); \n");
1543 source.append(
" uint grp_id = get_group_id(0); \n");
1544 source.append(
" uint grp_nm = get_num_groups(0); \n");
1545 source.append(
" uint lcl_id = get_local_id(0); \n");
1546 source.append(
" uint lcl_sz = get_local_size(0); \n");
// Flat work-item index built from the group index and the local index.
1547 source.append(
" const unsigned int gtid = (lcl_sz * grp_id) + lcl_id; \n");
// __local scratch arrays handed to computeNumSmallerEigenvalsLarge.
1548 source.append(
" __local "); source.append(numeric_string); source.append(
" s_left_scratch[MAX_THREADS_BLOCK]; \n");
1549 source.append(
" __local "); source.append(numeric_string); source.append(
" s_right_scratch[MAX_THREADS_BLOCK]; \n");
// left, right and right_count are declared without initialisers; they are assigned only
// for work-items with gtid < num_intervals.
1552 source.append(
" "); source.append(numeric_string); source.append(
" left, right; \n");
1555 source.append(
" unsigned int right_count; \n");
1557 source.append(
" unsigned int converged = 0; \n");
1559 source.append(
" "); source.append(numeric_string); source.append(
" mid = 0.0f; \n");
1561 source.append(
" unsigned int mid_count = 0; \n");
// Load this work-item's interval and count.
1564 source.append(
" if (gtid < num_intervals) \n");
1565 source.append(
" { \n");
1566 source.append(
" left = g_left[gtid]; \n");
1567 source.append(
" right = g_right[gtid]; \n");
1568 source.append(
" right_count = g_pos[gtid]; \n");
1569 source.append(
" } \n");
// Work-group-wide convergence flag, cleared once by work-item 0 before the loop.
1571 source.append(
" __local unsigned int converged_all_threads; \n");
1573 source.append(
" if (0 == lcl_id) \n");
1574 source.append(
" { \n");
1575 source.append(
" converged_all_threads = 0; \n");
1576 source.append(
" } \n");
1577 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1579 source.append(
" while (true) \n");
1580 source.append(
" { \n");
// Every work-item stores 1; any work-item with an unconverged interval stores 0 after
// the barrier that follows the count computation.
1581 source.append(
" converged_all_threads = 1; \n");
1583 source.append(
" if ((gtid < num_intervals) && (0 == converged)) \n");
1584 source.append(
" { \n");
1585 source.append(
" mid = computeMidpoint(left, right); \n");
1586 source.append(
" } \n");
// Called by every work-item, outside the guard above; `converged` is passed as the
// last argument.
1588 source.append(
" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
1589 source.append(
" mid, gtid, num_intervals, \n");
1590 source.append(
" s_left_scratch, \n");
1591 source.append(
" s_right_scratch, \n");
1592 source.append(
" converged); \n");
1593 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1595 source.append(
" if ((gtid < num_intervals) && (0 == converged)) \n");
1596 source.append(
" { \n");
// Keep the half selected by comparing the count at mid with right_count.
1598 source.append(
" if (right_count == mid_count) \n");
1599 source.append(
" { \n");
1600 source.append(
" right = mid; \n");
1601 source.append(
" } \n");
1602 source.append(
" else \n");
1603 source.append(
" { \n");
1604 source.append(
" left = mid; \n");
1605 source.append(
" } \n");
// Convergence test: interval width against the smaller of `precision` and
// `precision` scaled by the larger bound magnitude.
1607 source.append(
" "); source.append(numeric_string); source.append(
" t0 = right - left; \n");
1608 source.append(
" "); source.append(numeric_string); source.append(
" t1 = max(fabs(right), fabs(left)) * precision; \n");
1610 source.append(
" if (t0 < min(precision, t1)) \n");
1611 source.append(
" { \n");
// Converged: collapse both bounds onto the midpoint and stop refining this interval.
1612 source.append(
" "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right); \n");
1613 source.append(
" left = lambda; \n");
1614 source.append(
" right = lambda; \n");
1616 source.append(
" converged = 1; \n");
1617 source.append(
" } \n");
1618 source.append(
" else \n");
1619 source.append(
" { \n");
1620 source.append(
" converged_all_threads = 0; \n");
1621 source.append(
" } \n");
1622 source.append(
" } \n");
1623 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Leave the loop when no work-item cleared the flag in this pass.
1624 source.append(
" if (1 == converged_all_threads) \n");
1625 source.append(
" { \n");
1626 source.append(
" break; \n");
1627 source.append(
" } \n");
1628 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1629 source.append(
" } \n");
1631 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Write-back: only the left bound is stored, into g_left[gtid].
1632 source.append(
" if (gtid < num_intervals) \n");
1633 source.append(
" { \n");
1636 source.append(
" g_left[gtid] = left; \n");
1637 source.append(
" } \n");
1638 source.append(
" } \n");
// Appends to `source` the OpenCL text of the helper `writeToGmem`.
// `numeric_string` is spliced in wherever the emitted code needs its scalar type name.
// NOTE(review): the C++ function header and braces wrapping these statements are not
// visible in this listing, and every statement carries a leading integer that looks like
// a listing line number -- confirm against the real header before building.
//
// Emitted helper: copies interval data from __local arrays to two sets of __global
// outputs, split at index offset_mult_lambda:
//   - index <  offset_mult_lambda -> g_left_one / g_right_one / g_pos_one at the same index;
//   - index >= offset_mult_lambda -> g_left_mult / g_right_mult / g_left_count_mult /
//     g_right_count_mult at (index - offset_mult_lambda).
// This is done for `tid` unconditionally and for `tid_2` only when
// tid_2 < num_threads_active. After a barrier, entries with index <= num_blocks_mult of
// s_compaction_list and s_cl_helper are copied to g_blocks_mult and g_blocks_mult_sum,
// again for both tid and tid_2.
1642 template <
typename StringType>
1645 source.append(
" \n");
1646 source.append(
" void writeToGmem(const unsigned int tid, const unsigned int tid_2, \n");
1647 source.append(
" const unsigned int num_threads_active, \n");
1648 source.append(
" const unsigned int num_blocks_mult, \n");
1649 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left_one, \n");
1650 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right_one, \n");
1651 source.append(
" __global unsigned int *g_pos_one, \n");
1652 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left_mult, \n");
1653 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right_mult, \n");
1654 source.append(
" __global unsigned int *g_left_count_mult, \n");
1655 source.append(
" __global unsigned int *g_right_count_mult, \n");
1656 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
1657 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
1658 source.append(
" __local unsigned short *s_left_count, __local unsigned short *s_right_count, \n");
1659 source.append(
" __global unsigned int *g_blocks_mult, \n");
1660 source.append(
" __global unsigned int *g_blocks_mult_sum, \n");
1661 source.append(
" __local unsigned short *s_compaction_list, \n");
1662 source.append(
" __local unsigned short *s_cl_helper, \n");
1663 source.append(
" unsigned int offset_mult_lambda \n");
1664 source.append(
" ) \n");
1665 source.append(
" { \n");
// The five work-item id/size variables below are declared in the emitted helper but are
// not referenced anywhere else in its body.
1666 source.append(
" uint glb_id = get_global_id(0); \n");
1667 source.append(
" uint grp_id = get_group_id(0); \n");
1668 source.append(
" uint grp_nm = get_num_groups(0); \n");
1669 source.append(
" uint lcl_id = get_local_id(0); \n");
1670 source.append(
" uint lcl_sz = get_local_size(0); \n");
// First element (index tid): not guarded by num_threads_active.
1673 source.append(
" if (tid < offset_mult_lambda) \n");
1674 source.append(
" { \n");
1676 source.append(
" g_left_one[tid] = s_left[tid]; \n");
1677 source.append(
" g_right_one[tid] = s_right[tid]; \n");
// Only the right count is stored for the "one" outputs.
1679 source.append(
" g_pos_one[tid] = s_right_count[tid]; \n");
1680 source.append(
" } \n");
1681 source.append(
" else \n");
1682 source.append(
" { \n");
1684 source.append(
" \n");
// The "mult" outputs are indexed relative to offset_mult_lambda and keep both counts.
1685 source.append(
" g_left_mult[tid - offset_mult_lambda] = s_left[tid]; \n");
1686 source.append(
" g_right_mult[tid - offset_mult_lambda] = s_right[tid]; \n");
1687 source.append(
" g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid]; \n");
1688 source.append(
" g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid]; \n");
1689 source.append(
" } \n");
// Second element (index tid_2): same split, but only when tid_2 is within the active range.
1691 source.append(
" if (tid_2 < num_threads_active) \n");
1692 source.append(
" { \n");
1694 source.append(
" if (tid_2 < offset_mult_lambda) \n");
1695 source.append(
" { \n");
1697 source.append(
" g_left_one[tid_2] = s_left[tid_2]; \n");
1698 source.append(
" g_right_one[tid_2] = s_right[tid_2]; \n");
1700 source.append(
" g_pos_one[tid_2] = s_right_count[tid_2]; \n");
1701 source.append(
" } \n");
1702 source.append(
" else \n");
1703 source.append(
" { \n");
1705 source.append(
" g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2]; \n");
1706 source.append(
" g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2]; \n");
1707 source.append(
" g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2]; \n");
1708 source.append(
" g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2]; \n");
1709 source.append(
" } \n");
1711 source.append(
" } \n");
1713 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Block lists: the comparison is inclusive, so num_blocks_mult + 1 entries are copied.
1717 source.append(
" if (tid <= num_blocks_mult) \n");
1718 source.append(
" { \n");
1719 source.append(
" g_blocks_mult[tid] = s_compaction_list[tid]; \n");
1720 source.append(
" g_blocks_mult_sum[tid] = s_cl_helper[tid]; \n");
1721 source.append(
" } \n");
1722 source.append(
" if (tid_2 <= num_blocks_mult) \n");
1723 source.append(
" { \n");
1724 source.append(
" g_blocks_mult[tid_2] = s_compaction_list[tid_2]; \n");
1725 source.append(
" g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2]; \n");
1726 source.append(
" } \n");
1727 source.append(
" } \n");
// Appends to `source` the OpenCL text of the helper `compactStreamsFinal`.
// `numeric_string` is spliced in wherever the emitted code needs its scalar type name.
// NOTE(review): the C++ function header and braces wrapping these statements are not
// visible in this listing, and every statement carries a leading integer that looks like
// a listing line number -- confirm against the real header before building.
//
// Emitted helper, in three barrier-separated phases:
//   1. read s_left / s_right at tid (and at tid_2 when tid_2 < num_threads_active) into
//      *left / *right (and *left_2 / *right_2);
//   2. compute destination indices: s_cl_one[i] when the matching is_one_lambda flag is 1,
//      otherwise s_cl_mult[i] + *offset_mult_lambda; also fetch s_cl_blocking[i] when the
//      matching c_block_iend value is non-zero;
//   3. write bounds and counts to s_left / s_right / s_left_count / s_right_count at those
//      destinations, and record (c_block_iend - 1, c_sum_block) in
//      s_cl_blocking / s_cl_helper at ptr_blocking + 1.
// The counts (*left_count, *right_count, *left_count_2, *right_count_2) are only read
// here; they are supplied by the caller.
1731 template <
typename StringType>
1734 source.append(
" \n");
1735 source.append(
" void \n");
1736 source.append(
" compactStreamsFinal(const unsigned int tid, const unsigned int tid_2, \n");
1737 source.append(
" const unsigned int num_threads_active, \n");
1738 source.append(
" __local unsigned int *offset_mult_lambda, \n");
1739 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
1740 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
1741 source.append(
" __local unsigned short *s_left_count, __local unsigned short *s_right_count, \n");
1742 source.append(
" __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult, \n");
1743 source.append(
" __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper, \n");
1744 source.append(
" unsigned int is_one_lambda, unsigned int is_one_lambda_2, \n");
1745 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
1746 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
1747 source.append(
" "); source.append(numeric_string); source.append(
" *left_2, \n");
1748 source.append(
" "); source.append(numeric_string); source.append(
" *right_2, \n");
1749 source.append(
" unsigned int *left_count, unsigned int *right_count, \n");
1750 source.append(
" unsigned int *left_count_2, unsigned int *right_count_2, \n");
1751 source.append(
" unsigned int c_block_iend, unsigned int c_sum_block, \n");
1752 source.append(
" unsigned int c_block_iend_2, unsigned int c_sum_block_2 \n");
1753 source.append(
" ) \n");
1754 source.append(
" { \n");
// The five work-item id/size variables below are declared in the emitted helper but are
// not referenced anywhere else in its body.
1755 source.append(
" uint glb_id = get_global_id(0); \n");
1756 source.append(
" uint grp_id = get_group_id(0); \n");
1757 source.append(
" uint grp_nm = get_num_groups(0); \n");
1758 source.append(
" uint lcl_id = get_local_id(0); \n");
1759 source.append(
" uint lcl_sz = get_local_size(0); \n");
// Phase 1: snapshot the bounds before any slot is overwritten. The tid read is
// unconditional; the tid_2 read is guarded.
1762 source.append(
" *left = s_left[tid]; \n");
1763 source.append(
" *right = s_right[tid]; \n");
1765 source.append(
" if (tid_2 < num_threads_active) \n");
1766 source.append(
" { \n");
1767 source.append(
" *left_2 = s_left[tid_2]; \n");
1768 source.append(
" *right_2 = s_right[tid_2]; \n");
1769 source.append(
" } \n");
1771 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Phase 2: destination indices, defaulting to 0.
1775 source.append(
" unsigned int ptr_w = 0; \n");
1776 source.append(
" unsigned int ptr_w_2 = 0; \n");
1777 source.append(
" unsigned int ptr_blocking_w = 0; \n");
1778 source.append(
" unsigned int ptr_blocking_w_2 = 0; \n");
1779 source.append(
" \n");
1780 source.append(
" \n");
// "One" entries take their index from s_cl_one; all others are placed after
// *offset_mult_lambda using s_cl_mult.
1782 source.append(
" ptr_w = (1 == is_one_lambda) ? s_cl_one[tid] \n");
1783 source.append(
" : s_cl_mult[tid] + *offset_mult_lambda; \n");
1785 source.append(
" if (0 != c_block_iend) \n");
1786 source.append(
" { \n");
1787 source.append(
" ptr_blocking_w = s_cl_blocking[tid]; \n");
1788 source.append(
" } \n");
1790 source.append(
" if (tid_2 < num_threads_active) \n");
1791 source.append(
" { \n");
1792 source.append(
" ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2] \n");
1793 source.append(
" : s_cl_mult[tid_2] + *offset_mult_lambda; \n");
1795 source.append(
" if (0 != c_block_iend_2) \n");
1796 source.append(
" { \n");
1797 source.append(
" ptr_blocking_w_2 = s_cl_blocking[tid_2]; \n");
1798 source.append(
" } \n");
1799 source.append(
" } \n");
1800 source.append(
" \n");
1801 source.append(
" \n");
1802 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1803 source.append(
" \n");
// Phase 3: store the first element at its destination.
1805 source.append(
" s_left[ptr_w] = *left; \n");
1806 source.append(
" s_right[ptr_w] = *right; \n");
1807 source.append(
" s_left_count[ptr_w] = *left_count; \n");
1808 source.append(
" s_right_count[ptr_w] = *right_count; \n");
1809 source.append(
" \n");
1810 source.append(
" \n");
1811 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// NOTE(review): for tid == 1 the same four stores are repeated after the barrier; the
// reason is not evident from this code -- confirm whether this is a deliberate workaround.
1812 source.append(
" if(tid == 1) \n");
1813 source.append(
" { \n");
1814 source.append(
" s_left[ptr_w] = *left; \n");
1815 source.append(
" s_right[ptr_w] = *right; \n");
1816 source.append(
" s_left_count[ptr_w] = *left_count; \n");
1817 source.append(
" s_right_count[ptr_w] = *right_count; \n");
1818 source.append(
" \n");
1819 source.append(
" } \n");
// Record block bookkeeping one slot past the fetched blocking index.
1820 source.append(
" if (0 != c_block_iend) \n");
1821 source.append(
" { \n");
1822 source.append(
" s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1; \n");
1823 source.append(
" s_cl_helper[ptr_blocking_w + 1] = c_sum_block; \n");
1824 source.append(
" } \n");
1825 source.append(
" \n");
// Same stores for the second element, only when tid_2 is within the active range.
1826 source.append(
" if (tid_2 < num_threads_active) \n");
1827 source.append(
" { \n");
1829 source.append(
" s_left[ptr_w_2] = *left_2; \n");
1830 source.append(
" s_right[ptr_w_2] = *right_2; \n");
1831 source.append(
" s_left_count[ptr_w_2] = *left_count_2; \n");
1832 source.append(
" s_right_count[ptr_w_2] = *right_count_2; \n");
1834 source.append(
" if (0 != c_block_iend_2) \n");
1835 source.append(
" { \n");
1836 source.append(
" s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1; \n");
1837 source.append(
" s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2; \n");
1838 source.append(
" } \n");
1839 source.append(
" } \n");
1841 source.append(
" } \n");
// Generator for the OpenCL device function scanCompactBlocksStartAddress().
// The C++ signature line is not part of this listing; the statements below append
// the OpenCL source text of that function to `source`, one source line per append().
1847 template <
typename StringType>
// numeric_string does not appear in the emitted text below; it is explicitly discarded here.
1850 (void)numeric_string;
1851 source.append(
" \n");
// Emitted signature: two element indices per work-item (tid, tid_2), the scan length
// num_threads_compaction, and two __local unsigned short arrays.
1852 source.append(
" void \n");
1853 source.append(
" scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2, \n");
1854 source.append(
" const unsigned int num_threads_compaction, \n");
1855 source.append(
" __local unsigned short *s_cl_blocking, \n");
1856 source.append(
" __local unsigned short *s_cl_helper \n");
1857 source.append(
" ) \n");
1858 source.append(
" { \n");
// The emitted code queries the work-item / work-group ids into locals; none of these
// locals is referenced again inside this emitted function.
1859 source.append(
" uint glb_id = get_global_id(0); \n");
1860 source.append(
" uint grp_id = get_group_id(0); \n");
1861 source.append(
" uint grp_nm = get_num_groups(0); \n");
1862 source.append(
" uint lcl_id = get_local_id(0); \n");
1863 source.append(
" uint lcl_sz = get_local_size(0); \n");
// Copy s_cl_helper into s_cl_blocking: element tid always, element tid_2 only when it is
// below num_threads_compaction; then synchronize on local memory.
1867 source.append(
" s_cl_blocking[tid] = s_cl_helper[tid]; \n");
1869 source.append(
" if (tid_2 < num_threads_compaction) \n");
1870 source.append(
" { \n");
1871 source.append(
" s_cl_blocking[tid_2] = s_cl_helper[tid_2]; \n");
1872 source.append(
" } \n");
1874 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1881 source.append(
" unsigned int offset = 1; \n");
// First pass: d starts at num_threads_compaction / 2 and halves each step while offset doubles.
// Work-items with tid < d add element ai into element bi of s_cl_blocking.
1884 source.append(
" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
1885 source.append(
" { \n");
1887 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1889 source.append(
" if (tid < d) \n");
1890 source.append(
" { \n");
1892 source.append(
" unsigned int ai = offset*(2*tid+1)-1; \n");
1893 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
1894 source.append(
" s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; \n");
1895 source.append(
" } \n");
1897 source.append(
" offset <<= 1; \n");
1898 source.append(
" } \n");
// Second pass: d starts at 2 and doubles while below num_threads_compaction; offset is halved
// at the top of each step. Work-items with tid < d-1 add element ai into element
// bi = ai + offset/2.
1901 source.append(
" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
1902 source.append(
" { \n");
1904 source.append(
" offset >>= 1; \n");
1905 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1908 source.append(
" if (tid < (d-1)) \n");
1909 source.append(
" { \n");
1911 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
1912 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
1913 source.append(
" s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; \n");
1914 source.append(
" } \n");
1915 source.append(
" } \n");
1917 source.append(
" } \n");
// Generator for the OpenCL device function scanSumBlocks().
// The C++ signature line is not part of this listing; the statements below append
// the OpenCL source text of that function to `source`, one source line per append().
1924 template <
typename StringType>
// numeric_string does not appear in the emitted text below; it is explicitly discarded here.
1927 (void)numeric_string;
1928 source.append(
" \n");
// Emitted signature. Note that tid_2 is accepted but not referenced in the emitted body.
1929 source.append(
" void \n");
1930 source.append(
" scanSumBlocks(const unsigned int tid, const unsigned int tid_2, \n");
1931 source.append(
" const unsigned int num_threads_active, \n");
1932 source.append(
" const unsigned int num_threads_compaction, \n");
1933 source.append(
" __local unsigned short *s_cl_blocking, \n");
1934 source.append(
" __local unsigned short *s_cl_helper) \n");
1935 source.append(
" { \n");
// The emitted code queries the work-item / work-group ids into locals; none of these
// locals is referenced again inside this emitted function.
1936 source.append(
" uint glb_id = get_global_id(0); \n");
1937 source.append(
" uint grp_id = get_group_id(0); \n");
1938 source.append(
" uint grp_nm = get_num_groups(0); \n");
1939 source.append(
" uint lcl_id = get_local_id(0); \n");
1940 source.append(
" uint lcl_sz = get_local_size(0); \n");
1942 source.append(
" unsigned int offset = 1; \n");
// First pass over s_cl_blocking: d starts at num_threads_compaction / 2 and halves each step
// while offset doubles. Each step begins with a local-memory barrier; work-items with
// tid < d add element ai into element bi.
1946 source.append(
" for (int d = num_threads_compaction >> 1; d > 0; d >>= 1) \n");
1947 source.append(
" { \n");
1949 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1951 source.append(
" if (tid < d) \n");
1952 source.append(
" { \n");
1954 source.append(
" unsigned int ai = offset*(2*tid+1)-1; \n");
1955 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
1957 source.append(
" s_cl_blocking[bi] += s_cl_blocking[ai]; \n");
1958 source.append(
" } \n");
1960 source.append(
" offset *= 2; \n");
1961 source.append(
" } \n");
// Second pass: d starts at 2 and doubles; the loop bound here is num_threads_compaction - 1.
// offset is halved at the top of each step and work-items with tid < d-1 add element ai
// into element bi = ai + offset/2.
1965 source.append(
" for (int d = 2; d < (num_threads_compaction - 1); d <<= 1) \n");
1966 source.append(
" { \n");
1968 source.append(
" offset >>= 1; \n");
1969 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1971 source.append(
" if (tid < (d-1)) \n");
1972 source.append(
" { \n");
1973 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
1974 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
1975 source.append(
" s_cl_blocking[bi] += s_cl_blocking[ai]; \n");
1976 source.append(
" } \n");
1977 source.append(
" } \n");
1978 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// After the final barrier, work-item 0 copies the entry at index num_threads_compaction - 1
// to index num_threads_active - 1, in both s_cl_helper and s_cl_blocking.
1980 source.append(
" if (0 == tid) \n");
1981 source.append(
" { \n");
1986 source.append(
" s_cl_helper[num_threads_active - 1] = \n");
1987 source.append(
" s_cl_helper[num_threads_compaction - 1]; \n");
1988 source.append(
" s_cl_blocking[num_threads_active - 1] = \n");
1989 source.append(
" s_cl_blocking[num_threads_compaction - 1]; \n");
1990 source.append(
" } \n");
1991 source.append(
" } \n");
// Generator for the OpenCL device function scanInitial().
// The C++ signature line is not part of this listing; the statements below append
// the OpenCL source text of that function to `source`, one source line per append().
1997 template <
typename StringType>
// numeric_string does not appear in the emitted text below; it is explicitly discarded here.
2000 (void)numeric_string;
2001 source.append(
" \n");
// Emitted signature. tid_2 and num_threads_active are accepted but not referenced in the
// emitted body; the four __local unsigned short arrays are all read and written.
2002 source.append(
" void \n");
2003 source.append(
" scanInitial(const unsigned int tid, const unsigned int tid_2, \n");
2004 source.append(
" const unsigned int num_threads_active, \n");
2005 source.append(
" const unsigned int num_threads_compaction, \n");
2006 source.append(
" __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult, \n");
2007 source.append(
" __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper \n");
2008 source.append(
" ) \n");
2009 source.append(
" { \n");
// The emitted code queries the work-item / work-group ids into locals; none of these
// locals is referenced again inside this emitted function.
2010 source.append(
" uint glb_id = get_global_id(0); \n");
2011 source.append(
" uint grp_id = get_group_id(0); \n");
2012 source.append(
" uint grp_nm = get_num_groups(0); \n");
2013 source.append(
" uint lcl_id = get_local_id(0); \n");
2014 source.append(
" uint lcl_sz = get_local_size(0); \n");
2022 source.append(
" unsigned int offset = 1; \n");
// First pass: d starts at num_threads_compaction / 2 and halves each step while offset doubles.
// Unlike the sibling scan functions, ai is computed WITHOUT the trailing "-1" here, so the
// left element of each pair is addressed as [ai - 1] throughout this loop.
2025 source.append(
" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
2026 source.append(
" { \n");
2028 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2030 source.append(
" if (tid < d) \n");
2031 source.append(
" { \n");
2033 source.append(
" unsigned int ai = offset*(2*tid+1); \n");
2034 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
// Accumulate the left element into the right element for both s_cl_one and s_cl_mult.
2036 source.append(
" s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1]; \n");
2037 source.append(
" s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1]; \n");
// Block bookkeeping, skipped when both helper flags are already 1:
//  - if exactly one of the two flags is 1, the other one is set to 1 as well;
//  - if neither is 1, the two s_cl_blocking entries are summed; a sum above
//    MAX_THREADS_BLOCK sets both flags to 1, otherwise the sum is stored at bi and the
//    left entry is zeroed.
2045 source.append(
" if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1)) \n");
2046 source.append(
" { \n");
2049 source.append(
" if (s_cl_helper[ai - 1] == 1) \n");
2050 source.append(
" { \n");
2052 source.append(
" s_cl_helper[bi] = 1; \n");
2053 source.append(
" } \n");
2054 source.append(
" else if (s_cl_helper[bi] == 1) \n");
2055 source.append(
" { \n");
2057 source.append(
" s_cl_helper[ai - 1] = 1; \n");
2058 source.append(
" } \n");
2059 source.append(
" else \n");
2060 source.append(
" { \n");
2062 source.append(
" unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1]; \n");
2064 source.append(
" if (temp > MAX_THREADS_BLOCK) \n");
2065 source.append(
" { \n");
2068 source.append(
" s_cl_helper[ai - 1] = 1; \n");
2069 source.append(
" s_cl_helper[bi] = 1; \n");
2070 source.append(
" } \n");
2071 source.append(
" else \n");
2072 source.append(
" { \n");
2074 source.append(
" s_cl_blocking[bi] = temp; \n");
2075 source.append(
" s_cl_blocking[ai - 1] = 0; \n");
2076 source.append(
" } \n");
2077 source.append(
" } \n");
2078 source.append(
" } \n");
2079 source.append(
" } \n");
2080 source.append(
" offset <<= 1; \n");
2081 source.append(
" } \n");
// Second pass: d starts at 2 and doubles while below num_threads_compaction; offset is halved
// at the top of each step. Only s_cl_one and s_cl_mult are updated here; s_cl_blocking and
// s_cl_helper are not touched by this loop.
2086 source.append(
" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
2087 source.append(
" { \n");
2088 source.append(
" offset >>= 1; \n");
2089 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2091 source.append(
" if (tid < (d-1)) \n");
2092 source.append(
" { \n");
2093 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
2094 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
2095 source.append(
" s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai]; \n");
2096 source.append(
" s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai]; \n");
2097 source.append(
" } \n");
2098 source.append(
" } \n");
2099 source.append(
" } \n");
// Generator for the OpenCL kernel bisectKernelLarge.
// The C++ signature line is not part of this listing; the statements below append the
// kernel's OpenCL source text to `source`. numeric_string is spliced into the text as the
// element type of the floating-point kernel arguments, the __local interval arrays and the
// private interval variables.
2113 template <
typename StringType>
2116 source.append(
" __kernel \n");
2117 source.append(
" void \n");
// Kernel arguments: two numeric input buffers (g_d, g_s), the size n, the bounds lg / ug with
// their counts, epsilon, and the __global output buffers written at the end of the kernel.
2118 source.append(
" bisectKernelLarge(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
2119 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
2120 source.append(
" const unsigned int n, \n");
2121 source.append(
" const "); source.append(numeric_string); source.append(
" lg, \n");
2122 source.append(
" const "); source.append(numeric_string); source.append(
" ug, \n");
2123 source.append(
" const unsigned int lg_eig_count, \n");
2124 source.append(
" const unsigned int ug_eig_count, \n");
2125 source.append(
" "); source.append(numeric_string); source.append(
" epsilon, \n");
2126 source.append(
" __global unsigned int *g_num_one, \n");
2127 source.append(
" __global unsigned int *g_num_blocks_mult, \n");
2128 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left_one, \n");
2129 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right_one, \n");
2130 source.append(
" __global unsigned int *g_pos_one, \n");
2131 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left_mult, \n");
2132 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right_mult, \n");
2133 source.append(
" __global unsigned int *g_left_count_mult, \n");
2134 source.append(
" __global unsigned int *g_right_count_mult, \n");
2135 source.append(
" __global unsigned int *g_blocks_mult, \n");
2136 source.append(
" __global unsigned int *g_blocks_mult_sum \n");
2137 source.append(
" ) \n");
2138 source.append(
" { \n");
// The emitted kernel advances g_s by one element before any use.
2139 source.append(
" g_s = g_s + 1; \n");
2140 source.append(
" uint glb_id = get_global_id(0); \n");
2141 source.append(
" uint grp_id = get_group_id(0); \n");
2142 source.append(
" uint grp_nm = get_num_groups(0); \n");
2143 source.append(
" uint lcl_id = get_local_id(0); \n");
2144 source.append(
" uint lcl_sz = get_local_size(0); \n");
// tid is the local work-item id.
2146 source.append(
" const unsigned int tid = lcl_id; \n");
// __local working arrays, each with 2 * MAX_THREADS_BLOCK + 1 elements: interval bounds
// (numeric type), interval counts and a compaction list (unsigned short).
2150 source.append(
" __local "); source.append(numeric_string); source.append(
" s_left[2 * MAX_THREADS_BLOCK + 1]; \n");
2151 source.append(
" __local "); source.append(numeric_string); source.append(
" s_right[2 * MAX_THREADS_BLOCK + 1]; \n");
2155 source.append(
" __local unsigned short s_left_count[2 * MAX_THREADS_BLOCK + 1]; \n");
2156 source.append(
" __local unsigned short s_right_count[2 * MAX_THREADS_BLOCK + 1]; \n");
2159 source.append(
" __local unsigned short s_compaction_list[2 * MAX_THREADS_BLOCK + 1]; \n");
// __local scalars shared by the work-group; they are written by work-item 0 below.
2164 source.append(
" __local unsigned int compact_second_chunk; \n");
2166 source.append(
" __local unsigned int all_threads_converged; \n");
2169 source.append(
" __local unsigned int num_threads_active; \n");
2172 source.append(
" __local unsigned int num_threads_compaction; \n");
// s_compaction_list_exc is s_compaction_list shifted by one element.
2175 source.append(
" __local unsigned short *s_compaction_list_exc = s_compaction_list + 1; \n");
// Per-work-item interval state. The emitted initializer literal is 0.0f regardless of
// numeric_string.
2179 source.append(
" "); source.append(numeric_string); source.append(
" left = 0.0f; \n");
2180 source.append(
" "); source.append(numeric_string); source.append(
" right = 0.0f; \n");
2181 source.append(
" unsigned int left_count = 0; \n");
2182 source.append(
" unsigned int right_count = 0; \n");
2184 source.append(
" "); source.append(numeric_string); source.append(
" mid = 0.0f; \n");
2186 source.append(
" unsigned int mid_count = 0; \n");
2188 source.append(
" unsigned int is_active_second = 0; \n");
// Each work-item zeroes its own slot [tid] of the __local arrays.
2191 source.append(
" s_compaction_list[tid] = 0; \n");
2192 source.append(
" s_left[tid] = 0; \n");
2193 source.append(
" s_right[tid] = 0; \n");
2194 source.append(
" s_left_count[tid] = 0; \n");
2195 source.append(
" s_right_count[tid] = 0; \n");
2197 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Work-item 0 seeds slot 0 with the interval [lg, ug] and its counts, and initializes the
// shared scalars (one active interval, compaction size 1).
2200 source.append(
" if (0 == tid) \n");
2201 source.append(
" { \n");
2203 source.append(
" s_left[0] = lg; \n");
2204 source.append(
" s_right[0] = ug; \n");
2205 source.append(
" s_left_count[0] = lg_eig_count; \n");
2206 source.append(
" s_right_count[0] = ug_eig_count; \n");
2208 source.append(
" compact_second_chunk = 0; \n");
2209 source.append(
" num_threads_active = 1; \n");
2211 source.append(
" num_threads_compaction = 1; \n");
2213 source.append(
" all_threads_converged = 1; \n");
2214 source.append(
" } \n");
2216 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Main loop: at most 15 iterations. It exits early when all_threads_converged is still 1
// after the subdivision step, or when num_threads_compaction exceeds the work-group size.
2222 source.append(
" for( unsigned int i = 0; i < 15; ++i ) \n");
2223 source.append(
" { \n");
// Clear the compaction list (slots tid, tid + MAX_THREADS_BLOCK and the last slot), then
// call subdivideActiveIntervalShort (defined elsewhere in the generated program).
2224 source.append(
" s_compaction_list[tid] = 0; \n");
2225 source.append(
" s_compaction_list[tid + MAX_THREADS_BLOCK] = 0; \n");
2226 source.append(
" s_compaction_list[2 * MAX_THREADS_BLOCK] = 0; \n");
2227 source.append(
" subdivideActiveIntervalShort(tid, s_left, s_right, s_left_count, s_right_count, \n");
2228 source.append(
" num_threads_active, \n");
2229 source.append(
" &left, &right, &left_count, &right_count, \n");
2230 source.append(
" &mid, &all_threads_converged); \n");
2232 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2235 source.append(
" if (1 == all_threads_converged) \n");
2236 source.append(
" { \n");
2237 source.append(
" break; \n");
2238 source.append(
" } \n");
// mid_count is obtained from computeNumSmallerEigenvalsLarge for the value mid; the last
// argument passes the result of (left == right).
2245 source.append(
" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
2246 source.append(
" mid, lcl_id, \n");
2247 source.append(
" num_threads_active, \n");
2248 source.append(
" s_left, s_right, \n");
2249 source.append(
" (left == right)); \n");
2251 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Active work-items store their results: intervals with left != right go through
// storeNonEmptyIntervalsLarge; otherwise the slot is rewritten with both bounds set to
// `left` and is_active_second is cleared.
2263 source.append(
" if (tid < num_threads_active) \n");
2264 source.append(
" { \n");
2266 source.append(
" if (left != right) \n");
2267 source.append(
" { \n");
2270 source.append(
" storeNonEmptyIntervalsLarge(tid, num_threads_active, \n");
2271 source.append(
" s_left, s_right, \n");
2272 source.append(
" s_left_count, s_right_count, \n");
2273 source.append(
" left, mid, right, \n");
2274 source.append(
" left_count, mid_count, right_count, \n");
2275 source.append(
" epsilon, &compact_second_chunk, \n");
2276 source.append(
" s_compaction_list_exc, \n");
2277 source.append(
" &is_active_second); \n");
2278 source.append(
" } \n");
2279 source.append(
" else \n");
2280 source.append(
" { \n");
2285 source.append(
" s_left[tid] = left; \n");
2286 source.append(
" s_right[tid] = left; \n");
2287 source.append(
" s_left_count[tid] = left_count; \n");
2288 source.append(
" s_right_count[tid] = right_count; \n");
2290 source.append(
" is_active_second = 0; \n");
2291 source.append(
" } \n");
2292 source.append(
" } \n");
2295 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// When compact_second_chunk is non-zero: build compaction indices, synchronize, then
// compact the second-child intervals. The two steps are guarded separately so that the
// barrier between them sits outside the conditionals.
2300 source.append(
" if (compact_second_chunk > 0) \n");
2301 source.append(
" { \n");
2304 source.append(
" createIndicesCompactionShort(s_compaction_list_exc, num_threads_compaction); \n");
2305 source.append(
" } \n");
2306 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2307 source.append(
" \n");
2308 source.append(
" if (compact_second_chunk > 0) \n");
2309 source.append(
" { \n");
2310 source.append(
" compactIntervalsShort(s_left, s_right, s_left_count, s_right_count, \n");
2311 source.append(
" mid, right, mid_count, right_count, \n");
2312 source.append(
" s_compaction_list, num_threads_active, \n");
2313 source.append(
" is_active_second); \n");
2314 source.append(
" } \n");
2316 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Work-item 0 grows num_threads_active by s_compaction_list[num_threads_active], sets
// num_threads_compaction to ceilPow2 of the new value, and resets the per-iteration flags.
2319 source.append(
" if (0 == tid) \n");
2320 source.append(
" { \n");
2323 source.append(
" num_threads_active += s_compaction_list[num_threads_active]; \n");
2324 source.append(
" num_threads_compaction = ceilPow2(num_threads_active); \n");
2326 source.append(
" compact_second_chunk = 0; \n");
2327 source.append(
" all_threads_converged = 1; \n");
2328 source.append(
" } \n");
2329 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2330 source.append(
" if (num_threads_compaction > lcl_sz) \n");
2331 source.append(
" { \n");
2332 source.append(
" break; \n");
2333 source.append(
" } \n");
2334 source.append(
" } \n");
2335 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// After the loop each work-item handles up to two slots: tid and tid_2 = tid + lcl_sz.
// left_count_2 / right_count_2 are only assigned when tid_2 < num_threads_active.
2343 source.append(
" unsigned int left_count_2; \n");
2344 source.append(
" unsigned int right_count_2; \n");
2346 source.append(
" unsigned int tid_2 = tid + lcl_sz; \n");
2350 source.append(
" left_count = s_left_count[tid]; \n");
2351 source.append(
" right_count = s_right_count[tid]; \n");
2354 source.append(
" if (tid_2 < num_threads_active) \n");
2355 source.append(
" { \n");
2356 source.append(
" left_count_2 = s_left_count[tid_2]; \n");
2357 source.append(
" right_count_2 = s_right_count[tid_2]; \n");
2358 source.append(
" } \n");
// The count arrays and the compaction list are reused as scan buffers: s_cl_one and
// s_cl_mult point one element into s_left_count / s_right_count, and s_cl_blocking is
// s_compaction_list_exc. s_cl_helper is a separate __local array.
2362 source.append(
" __local unsigned short *s_cl_one = s_left_count + 1; \n");
2363 source.append(
" __local unsigned short *s_cl_mult = s_right_count + 1; \n");
2367 source.append(
" __local unsigned short *s_cl_blocking = s_compaction_list_exc; \n");
2369 source.append(
" __local unsigned short s_cl_helper[2 * MAX_THREADS_BLOCK + 1]; \n");
2371 source.append(
" if (0 == tid) \n");
2372 source.append(
" { \n");
2374 source.append(
" s_left_count[0] = 0; \n");
2375 source.append(
" s_right_count[0] = 0; \n");
2376 source.append(
" \n");
2377 source.append(
" } \n");
2379 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Classify each interval by multiplicity = right_count - left_count: the "one" flag is set
// when it is exactly 1, the "mult" flag otherwise. s_cl_blocking receives 0 for "one"
// intervals and the multiplicity otherwise; s_cl_helper is cleared.
2382 source.append(
" unsigned int is_one_lambda = 0; \n");
2383 source.append(
" unsigned int is_one_lambda_2 = 0; \n");
2386 source.append(
" unsigned int multiplicity = right_count - left_count; \n");
2387 source.append(
" is_one_lambda = (1 == multiplicity); \n");
2389 source.append(
" s_cl_one[tid] = is_one_lambda; \n");
2390 source.append(
" s_cl_mult[tid] = (! is_one_lambda); \n");
2393 source.append(
" s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity; \n");
2394 source.append(
" s_cl_helper[tid] = 0; \n");
// Same classification for the second slot when it is active; otherwise, if tid_2 is still
// below 2 * MAX_THREADS_BLOCK + 1, only s_cl_blocking and s_cl_helper are zeroed.
2396 source.append(
" if (tid_2 < num_threads_active) \n");
2397 source.append(
" { \n");
2399 source.append(
" unsigned int multiplicity = right_count_2 - left_count_2; \n");
2400 source.append(
" is_one_lambda_2 = (1 == multiplicity); \n");
2402 source.append(
" s_cl_one[tid_2] = is_one_lambda_2; \n");
2403 source.append(
" s_cl_mult[tid_2] = (! is_one_lambda_2); \n");
2406 source.append(
" s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity; \n");
2407 source.append(
" s_cl_helper[tid_2] = 0; \n");
2408 source.append(
" } \n");
2409 source.append(
" else if (tid_2 < (2 * MAX_THREADS_BLOCK + 1)) \n");
2410 source.append(
" { \n");
2413 source.append(
" s_cl_blocking[tid_2] = 0; \n");
2414 source.append(
" s_cl_helper[tid_2] = 0; \n");
2415 source.append(
" } \n");
// Run scanInitial and then scanSumBlocks over the scan buffers, with a barrier after each.
2418 source.append(
" scanInitial(tid, tid_2, num_threads_active, num_threads_compaction, \n");
2419 source.append(
" s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper); \n");
2420 source.append(
" \n");
2421 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2423 source.append(
" scanSumBlocks(tid, tid_2, num_threads_active, \n");
2424 source.append(
" num_threads_compaction, s_cl_blocking, s_cl_helper); \n");
2427 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// For slots whose helper flag is 1, capture s_cl_mult + 1 and the s_cl_blocking value into
// private variables before scanCompactBlocksStartAddress overwrites s_cl_blocking.
2429 source.append(
" unsigned int c_block_iend = 0; \n");
2430 source.append(
" unsigned int c_block_iend_2 = 0; \n");
2431 source.append(
" unsigned int c_sum_block = 0; \n");
2432 source.append(
" unsigned int c_sum_block_2 = 0; \n");
2439 source.append(
" if (1 == s_cl_helper[tid]) \n");
2440 source.append(
" { \n");
2442 source.append(
" c_block_iend = s_cl_mult[tid] + 1; \n");
2443 source.append(
" c_sum_block = s_cl_blocking[tid]; \n");
2444 source.append(
" } \n");
2446 source.append(
" if (1 == s_cl_helper[tid_2]) \n");
2447 source.append(
" { \n");
2449 source.append(
" c_block_iend_2 = s_cl_mult[tid_2] + 1; \n");
2450 source.append(
" c_sum_block_2 = s_cl_blocking[tid_2]; \n");
2451 source.append(
" } \n");
2453 source.append(
" scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction, \n");
2454 source.append(
" s_cl_blocking, s_cl_helper); \n");
2458 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Work-item 0 reads the entries at index num_threads_active - 1 of the three scan buffers
// into shared scalars and writes two of them to g_num_one and g_num_blocks_mult.
2461 source.append(
" __local unsigned int num_blocks_mult; \n");
2462 source.append(
" __local unsigned int num_mult; \n");
2463 source.append(
" __local unsigned int offset_mult_lambda; \n");
2465 source.append(
" if (0 == tid) \n");
2466 source.append(
" { \n");
2468 source.append(
" num_blocks_mult = s_cl_blocking[num_threads_active - 1]; \n");
2469 source.append(
" offset_mult_lambda = s_cl_one[num_threads_active - 1]; \n");
2470 source.append(
" num_mult = s_cl_mult[num_threads_active - 1]; \n");
2472 source.append(
" *g_num_one = offset_mult_lambda; \n");
2473 source.append(
" *g_num_blocks_mult = num_blocks_mult; \n");
2474 source.append(
" } \n");
2476 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// The three scan-buffer pointers are moved back by one element (undoing the +1 applied when
// they were set up) before being handed to compactStreamsFinal.
2478 source.append(
" "); source.append(numeric_string); source.append(
" left_2, right_2; \n");
2479 source.append(
" --s_cl_one; \n");
2480 source.append(
" --s_cl_mult; \n");
2481 source.append(
" --s_cl_blocking; \n");
2482 source.append(
" \n");
2483 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2484 source.append(
" compactStreamsFinal(tid, tid_2, num_threads_active, &offset_mult_lambda, \n");
2485 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
2486 source.append(
" s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper, \n");
2487 source.append(
" is_one_lambda, is_one_lambda_2, \n");
2488 source.append(
" &left, &right, &left_2, &right_2, \n");
2489 source.append(
" &left_count, &right_count, &left_count_2, &right_count_2, \n");
2490 source.append(
" c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2 \n");
2491 source.append(
" ); \n");
2493 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Work-item 0 stores num_mult at s_cl_blocking[num_blocks_mult] and zeroes s_cl_helper[0].
2496 source.append(
" if (0 == tid) \n");
2497 source.append(
" { \n");
2498 source.append(
" s_cl_blocking[num_blocks_mult] = num_mult; \n");
2499 source.append(
" s_cl_helper[0] = 0; \n");
2500 source.append(
" } \n");
2502 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
// Final step: writeToGmem receives the __local arrays and all __global output buffers.
2505 source.append(
" writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult, \n");
2506 source.append(
" g_left_one, g_right_one, g_pos_one, \n");
2507 source.append(
" g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult, \n");
2508 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
2509 source.append(
" g_blocks_mult, g_blocks_mult_sum, \n");
2510 source.append(
" s_compaction_list, s_cl_helper, offset_mult_lambda); \n");
2511 source.append(
" \n");
2513 source.append(
" } \n");
// Fragment of a class template parameterized on NumericT. Only some of its lines appear in
// this listing; the enclosing declarations and most statements are not shown.
2520 template <
class NumericT>
// Map from the raw OpenCL context handle to a flag; the flag is set to true further below,
// right after the program has been added to that context.
2533 static std::map<cl_context, bool> init_done;
// Reserve capacity in `source` up front.
2537 source.reserve(8192);
// NOTE(review): presumably emits the double-precision extension pragma when NumericT needs
// it -- confirm against viennacl::ocl::append_double_precision_pragma.
2539 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
// The statements guarded by this condition are not visible in this listing.
2542 if (numeric_string ==
"float" || numeric_string ==
"double")
2587 #ifdef VIENNACL_BUILD_INFO
2588 std::cout <<
"Creating program " << prog_name << std::endl;
// Register the assembled source under prog_name and mark this context in init_done.
2590 ctx.add_program(source, prog_name);
2591 init_done[ctx.handle().get()] =
true;
2600 #endif // #ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType &source, std::string const &numeric_string)
Perform second step of bisection algorithm for large matrices for intervals that after the first step...
void generate_bisect_kernel_writeToGmem(StringType &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
void generate_bisect_kernel_bisectKernel(StringType &source, std::string const &numeric_string)
OpenCL kernel for bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix...
void generate_bisect_kernel_subdivideActiveInterval(StringType &source, std::string const &numeric_string)
Subdivide interval if active and not already converged.
void generate_bisect_kernel_floorPow2(StringType &source, std::string const &numeric_string)
OpenCL function for computing the next lower power of two of n.
void generate_bisect_kernel_bisectKernelLarge(StringType &source, std::string const &numeric_string)
OpenCL kernel for bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix...
void generate_bisect_kernel_compactStreamsFinal(StringType &source, std::string const &numeric_string)
OpenCL function for performing final stream compaction before writing data to global memory...
void generate_bisect_kernel_scanCompactBlocksStartAddress(StringType &source, std::string const &numeric_string)
OpenCL function for computing addresses to obtain compact list of block start addresses.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_bisect_kernel_compactIntervals(StringType &source, std::string const &numeric_string)
OpenCL function for performing stream compaction for second child intervals.
void generate_bisect_kernel_createIndicesCompaction(StringType &source, std::string const &numeric_string)
OpenCL function for creating indices for compaction.
void generate_bisect_kernel_compactIntervalsShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeIntervalConvergedShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_scanSumBlocks(StringType &source, std::string const &numeric_string)
OpenCL function for performing scan to obtain number of eigenvalues before a specific block...
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
static void apply(viennacl::ocl::context const &)
static std::string program_name()
const OCL_TYPE & get() const
void generate_bisect_kernel_ceilPow2(StringType &source, std::string const &numeric_string)
OpenCL function for computing the next higher power of two of n.
void generate_bisect_kernel_createIndicesCompactionShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_computeMidpoint(StringType &source, std::string const &numeric_string)
OpenCL function for computing the midpoint of an interval [left, right] avoiding overflow if possible...
void generate_bisect_kernel_storeNonEmptyIntervals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_config(StringType &source)
void generate_bisect_kernel_subdivideActiveIntervalShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_computeNumSmallerEigenvals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeIntervalShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_scanInitial(StringType &source, std::string const &numeric_string)
Perform initial scan for compaction of intervals containing one and multiple eigenvalues; also do ini...
void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType &source, std::string const &numeric_string)
OpenCL function for storing all non-empty intervals resulting from the subdivision of the interval cu...
Helper class for converting a type to its string representation.
void generate_bisect_kernel_storeInterval(StringType &source, std::string const &numeric_string)
OpenCL function for checking if interval converged and store appropriately.
Main kernel class for the generation of the bisection kernels and utilities.
void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType &source, std::string const &numeric_string)
OpenCL kernel for determining eigenvalues for large matrices for intervals that after the first step ...
void generate_bisect_kernel_storeIntervalConverged(StringType &source, std::string const &numeric_string)