1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * A * B + beta * C, with A, B and C each addressed as
// row + column * internal_rows.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
37 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
41 unsigned int A_row_start,
42 unsigned int A_col_start,
43 unsigned int A_row_inc,
44 unsigned int A_col_inc,
45 unsigned int A_row_size,
46 unsigned int A_col_size,
47 unsigned int A_internal_rows,
48 unsigned int A_internal_cols,
// Layout of B, same meaning as the A_* parameters.
50 unsigned int B_row_start,
51 unsigned int B_col_start,
52 unsigned int B_row_inc,
53 unsigned int B_col_inc,
54 unsigned int B_row_size,
55 unsigned int B_col_size,
56 unsigned int B_internal_rows,
57 unsigned int B_internal_cols,
// Layout of C, same meaning as the A_* parameters.
60 unsigned int C_row_start,
61 unsigned int C_col_start,
62 unsigned int C_row_inc,
63 unsigned int C_col_inc,
64 unsigned int C_row_size,
65 unsigned int C_col_size,
66 unsigned int C_internal_rows,
67 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
70 __shared__ NumericT bufA[272];
71 __shared__ NumericT bufB[272];
// aBegin: first element of the rows of A handled by this block (row_block_id).
// aStep: element distance of block_size columns of A; its use is not visible.
78 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
79 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
// bBegin: first element of the columns of B handled by this block (col_block_id).
80 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Ceiling of A_col_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
82 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offset inside a tile: row_thread_id selects the row and
// col_thread_id the column, for both A and B.
84 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
85 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Start of this thread's row / column line inside the padded shared tiles.
87 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
88 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// bufA is written as [row_thread_id][col_thread_id], bufB as
// [col_thread_id][row_thread_id].
93 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
94 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
96 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
97 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
98 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
99 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
100 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
101 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
102 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
103 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
104 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
105 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
106 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
107 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
108 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
109 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
110 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
111 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
112 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
113 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_row_size x B_col_size
// write a result. C is addressed as row + column * C_internal_rows, and its
// old value is not read when beta == 0.
118 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
119 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * A * trans(B) + beta * C, with A, B and C each addressed as
// row + column * internal_rows.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
124 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
128 unsigned int A_row_start,
129 unsigned int A_col_start,
130 unsigned int A_row_inc,
131 unsigned int A_col_inc,
132 unsigned int A_row_size,
133 unsigned int A_col_size,
134 unsigned int A_internal_rows,
135 unsigned int A_internal_cols,
// Layout of B, same meaning as the A_* parameters.
137 unsigned int B_row_start,
138 unsigned int B_col_start,
139 unsigned int B_row_inc,
140 unsigned int B_col_inc,
141 unsigned int B_row_size,
142 unsigned int B_col_size,
143 unsigned int B_internal_rows,
144 unsigned int B_internal_cols,
// Layout of C, same meaning as the A_* parameters.
147 unsigned int C_row_start,
148 unsigned int C_col_start,
149 unsigned int C_row_inc,
150 unsigned int C_col_inc,
151 unsigned int C_row_size,
152 unsigned int C_col_size,
153 unsigned int C_internal_rows,
154 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
157 __shared__ NumericT bufA[272];
158 __shared__ NumericT bufB[272];
// aBegin: first element of the rows of A handled by this block (row_block_id).
// aStep: element distance of block_size columns of A; its use is not visible.
165 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
166 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
// bBegin: first element of the rows of B handled by this block (col_block_id).
// bStep: element distance of block_size columns of B; its use is not visible.
167 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
168 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Ceiling of A_col_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
169 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offset inside a tile: row_thread_id selects the row and
// col_thread_id the column, for both A and B.
171 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
172 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Start of this thread's row / column line inside the padded shared tiles.
174 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
175 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// Both bufA and bufB are written as [row_thread_id][col_thread_id].
180 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
181 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
183 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
184 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
185 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
186 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
187 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
188 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
189 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
190 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
191 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
192 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
193 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
194 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
195 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
196 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
197 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
198 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
199 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
200 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_row_size x B_row_size
// write a result. C is addressed as row + column * C_internal_rows, and its
// old value is not read when beta == 0.
205 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
206 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * trans(A) * B + beta * C, with A, B and C each addressed as
// row + column * internal_rows.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
211 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
215 unsigned int A_row_start,
216 unsigned int A_col_start,
217 unsigned int A_row_inc,
218 unsigned int A_col_inc,
219 unsigned int A_row_size,
220 unsigned int A_col_size,
221 unsigned int A_internal_rows,
222 unsigned int A_internal_cols,
// Layout of B, same meaning as the A_* parameters.
224 unsigned int B_row_start,
225 unsigned int B_col_start,
226 unsigned int B_row_inc,
227 unsigned int B_col_inc,
228 unsigned int B_row_size,
229 unsigned int B_col_size,
230 unsigned int B_internal_rows,
231 unsigned int B_internal_cols,
// Layout of C, same meaning as the A_* parameters.
234 unsigned int C_row_start,
235 unsigned int C_col_start,
236 unsigned int C_row_inc,
237 unsigned int C_col_inc,
238 unsigned int C_row_size,
239 unsigned int C_col_size,
240 unsigned int C_internal_rows,
241 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
244 __shared__ NumericT bufA[272];
245 __shared__ NumericT bufB[272];
// aBegin: first element of the columns of A handled by this block (row_block_id).
252 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
// bBegin: first element of the columns of B handled by this block (col_block_id).
254 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Ceiling of A_row_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
256 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offset inside a tile: row_thread_id selects the row and
// col_thread_id the column, for both A and B.
258 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
259 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Start of this thread's row / column line inside the padded shared tiles.
261 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
262 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// Both bufA and bufB are written as [col_thread_id][row_thread_id].
267 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
268 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
270 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
271 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
272 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
273 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
274 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
275 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
276 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
277 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
278 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
279 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
280 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
281 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
282 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
283 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
284 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
285 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
286 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
287 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_col_size x B_col_size
// write a result. C is addressed as row + column * C_internal_rows, and its
// old value is not read when beta == 0.
292 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
293 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * trans(A) * trans(B) + beta * C, with A, B and C each addressed
// as row + column * internal_rows.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
298 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
302 unsigned int A_row_start,
303 unsigned int A_col_start,
304 unsigned int A_row_inc,
305 unsigned int A_col_inc,
306 unsigned int A_row_size,
307 unsigned int A_col_size,
308 unsigned int A_internal_rows,
309 unsigned int A_internal_cols,
// Layout of B, same meaning as the A_* parameters.
311 unsigned int B_row_start,
312 unsigned int B_col_start,
313 unsigned int B_row_inc,
314 unsigned int B_col_inc,
315 unsigned int B_row_size,
316 unsigned int B_col_size,
317 unsigned int B_internal_rows,
318 unsigned int B_internal_cols,
// Layout of C, same meaning as the A_* parameters.
321 unsigned int C_row_start,
322 unsigned int C_col_start,
323 unsigned int C_row_inc,
324 unsigned int C_col_inc,
325 unsigned int C_row_size,
326 unsigned int C_col_size,
327 unsigned int C_internal_rows,
328 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
331 __shared__ NumericT bufA[272];
332 __shared__ NumericT bufB[272];
// aBegin: first element of the columns of A handled by this block (row_block_id).
339 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
// bBegin: first element of the rows of B handled by this block (col_block_id).
// bStep: element distance of block_size columns of B; its use is not visible.
341 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
342 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Ceiling of A_row_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
343 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offset inside a tile: row_thread_id selects the row and
// col_thread_id the column, for both A and B.
345 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
346 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Start of this thread's row / column line inside the padded shared tiles.
348 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
349 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// bufA is written as [col_thread_id][row_thread_id], bufB as
// [row_thread_id][col_thread_id].
354 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
355 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
357 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
358 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
359 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
360 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
361 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
362 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
363 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
364 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
365 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
366 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
367 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
368 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
369 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
370 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
371 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
372 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
373 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
374 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_col_size x B_row_size
// write a result. C is addressed as row + column * C_internal_rows, and its
// old value is not read when beta == 0.
379 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
380 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * A * B + beta * C, with A and B addressed as
// row + column * internal_rows and C addressed as row * C_internal_cols + column.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
392 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
396 unsigned int A_row_start,
397 unsigned int A_col_start,
398 unsigned int A_row_inc,
399 unsigned int A_col_inc,
400 unsigned int A_row_size,
401 unsigned int A_col_size,
402 unsigned int A_internal_rows,
403 unsigned int A_internal_cols,
// Layout of B, same meaning as the A_* parameters.
405 unsigned int B_row_start,
406 unsigned int B_col_start,
407 unsigned int B_row_inc,
408 unsigned int B_col_inc,
409 unsigned int B_row_size,
410 unsigned int B_col_size,
411 unsigned int B_internal_rows,
412 unsigned int B_internal_cols,
// Layout of C; here C_internal_cols is the row stride used in the final store.
415 unsigned int C_row_start,
416 unsigned int C_col_start,
417 unsigned int C_row_inc,
418 unsigned int C_col_inc,
419 unsigned int C_row_size,
420 unsigned int C_col_size,
421 unsigned int C_internal_rows,
422 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
425 __shared__ NumericT bufA[272];
426 __shared__ NumericT bufB[272];
// aBegin: first element of the rows of A handled by this block (row_block_id).
// aStep: element distance of block_size columns of A; its use is not visible.
433 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
434 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
// bBegin: first element of the columns of B handled by this block (col_block_id).
435 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Ceiling of A_col_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
437 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offset inside a tile: row_thread_id selects the row and
// col_thread_id the column, for both A and B.
439 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
440 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Start of this thread's row / column line inside the padded shared tiles.
442 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
443 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// bufA is written as [row_thread_id][col_thread_id], bufB as
// [col_thread_id][row_thread_id].
448 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
449 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
451 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
452 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
453 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
454 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
455 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
456 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
457 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
458 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
459 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
460 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
461 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
462 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
463 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
464 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
465 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
466 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
467 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
468 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_row_size x B_col_size
// write a result. C is addressed as row * C_internal_cols + column, and its
// old value is not read when beta == 0.
473 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
474 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * A * trans(B) + beta * C, with A and B addressed as
// row + column * internal_rows and C addressed as row * C_internal_cols + column.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
479 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
483 unsigned int A_row_start,
484 unsigned int A_col_start,
485 unsigned int A_row_inc,
486 unsigned int A_col_inc,
487 unsigned int A_row_size,
488 unsigned int A_col_size,
489 unsigned int A_internal_rows,
490 unsigned int A_internal_cols,
// Layout of B, same meaning as the A_* parameters.
492 unsigned int B_row_start,
493 unsigned int B_col_start,
494 unsigned int B_row_inc,
495 unsigned int B_col_inc,
496 unsigned int B_row_size,
497 unsigned int B_col_size,
498 unsigned int B_internal_rows,
499 unsigned int B_internal_cols,
// Layout of C; here C_internal_cols is the row stride used in the final store.
502 unsigned int C_row_start,
503 unsigned int C_col_start,
504 unsigned int C_row_inc,
505 unsigned int C_col_inc,
506 unsigned int C_row_size,
507 unsigned int C_col_size,
508 unsigned int C_internal_rows,
509 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
512 __shared__ NumericT bufA[272];
513 __shared__ NumericT bufB[272];
// aBegin: first element of the rows of A handled by this block (row_block_id).
// aStep: element distance of block_size columns of A; its use is not visible.
520 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
521 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
// bBegin: first element of the rows of B handled by this block (col_block_id).
// bStep: element distance of block_size columns of B; its use is not visible.
522 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
523 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Ceiling of A_col_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
524 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offset inside a tile: row_thread_id selects the row and
// col_thread_id the column, for both A and B.
526 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
527 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Start of this thread's row / column line inside the padded shared tiles.
529 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
530 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// Both bufA and bufB are written as [row_thread_id][col_thread_id].
535 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
536 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
538 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
539 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
540 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
541 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
542 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
543 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
544 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
545 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
546 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
547 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
548 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
549 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
550 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
551 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
552 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
553 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
554 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
555 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_row_size x B_row_size
// write a result. C is addressed as row * C_internal_cols + column, and its
// old value is not read when beta == 0.
560 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
561 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * trans(A) * B + beta * C, with A and B addressed as
// row + column * internal_rows and C addressed as row * C_internal_cols + column.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
566 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
570 unsigned int A_row_start,
571 unsigned int A_col_start,
572 unsigned int A_row_inc,
573 unsigned int A_col_inc,
574 unsigned int A_row_size,
575 unsigned int A_col_size,
576 unsigned int A_internal_rows,
577 unsigned int A_internal_cols,
// Layout of B, same meaning as the A_* parameters.
579 unsigned int B_row_start,
580 unsigned int B_col_start,
581 unsigned int B_row_inc,
582 unsigned int B_col_inc,
583 unsigned int B_row_size,
584 unsigned int B_col_size,
585 unsigned int B_internal_rows,
586 unsigned int B_internal_cols,
// Layout of C; here C_internal_cols is the row stride used in the final store.
589 unsigned int C_row_start,
590 unsigned int C_col_start,
591 unsigned int C_row_inc,
592 unsigned int C_col_inc,
593 unsigned int C_row_size,
594 unsigned int C_col_size,
595 unsigned int C_internal_rows,
596 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
599 __shared__ NumericT bufA[272];
600 __shared__ NumericT bufB[272];
// aBegin: first element of the columns of A handled by this block (row_block_id).
607 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
// bBegin: first element of the columns of B handled by this block (col_block_id).
609 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Ceiling of A_row_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
611 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offset inside a tile: row_thread_id selects the row and
// col_thread_id the column, for both A and B.
613 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
614 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Start of this thread's row / column line inside the padded shared tiles.
616 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
617 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// Both bufA and bufB are written as [col_thread_id][row_thread_id].
622 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
623 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
625 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
626 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
627 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
628 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
629 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
630 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
631 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
632 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
633 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
634 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
635 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
636 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
637 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
638 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
639 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
640 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
641 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
642 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_col_size x B_col_size
// write a result. C is addressed as row * C_internal_cols + column, and its
// old value is not read when beta == 0.
647 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
648 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * trans(A) * trans(B) + beta * C, with A and B addressed as
// row + column * internal_rows and C addressed as row * C_internal_cols + column.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
653 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
657 unsigned int A_row_start,
658 unsigned int A_col_start,
659 unsigned int A_row_inc,
660 unsigned int A_col_inc,
661 unsigned int A_row_size,
662 unsigned int A_col_size,
663 unsigned int A_internal_rows,
664 unsigned int A_internal_cols,
// Layout of B, same meaning as the A_* parameters.
666 unsigned int B_row_start,
667 unsigned int B_col_start,
668 unsigned int B_row_inc,
669 unsigned int B_col_inc,
670 unsigned int B_row_size,
671 unsigned int B_col_size,
672 unsigned int B_internal_rows,
673 unsigned int B_internal_cols,
// Layout of C; here C_internal_cols is the row stride used in the final store.
676 unsigned int C_row_start,
677 unsigned int C_col_start,
678 unsigned int C_row_inc,
679 unsigned int C_col_inc,
680 unsigned int C_row_size,
681 unsigned int C_col_size,
682 unsigned int C_internal_rows,
683 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
686 __shared__ NumericT bufA[272];
687 __shared__ NumericT bufB[272];
// aBegin: first element of the columns of A handled by this block (row_block_id).
694 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
// bBegin: first element of the rows of B handled by this block (col_block_id).
// bStep: element distance of block_size columns of B; its use is not visible.
696 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
697 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Ceiling of A_row_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
698 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offset inside a tile: row_thread_id selects the row and
// col_thread_id the column, for both A and B.
700 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
701 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Start of this thread's row / column line inside the padded shared tiles.
703 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
704 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// bufA is written as [col_thread_id][row_thread_id], bufB as
// [row_thread_id][col_thread_id].
709 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
710 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
712 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
713 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
714 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
715 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
716 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
717 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
718 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
719 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
720 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
721 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
722 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
723 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
724 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
725 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
726 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
727 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
728 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
729 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_col_size x B_row_size
// write a result. C is addressed as row * C_internal_cols + column, and its
// old value is not read when beta == 0.
734 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
735 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Fragment of a templated GPU matrix-product routine. The signature line, the
// tile-loop header, braces and any barriers are missing from this listing, so
// the comments below describe only the statements that are visible.
// Visible behaviour: one tile of A and one of B are stored in shared memory,
// 16 products are accumulated into Csub, and alpha * Csub (+ beta * C) is
// written to C. The index arithmetic is consistent with
// C = alpha * A * B + beta * C, with A and C addressed as
// row + column * internal_rows and B addressed as row * B_internal_cols + column.
// alpha, beta, A, B, C, Csub, block, block_size and the *_block_id /
// *_thread_id values are declared on lines that are not shown.
748 template<
typename NumericT>
// Layout of A: start offsets, increments (strides), logical sizes and internal
// dimensions; A_internal_rows is the column stride in the index formulas below.
752 unsigned int A_row_start,
753 unsigned int A_col_start,
754 unsigned int A_row_inc,
755 unsigned int A_col_inc,
756 unsigned int A_row_size,
757 unsigned int A_col_size,
758 unsigned int A_internal_rows,
759 unsigned int A_internal_cols,
// Layout of B; here B_internal_cols is the row stride in the index formulas below.
761 unsigned int B_row_start,
762 unsigned int B_col_start,
763 unsigned int B_row_inc,
764 unsigned int B_col_inc,
765 unsigned int B_row_size,
766 unsigned int B_col_size,
767 unsigned int B_internal_rows,
768 unsigned int B_internal_cols,
// Layout of C, same meaning as the A_* parameters.
771 unsigned int C_row_start,
772 unsigned int C_col_start,
773 unsigned int C_row_inc,
774 unsigned int C_col_inc,
775 unsigned int C_row_size,
776 unsigned int C_col_size,
777 unsigned int C_internal_rows,
778 unsigned int C_internal_cols)
// Shared-memory tiles. Rows are addressed with a stride of block_size + 1
// (see the *_times_block_size values below); 272 == 16 * 17 would fit a
// 16 x 16 tile plus one padding column. block_size is defined outside this
// listing -- TODO confirm it is 16.
781 __shared__ NumericT bufA[272];
782 __shared__ NumericT bufB[272];
// aBegin: first element of the rows of A handled by this block (row_block_id).
// aStep: element distance of block_size columns of A; its use is not visible.
789 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
790 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
// bBegin: first element of the columns of B handled by this block (col_block_id).
// bStep: element distance of block_size rows of B; its use is not visible.
791 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
792 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Ceiling of A_col_size / block_size; presumably the trip count of the tile
// loop, whose header is not shown.
793 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offset inside a tile. For A, row_thread_id selects the row and
// col_thread_id the column; for B the roles are swapped (row_thread_id selects
// the column, col_thread_id the row).
795 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
796 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Start of this thread's row / column line inside the padded shared tiles.
798 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
799 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread stores one element of A and one of B; positions outside the
// logical matrix sizes are replaced by 0 so they add nothing to Csub.
// Both bufA and bufB are written as [row_thread_id][col_thread_id].
804 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
805 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// NOTE(review): the reads below consume values stored by other threads, so a
// block-wide barrier must separate them from the stores above. None is visible
// in this listing -- confirm against the full source.
// 16 unrolled multiply-accumulate steps: line row_thread_id of bufA and line
// col_thread_id of bufB are walked in lockstep.
807 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
808 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
809 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
810 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
811 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
812 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
813 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
814 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
815 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
816 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
817 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
818 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
819 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
820 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
821 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
822 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
823 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
824 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside A_row_size x B_col_size
// write a result. C is addressed as row + column * C_internal_rows, and its
// old value is not read when beta == 0.
829 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
830 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: column-major (leading dimension C_internal_rows)
//   A: column-major (leading dimension A_internal_rows)
//   B: row-major    (leading dimension B_internal_cols)
// The output guard (A_row_size x B_row_size) together with the reduction length
// taken from A_col_size suggests C = alpha * A * trans(B) + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
835 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_rows acts as the leading dimension below).
839 unsigned int A_row_start,
840 unsigned int A_col_start,
841 unsigned int A_row_inc,
842 unsigned int A_col_inc,
843 unsigned int A_row_size,
844 unsigned int A_col_size,
845 unsigned int A_internal_rows,
846 unsigned int A_internal_cols,
// Same description for B (B_internal_cols acts as the leading dimension below).
848 unsigned int B_row_start,
849 unsigned int B_col_start,
850 unsigned int B_row_inc,
851 unsigned int B_col_inc,
852 unsigned int B_row_size,
853 unsigned int B_col_size,
854 unsigned int B_internal_rows,
855 unsigned int B_internal_cols,
// Same description for the result C (C_internal_rows acts as the leading dimension below).
858 unsigned int C_row_start,
859 unsigned int C_col_start,
860 unsigned int C_row_inc,
861 unsigned int C_col_inc,
862 unsigned int C_row_size,
863 unsigned int C_col_size,
864 unsigned int C_internal_rows,
865 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
868 __shared__ NumericT bufA[272];
869 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A, and the
// distance between consecutive tiles of A along its columns.
876 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
877 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
// Linear offset of the first element of this thread block's tile of B (tile chosen by rows of B).
878 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
// Number of tiles along the reduction dimension (ceiling division).
880 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
882 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
883 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
885 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
886 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
891 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
892 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
894 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
895 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
896 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
897 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
898 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
899 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
900 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
901 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
902 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
903 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
904 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
905 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
906 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
907 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
908 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
909 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
910 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
911 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
916 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
917 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: column-major (leading dimension C_internal_rows)
//   A: column-major (leading dimension A_internal_rows)
//   B: row-major    (leading dimension B_internal_cols)
// The output guard (A_col_size x B_col_size) together with the reduction length
// taken from A_row_size suggests C = alpha * trans(A) * B + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
922 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_rows acts as the leading dimension below).
926 unsigned int A_row_start,
927 unsigned int A_col_start,
928 unsigned int A_row_inc,
929 unsigned int A_col_inc,
930 unsigned int A_row_size,
931 unsigned int A_col_size,
932 unsigned int A_internal_rows,
933 unsigned int A_internal_cols,
// Same description for B (B_internal_cols acts as the leading dimension below).
935 unsigned int B_row_start,
936 unsigned int B_col_start,
937 unsigned int B_row_inc,
938 unsigned int B_col_inc,
939 unsigned int B_row_size,
940 unsigned int B_col_size,
941 unsigned int B_internal_rows,
942 unsigned int B_internal_cols,
// Same description for the result C (C_internal_rows acts as the leading dimension below).
945 unsigned int C_row_start,
946 unsigned int C_col_start,
947 unsigned int C_row_inc,
948 unsigned int C_col_inc,
949 unsigned int C_row_size,
950 unsigned int C_col_size,
951 unsigned int C_internal_rows,
952 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
955 __shared__ NumericT bufA[272];
956 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A (tile chosen by columns of A).
963 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
// Linear offset of the first element of this thread block's tile of B, and the
// distance between consecutive tiles of B along its rows.
965 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
966 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of tiles along the reduction dimension (ceiling division).
967 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
969 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
970 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
972 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
973 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
978 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
979 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
981 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
982 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
983 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
984 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
985 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
986 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
987 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
988 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
989 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
990 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
991 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
992 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
993 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
994 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
995 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
996 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
997 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
998 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
1003 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1004 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: column-major (leading dimension C_internal_rows)
//   A: column-major (leading dimension A_internal_rows)
//   B: row-major    (leading dimension B_internal_cols)
// The output guard (A_col_size x B_row_size) together with the reduction length
// taken from A_row_size suggests C = alpha * trans(A) * trans(B) + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
1009 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_rows acts as the leading dimension below).
1013 unsigned int A_row_start,
1014 unsigned int A_col_start,
1015 unsigned int A_row_inc,
1016 unsigned int A_col_inc,
1017 unsigned int A_row_size,
1018 unsigned int A_col_size,
1019 unsigned int A_internal_rows,
1020 unsigned int A_internal_cols,
// Same description for B (B_internal_cols acts as the leading dimension below).
1022 unsigned int B_row_start,
1023 unsigned int B_col_start,
1024 unsigned int B_row_inc,
1025 unsigned int B_col_inc,
1026 unsigned int B_row_size,
1027 unsigned int B_col_size,
1028 unsigned int B_internal_rows,
1029 unsigned int B_internal_cols,
// Same description for the result C (C_internal_rows acts as the leading dimension below).
1032 unsigned int C_row_start,
1033 unsigned int C_col_start,
1034 unsigned int C_row_inc,
1035 unsigned int C_col_inc,
1036 unsigned int C_row_size,
1037 unsigned int C_col_size,
1038 unsigned int C_internal_rows,
1039 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
1042 __shared__ NumericT bufA[272];
1043 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A (tile chosen by columns of A).
1050 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
// Linear offset of the first element of this thread block's tile of B (tile chosen by rows of B).
1052 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
// Number of tiles along the reduction dimension (ceiling division).
1054 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
1056 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1057 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
1059 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1060 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
1065 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1066 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
1068 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1069 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1070 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1071 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1072 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1073 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1074 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1075 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1076 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1077 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1078 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1079 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1080 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1081 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1082 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1083 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1084 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1085 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
1090 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1091 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: row-major    (leading dimension C_internal_cols)
//   A: column-major (leading dimension A_internal_rows)
//   B: row-major    (leading dimension B_internal_cols)
// The output guard (A_row_size x B_col_size) together with the reduction length
// taken from A_col_size suggests C = alpha * A * B + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
1103 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_rows acts as the leading dimension below).
1107 unsigned int A_row_start,
1108 unsigned int A_col_start,
1109 unsigned int A_row_inc,
1110 unsigned int A_col_inc,
1111 unsigned int A_row_size,
1112 unsigned int A_col_size,
1113 unsigned int A_internal_rows,
1114 unsigned int A_internal_cols,
// Same description for B (B_internal_cols acts as the leading dimension below).
1116 unsigned int B_row_start,
1117 unsigned int B_col_start,
1118 unsigned int B_row_inc,
1119 unsigned int B_col_inc,
1120 unsigned int B_row_size,
1121 unsigned int B_col_size,
1122 unsigned int B_internal_rows,
1123 unsigned int B_internal_cols,
// Same description for the result C (C_internal_cols acts as the leading dimension below).
1126 unsigned int C_row_start,
1127 unsigned int C_col_start,
1128 unsigned int C_row_inc,
1129 unsigned int C_col_inc,
1130 unsigned int C_row_size,
1131 unsigned int C_col_size,
1132 unsigned int C_internal_rows,
1133 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
1136 __shared__ NumericT bufA[272];
1137 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A, and the
// distance between consecutive tiles of A along its columns.
1144 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
1145 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
// Linear offset of the first element of this thread block's tile of B, and the
// distance between consecutive tiles of B along its rows.
1146 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
1147 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of tiles along the reduction dimension (ceiling division).
1148 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
1150 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1151 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
1153 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1154 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
1159 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1160 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
1162 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1163 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1164 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1165 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1166 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1167 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1168 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1169 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1170 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1171 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1172 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1173 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1174 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1175 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1176 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1177 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1178 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1179 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
1184 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1185 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: row-major    (leading dimension C_internal_cols)
//   A: column-major (leading dimension A_internal_rows)
//   B: row-major    (leading dimension B_internal_cols)
// The output guard (A_row_size x B_row_size) together with the reduction length
// taken from A_col_size suggests C = alpha * A * trans(B) + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
1190 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_rows acts as the leading dimension below).
1194 unsigned int A_row_start,
1195 unsigned int A_col_start,
1196 unsigned int A_row_inc,
1197 unsigned int A_col_inc,
1198 unsigned int A_row_size,
1199 unsigned int A_col_size,
1200 unsigned int A_internal_rows,
1201 unsigned int A_internal_cols,
// Same description for B (B_internal_cols acts as the leading dimension below).
1203 unsigned int B_row_start,
1204 unsigned int B_col_start,
1205 unsigned int B_row_inc,
1206 unsigned int B_col_inc,
1207 unsigned int B_row_size,
1208 unsigned int B_col_size,
1209 unsigned int B_internal_rows,
1210 unsigned int B_internal_cols,
// Same description for the result C (C_internal_cols acts as the leading dimension below).
1213 unsigned int C_row_start,
1214 unsigned int C_col_start,
1215 unsigned int C_row_inc,
1216 unsigned int C_col_inc,
1217 unsigned int C_row_size,
1218 unsigned int C_col_size,
1219 unsigned int C_internal_rows,
1220 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
1223 __shared__ NumericT bufA[272];
1224 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A, and the
// distance between consecutive tiles of A along its columns.
1231 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
1232 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
// Linear offset of the first element of this thread block's tile of B (tile chosen by rows of B).
1233 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
// Number of tiles along the reduction dimension (ceiling division).
1235 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
1237 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1238 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
1240 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1241 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
1246 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1247 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
1249 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1250 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1251 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1252 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1253 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1254 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1255 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1256 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1257 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1258 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1259 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1260 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1261 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1262 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1263 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1264 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1265 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1266 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
1271 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1272 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: row-major    (leading dimension C_internal_cols)
//   A: column-major (leading dimension A_internal_rows)
//   B: row-major    (leading dimension B_internal_cols)
// The output guard (A_col_size x B_col_size) together with the reduction length
// taken from A_row_size suggests C = alpha * trans(A) * B + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
1277 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_rows acts as the leading dimension below).
1281 unsigned int A_row_start,
1282 unsigned int A_col_start,
1283 unsigned int A_row_inc,
1284 unsigned int A_col_inc,
1285 unsigned int A_row_size,
1286 unsigned int A_col_size,
1287 unsigned int A_internal_rows,
1288 unsigned int A_internal_cols,
// Same description for B (B_internal_cols acts as the leading dimension below).
1290 unsigned int B_row_start,
1291 unsigned int B_col_start,
1292 unsigned int B_row_inc,
1293 unsigned int B_col_inc,
1294 unsigned int B_row_size,
1295 unsigned int B_col_size,
1296 unsigned int B_internal_rows,
1297 unsigned int B_internal_cols,
// Same description for the result C (C_internal_cols acts as the leading dimension below).
1300 unsigned int C_row_start,
1301 unsigned int C_col_start,
1302 unsigned int C_row_inc,
1303 unsigned int C_col_inc,
1304 unsigned int C_row_size,
1305 unsigned int C_col_size,
1306 unsigned int C_internal_rows,
1307 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
1310 __shared__ NumericT bufA[272];
1311 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A (tile chosen by columns of A).
1318 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
// Linear offset of the first element of this thread block's tile of B, and the
// distance between consecutive tiles of B along its rows.
1320 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
1321 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of tiles along the reduction dimension (ceiling division).
1322 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
1324 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1325 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
1327 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1328 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
1333 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1334 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
1336 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1337 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1338 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1339 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1340 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1341 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1342 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1343 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1344 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1345 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1346 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1347 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1348 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1349 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1350 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1351 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1352 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1353 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
1358 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1359 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: row-major    (leading dimension C_internal_cols)
//   A: column-major (leading dimension A_internal_rows)
//   B: row-major    (leading dimension B_internal_cols)
// The output guard (A_col_size x B_row_size) together with the reduction length
// taken from A_row_size suggests C = alpha * trans(A) * trans(B) + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
1364 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_rows acts as the leading dimension below).
1368 unsigned int A_row_start,
1369 unsigned int A_col_start,
1370 unsigned int A_row_inc,
1371 unsigned int A_col_inc,
1372 unsigned int A_row_size,
1373 unsigned int A_col_size,
1374 unsigned int A_internal_rows,
1375 unsigned int A_internal_cols,
// Same description for B (B_internal_cols acts as the leading dimension below).
1377 unsigned int B_row_start,
1378 unsigned int B_col_start,
1379 unsigned int B_row_inc,
1380 unsigned int B_col_inc,
1381 unsigned int B_row_size,
1382 unsigned int B_col_size,
1383 unsigned int B_internal_rows,
1384 unsigned int B_internal_cols,
// Same description for the result C (C_internal_cols acts as the leading dimension below).
1387 unsigned int C_row_start,
1388 unsigned int C_col_start,
1389 unsigned int C_row_inc,
1390 unsigned int C_col_inc,
1391 unsigned int C_row_size,
1392 unsigned int C_col_size,
1393 unsigned int C_internal_rows,
1394 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
1397 __shared__ NumericT bufA[272];
1398 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A (tile chosen by columns of A).
1405 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
// Linear offset of the first element of this thread block's tile of B (tile chosen by rows of B).
1407 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
// Number of tiles along the reduction dimension (ceiling division).
1409 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
1411 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1412 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
1414 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1415 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
1420 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1421 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
1423 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1424 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1425 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1426 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1427 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1428 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1429 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1430 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1431 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1432 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1433 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1434 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1435 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1436 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1437 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1438 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1439 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1440 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
1445 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1446 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: column-major (leading dimension C_internal_rows)
//   A: row-major    (leading dimension A_internal_cols)
//   B: column-major (leading dimension B_internal_rows)
// The output guard (A_row_size x B_col_size) together with the reduction length
// taken from A_col_size suggests C = alpha * A * B + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
1462 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_cols acts as the leading dimension below).
1466 unsigned int A_row_start,
1467 unsigned int A_col_start,
1468 unsigned int A_row_inc,
1469 unsigned int A_col_inc,
1470 unsigned int A_row_size,
1471 unsigned int A_col_size,
1472 unsigned int A_internal_rows,
1473 unsigned int A_internal_cols,
// Same description for B (B_internal_rows acts as the leading dimension below).
1475 unsigned int B_row_start,
1476 unsigned int B_col_start,
1477 unsigned int B_row_inc,
1478 unsigned int B_col_inc,
1479 unsigned int B_row_size,
1480 unsigned int B_col_size,
1481 unsigned int B_internal_rows,
1482 unsigned int B_internal_cols,
// Same description for the result C (C_internal_rows acts as the leading dimension below).
1485 unsigned int C_row_start,
1486 unsigned int C_col_start,
1487 unsigned int C_row_inc,
1488 unsigned int C_col_inc,
1489 unsigned int C_row_size,
1490 unsigned int C_col_size,
1491 unsigned int C_internal_rows,
1492 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
1495 __shared__ NumericT bufA[272];
1496 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A (tile chosen by rows of A).
1503 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
// Linear offset of the first element of this thread block's tile of B (tile chosen by columns of B).
1505 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Number of tiles along the reduction dimension (ceiling division).
1507 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
1509 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1510 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
1512 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1513 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
1518 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1519 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
1521 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1522 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1523 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1524 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1525 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1526 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1527 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1528 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1529 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1530 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1531 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1532 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1533 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1534 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1535 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1536 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1537 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1538 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
1543 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1544 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-matrix product kernel: each thread accumulates one entry of the
// product in Csub and finally stores alpha * Csub (+ beta * C) into C.
// Layouts as implied by the index arithmetic below:
//   C: column-major (leading dimension C_internal_rows)
//   A: row-major    (leading dimension A_internal_cols)
//   B: column-major (leading dimension B_internal_rows)
// The output guard (A_row_size x B_row_size) together with the reduction length
// taken from A_col_size suggests C = alpha * A * trans(B) + beta * C.
// NOTE(review): alpha, beta, A, B, C, block_size, the block/thread ids, Csub and
// `block` are used below but their declarations are not among the lines shown
// here -- confirm against the full kernel.
1549 template<
typename NumericT>
// A is described by start offsets, index increments, sizes and internal
// dimensions (A_internal_cols acts as the leading dimension below).
1553 unsigned int A_row_start,
1554 unsigned int A_col_start,
1555 unsigned int A_row_inc,
1556 unsigned int A_col_inc,
1557 unsigned int A_row_size,
1558 unsigned int A_col_size,
1559 unsigned int A_internal_rows,
1560 unsigned int A_internal_cols,
// Same description for B (B_internal_rows acts as the leading dimension below).
1562 unsigned int B_row_start,
1563 unsigned int B_col_start,
1564 unsigned int B_row_inc,
1565 unsigned int B_col_inc,
1566 unsigned int B_row_size,
1567 unsigned int B_col_size,
1568 unsigned int B_internal_rows,
1569 unsigned int B_internal_cols,
// Same description for the result C (C_internal_rows acts as the leading dimension below).
1572 unsigned int C_row_start,
1573 unsigned int C_col_start,
1574 unsigned int C_row_inc,
1575 unsigned int C_col_inc,
1576 unsigned int C_row_size,
1577 unsigned int C_col_size,
1578 unsigned int C_internal_rows,
1579 unsigned int C_internal_cols)
// Shared tile buffers for A and B. 272 = 16 * 17, which fits a 16 x 16 tile stored
// with a row stride of block_size + 1 -- assumes block_size == 16; TODO confirm.
1582 __shared__ NumericT bufA[272];
1583 __shared__ NumericT bufB[272];
// Linear offset of the first element of this thread block's tile of A (tile chosen by rows of A).
1590 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
// Linear offset of the first element of this thread block's tile of B, and the
// distance between consecutive tiles of B along its columns.
1592 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1593 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Number of tiles along the reduction dimension (ceiling division).
1594 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
1596 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1597 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Row stride in the tile buffers is block_size + 1 (presumably padding against
// shared-memory bank conflicts).
1599 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1600 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Each thread loads one element into each tile buffer; positions outside the
// logical matrix are filled with 0 so they add nothing to the sum.
1605 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1606 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// 16 unrolled multiply-accumulate steps over the bufA row selected by
// row_thread_id and the bufB row selected by col_thread_id.
1608 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1609 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1610 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1611 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1612 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1613 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1614 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1615 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1616 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1617 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1618 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1619 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1620 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1621 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1622 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1623 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1624 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1625 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads that map to a valid entry of the result write to C.
// When beta == 0 the previous value of C is not read.
1630 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1631 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-product kernel fragment: accumulates Csub from two padded
// shared-memory tiles and stores alpha * Csub (+ beta * C) into one element of C.
// As addressed below, A uses A_internal_cols as its row pitch and its column
// range bounds the output rows (A is consumed transposed); B and C use
// *_internal_rows as their column pitch and B's column range bounds the output columns.
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
1636 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
1640 unsigned int A_row_start,
1641 unsigned int A_col_start,
1642 unsigned int A_row_inc,
1643 unsigned int A_col_inc,
1644 unsigned int A_row_size,
1645 unsigned int A_col_size,
1646 unsigned int A_internal_rows,
1647 unsigned int A_internal_cols,
// Same description for B.
1649 unsigned int B_row_start,
1650 unsigned int B_col_start,
1651 unsigned int B_row_inc,
1652 unsigned int B_col_inc,
1653 unsigned int B_row_size,
1654 unsigned int B_col_size,
1655 unsigned int B_internal_rows,
1656 unsigned int B_internal_cols,
// Same description for the result matrix C.
1659 unsigned int C_row_start,
1660 unsigned int C_col_start,
1661 unsigned int C_row_inc,
1662 unsigned int C_col_inc,
1663 unsigned int C_row_size,
1664 unsigned int C_col_size,
1665 unsigned int C_internal_rows,
1666 unsigned int C_internal_cols)
// Two shared tiles of 272 elements each, indexed below with a row stride of
// (block_size + 1). Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
1669 __shared__ NumericT bufA[272];
1670 __shared__ NumericT bufB[272];
// Offset of the first A tile: the tile's column position is selected by row_block_id.
1677 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
// Distance in A between consecutive tiles: block_size rows of pitch A_internal_cols.
1678 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
// Offset of the first B tile: the tile's column position is selected by col_block_id.
1679 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Number of block_size-sized tiles needed to cover A_row_size, rounded up.
1681 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
1683 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1684 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Row starts inside the padded shared tiles (stride block_size + 1, not block_size).
1686 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1687 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded tile loads: positions outside the logical extents of A / B are stored as 0.
1692 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1693 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps: both pointers walk 16 consecutive
// shared-memory elements, starting at this thread's row in bufA and bufB.
1695 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1696 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1697 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1698 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1699 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1700 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1701 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1702 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1703 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1704 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1705 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1706 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1707 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1708 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1709 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1710 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1711 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1712 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Only threads whose global (x, y) index lies inside the result extents write.
// When beta == 0 the previous value of C is never read.
1717 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1718 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-product kernel fragment writing one element of C per thread.
// As addressed below, A uses A_internal_cols as its row pitch and its column
// range bounds the output rows (A consumed transposed); B uses B_internal_rows
// as its column pitch and its row range bounds the output columns (B consumed
// transposed); C uses C_internal_rows as its column pitch.
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
1723 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
1727 unsigned int A_row_start,
1728 unsigned int A_col_start,
1729 unsigned int A_row_inc,
1730 unsigned int A_col_inc,
1731 unsigned int A_row_size,
1732 unsigned int A_col_size,
1733 unsigned int A_internal_rows,
1734 unsigned int A_internal_cols,
// Same description for B.
1736 unsigned int B_row_start,
1737 unsigned int B_col_start,
1738 unsigned int B_row_inc,
1739 unsigned int B_col_inc,
1740 unsigned int B_row_size,
1741 unsigned int B_col_size,
1742 unsigned int B_internal_rows,
1743 unsigned int B_internal_cols,
// Same description for the result matrix C.
1746 unsigned int C_row_start,
1747 unsigned int C_col_start,
1748 unsigned int C_row_inc,
1749 unsigned int C_col_inc,
1750 unsigned int C_row_size,
1751 unsigned int C_col_size,
1752 unsigned int C_internal_rows,
1753 unsigned int C_internal_cols)
// Shared tiles of 272 elements, indexed with a row stride of (block_size + 1).
// Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
1756 __shared__ NumericT bufA[272];
1757 __shared__ NumericT bufB[272];
// First A tile (column position picked by row_block_id) and the step to the next
// tile, which moves block_size rows down in A.
1764 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
1765 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
// First B tile (row position picked by col_block_id) and the step to the next
// tile, which moves block_size columns across B.
1766 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1767 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Number of block_size-sized tiles needed to cover A_row_size, rounded up.
1768 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
1770 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1771 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Row starts inside the padded shared tiles.
1773 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1774 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded loads; both tiles are stored at [row_thread_id][col_thread_id] and
// out-of-range positions are filled with 0.
1779 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1780 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive shared elements.
1782 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1783 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1784 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1785 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1786 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1787 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1788 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1789 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1790 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1791 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1792 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1793 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1794 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1795 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1796 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1797 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1798 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1799 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Write-back is limited to threads inside the result extents; the old value of C
// is read only when beta != 0.
1804 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1805 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-product kernel fragment writing one element of C per thread.
// As addressed below, A and C use *_internal_cols as their row pitch, B uses
// B_internal_rows as its column pitch; A's row range bounds the output rows and
// B's column range bounds the output columns (neither operand is transposed).
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
1818 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
1822 unsigned int A_row_start,
1823 unsigned int A_col_start,
1824 unsigned int A_row_inc,
1825 unsigned int A_col_inc,
1826 unsigned int A_row_size,
1827 unsigned int A_col_size,
1828 unsigned int A_internal_rows,
1829 unsigned int A_internal_cols,
// Same description for B.
1831 unsigned int B_row_start,
1832 unsigned int B_col_start,
1833 unsigned int B_row_inc,
1834 unsigned int B_col_inc,
1835 unsigned int B_row_size,
1836 unsigned int B_col_size,
1837 unsigned int B_internal_rows,
1838 unsigned int B_internal_cols,
// Same description for the result matrix C.
1841 unsigned int C_row_start,
1842 unsigned int C_col_start,
1843 unsigned int C_row_inc,
1844 unsigned int C_col_inc,
1845 unsigned int C_row_size,
1846 unsigned int C_col_size,
1847 unsigned int C_internal_rows,
1848 unsigned int C_internal_cols)
// Shared tiles of 272 elements, indexed with a row stride of (block_size + 1).
// Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
1851 __shared__ NumericT bufA[272];
1852 __shared__ NumericT bufB[272];
// First A tile: its row position is selected by row_block_id.
1859 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
// First B tile: its column position is selected by col_block_id.
1861 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Number of block_size-sized tiles needed to cover A_col_size, rounded up.
1863 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
1865 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1866 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Row starts inside the padded shared tiles.
1868 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1869 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded loads; both tiles are stored at [col_thread_id][row_thread_id] and
// out-of-range positions are filled with 0.
1874 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1875 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive shared elements.
1877 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1878 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1879 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1880 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1881 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1882 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1883 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1884 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1885 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1886 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1887 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1888 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1889 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1890 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1891 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1892 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1893 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1894 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Write-back is limited to threads inside the result extents; the old value of C
// is read only when beta != 0.
1899 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1900 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Tiled matrix-product kernel fragment writing one element of C per thread.
// As addressed below, A and C use *_internal_cols as their row pitch, B uses
// B_internal_rows as its column pitch; A's row range bounds the output rows and
// B's row range bounds the output columns (B is consumed transposed).
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
1905 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
1909 unsigned int A_row_start,
1910 unsigned int A_col_start,
1911 unsigned int A_row_inc,
1912 unsigned int A_col_inc,
1913 unsigned int A_row_size,
1914 unsigned int A_col_size,
1915 unsigned int A_internal_rows,
1916 unsigned int A_internal_cols,
// Same description for B.
1918 unsigned int B_row_start,
1919 unsigned int B_col_start,
1920 unsigned int B_row_inc,
1921 unsigned int B_col_inc,
1922 unsigned int B_row_size,
1923 unsigned int B_col_size,
1924 unsigned int B_internal_rows,
1925 unsigned int B_internal_cols,
// Same description for the result matrix C.
1928 unsigned int C_row_start,
1929 unsigned int C_col_start,
1930 unsigned int C_row_inc,
1931 unsigned int C_col_inc,
1932 unsigned int C_row_size,
1933 unsigned int C_col_size,
1934 unsigned int C_internal_rows,
1935 unsigned int C_internal_cols)
// Shared tiles of 272 elements, indexed with a row stride of (block_size + 1).
// Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
1938 __shared__ NumericT bufA[272];
1939 __shared__ NumericT bufB[272];
// First A tile: its row position is selected by row_block_id.
1946 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
// First B tile (row position picked by col_block_id) and the step to the next
// tile, which moves block_size columns across B.
1948 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1949 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Number of block_size-sized tiles needed to cover A_col_size, rounded up.
1950 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
1952 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1953 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Row starts inside the padded shared tiles.
1955 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1956 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded loads: A goes to [col_thread_id][row_thread_id], B to
// [row_thread_id][col_thread_id]; out-of-range positions are filled with 0.
1961 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1962 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive shared elements.
1964 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1965 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1966 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1967 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1968 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1969 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1970 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1971 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1972 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1973 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1974 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1975 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1976 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1977 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1978 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1979 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1980 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1981 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Write-back is limited to threads inside the result extents; the old value of C
// is read only when beta != 0.
1986 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1987 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Tiled matrix-product kernel fragment writing one element of C per thread.
// As addressed below, A and C use *_internal_cols as their row pitch, B uses
// B_internal_rows as its column pitch; A's column range bounds the output rows
// (A is consumed transposed) and B's column range bounds the output columns.
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
1992 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
1996 unsigned int A_row_start,
1997 unsigned int A_col_start,
1998 unsigned int A_row_inc,
1999 unsigned int A_col_inc,
2000 unsigned int A_row_size,
2001 unsigned int A_col_size,
2002 unsigned int A_internal_rows,
2003 unsigned int A_internal_cols,
// Same description for B.
2005 unsigned int B_row_start,
2006 unsigned int B_col_start,
2007 unsigned int B_row_inc,
2008 unsigned int B_col_inc,
2009 unsigned int B_row_size,
2010 unsigned int B_col_size,
2011 unsigned int B_internal_rows,
2012 unsigned int B_internal_cols,
// Same description for the result matrix C.
2015 unsigned int C_row_start,
2016 unsigned int C_col_start,
2017 unsigned int C_row_inc,
2018 unsigned int C_col_inc,
2019 unsigned int C_row_size,
2020 unsigned int C_col_size,
2021 unsigned int C_internal_rows,
2022 unsigned int C_internal_cols)
// Shared tiles of 272 elements, indexed with a row stride of (block_size + 1).
// Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
2025 __shared__ NumericT bufA[272];
2026 __shared__ NumericT bufB[272];
// First A tile (column position picked by row_block_id) and the step to the next
// tile, which moves block_size rows down in A.
2033 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2034 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
// First B tile: its column position is selected by col_block_id.
2035 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Number of block_size-sized tiles needed to cover A_row_size, rounded up.
2037 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
2039 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2040 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Row starts inside the padded shared tiles.
2042 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2043 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded loads: A goes to [row_thread_id][col_thread_id], B to
// [col_thread_id][row_thread_id]; out-of-range positions are filled with 0.
2048 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2049 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive shared elements.
2051 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2052 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2053 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2054 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2055 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2056 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2057 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2058 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2059 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2060 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2061 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2062 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2063 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2064 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2065 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2066 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2067 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2068 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Write-back is limited to threads inside the result extents; the old value of C
// is read only when beta != 0.
2073 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2074 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Tiled matrix-product kernel fragment writing one element of C per thread.
// As addressed below, A and C use *_internal_cols as their row pitch, B uses
// B_internal_rows as its column pitch; A's column range bounds the output rows
// and B's row range bounds the output columns (both operands consumed transposed).
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
2079 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
2083 unsigned int A_row_start,
2084 unsigned int A_col_start,
2085 unsigned int A_row_inc,
2086 unsigned int A_col_inc,
2087 unsigned int A_row_size,
2088 unsigned int A_col_size,
2089 unsigned int A_internal_rows,
2090 unsigned int A_internal_cols,
// Same description for B.
2092 unsigned int B_row_start,
2093 unsigned int B_col_start,
2094 unsigned int B_row_inc,
2095 unsigned int B_col_inc,
2096 unsigned int B_row_size,
2097 unsigned int B_col_size,
2098 unsigned int B_internal_rows,
2099 unsigned int B_internal_cols,
// Same description for the result matrix C.
2102 unsigned int C_row_start,
2103 unsigned int C_col_start,
2104 unsigned int C_row_inc,
2105 unsigned int C_col_inc,
2106 unsigned int C_row_size,
2107 unsigned int C_col_size,
2108 unsigned int C_internal_rows,
2109 unsigned int C_internal_cols)
// Shared tiles of 272 elements, indexed with a row stride of (block_size + 1).
// Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
2112 __shared__ NumericT bufA[272];
2113 __shared__ NumericT bufB[272];
// First A tile (column position picked by row_block_id) and the step to the next
// tile, which moves block_size rows down in A.
2120 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2121 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
// First B tile (row position picked by col_block_id) and the step to the next
// tile, which moves block_size columns across B.
2122 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
2123 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Number of block_size-sized tiles needed to cover A_row_size, rounded up.
2124 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
2126 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2127 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
// Row starts inside the padded shared tiles.
2129 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2130 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded loads; both tiles are stored at [row_thread_id][col_thread_id] and
// out-of-range positions are filled with 0.
2135 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2136 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive shared elements.
2138 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2139 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2140 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2141 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2142 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2143 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2144 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2145 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2146 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2147 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2148 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2149 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2150 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2151 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2152 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2153 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2154 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2155 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Write-back is limited to threads inside the result extents; the old value of C
// is read only when beta != 0.
2160 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2161 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Tiled matrix-product kernel fragment writing one element of C per thread.
// As addressed below, A and B use *_internal_cols as their row pitch, C uses
// C_internal_rows as its column pitch; A's row range bounds the output rows and
// B's column range bounds the output columns (neither operand is transposed).
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
2177 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
2181 unsigned int A_row_start,
2182 unsigned int A_col_start,
2183 unsigned int A_row_inc,
2184 unsigned int A_col_inc,
2185 unsigned int A_row_size,
2186 unsigned int A_col_size,
2187 unsigned int A_internal_rows,
2188 unsigned int A_internal_cols,
// Same description for B.
2190 unsigned int B_row_start,
2191 unsigned int B_col_start,
2192 unsigned int B_row_inc,
2193 unsigned int B_col_inc,
2194 unsigned int B_row_size,
2195 unsigned int B_col_size,
2196 unsigned int B_internal_rows,
2197 unsigned int B_internal_cols,
// Same description for the result matrix C.
2200 unsigned int C_row_start,
2201 unsigned int C_col_start,
2202 unsigned int C_row_inc,
2203 unsigned int C_col_inc,
2204 unsigned int C_row_size,
2205 unsigned int C_col_size,
2206 unsigned int C_internal_rows,
2207 unsigned int C_internal_cols)
// Shared tiles of 272 elements, indexed with a row stride of (block_size + 1).
// Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
2210 __shared__ NumericT bufA[272];
2211 __shared__ NumericT bufB[272];
// First A tile: its row position is selected by row_block_id.
2218 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
// First B tile (column position picked by col_block_id) and the step to the next
// tile, which moves block_size rows down in B.
2220 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2221 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of block_size-sized tiles needed to cover A_col_size, rounded up.
2222 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
2224 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2225 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row starts inside the padded shared tiles.
2227 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2228 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded loads: A goes to [col_thread_id][row_thread_id], B to
// [row_thread_id][col_thread_id]; out-of-range positions are filled with 0.
2233 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2234 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive shared elements.
2236 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2237 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2238 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2239 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2240 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2241 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2242 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2243 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2244 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2245 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2246 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2247 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2248 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2249 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2250 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2251 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2252 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2253 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Write-back is limited to threads inside the result extents; the old value of C
// is read only when beta != 0.
2258 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2259 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-product kernel fragment writing one element of C per thread.
// As addressed below, A and B use *_internal_cols as their row pitch, C uses
// C_internal_rows as its column pitch; A's row range bounds the output rows and
// B's row range bounds the output columns (B is consumed transposed).
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
2264 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
2268 unsigned int A_row_start,
2269 unsigned int A_col_start,
2270 unsigned int A_row_inc,
2271 unsigned int A_col_inc,
2272 unsigned int A_row_size,
2273 unsigned int A_col_size,
2274 unsigned int A_internal_rows,
2275 unsigned int A_internal_cols,
// Same description for B.
2277 unsigned int B_row_start,
2278 unsigned int B_col_start,
2279 unsigned int B_row_inc,
2280 unsigned int B_col_inc,
2281 unsigned int B_row_size,
2282 unsigned int B_col_size,
2283 unsigned int B_internal_rows,
2284 unsigned int B_internal_cols,
// Same description for the result matrix C.
2287 unsigned int C_row_start,
2288 unsigned int C_col_start,
2289 unsigned int C_row_inc,
2290 unsigned int C_col_inc,
2291 unsigned int C_row_size,
2292 unsigned int C_col_size,
2293 unsigned int C_internal_rows,
2294 unsigned int C_internal_cols)
// Shared tiles of 272 elements, indexed with a row stride of (block_size + 1).
// Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
2297 __shared__ NumericT bufA[272];
2298 __shared__ NumericT bufB[272];
// First A tile: its row position is selected by row_block_id.
2305 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
// First B tile: its row position is selected by col_block_id.
2307 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
// Number of block_size-sized tiles needed to cover A_col_size, rounded up.
2309 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
2311 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2312 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row starts inside the padded shared tiles.
2314 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2315 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded loads; both tiles are stored at [col_thread_id][row_thread_id] and
// out-of-range positions are filled with 0.
2320 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2321 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive shared elements.
2323 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2324 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2325 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2326 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2327 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2328 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2329 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2330 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2331 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2332 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2333 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2334 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2335 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2336 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2337 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2338 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2339 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2340 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Write-back is limited to threads inside the result extents; the old value of C
// is read only when beta != 0.
2345 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2346 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Tiled matrix-product kernel fragment writing one element of C per thread.
// As addressed below, A and B use *_internal_cols as their row pitch, C uses
// C_internal_rows as its column pitch; A's column range bounds the output rows
// (A is consumed transposed) and B's column range bounds the output columns.
// NOTE(review): alpha, beta, A, B, C, Csub, block, block_size and the
// *_block_id / *_thread_id variables are declared on lines outside this view.
2351 template<
typename NumericT>
// Sub-matrix description of A: start offsets, strides, logical and padded sizes.
2355 unsigned int A_row_start,
2356 unsigned int A_col_start,
2357 unsigned int A_row_inc,
2358 unsigned int A_col_inc,
2359 unsigned int A_row_size,
2360 unsigned int A_col_size,
2361 unsigned int A_internal_rows,
2362 unsigned int A_internal_cols,
// Same description for B.
2364 unsigned int B_row_start,
2365 unsigned int B_col_start,
2366 unsigned int B_row_inc,
2367 unsigned int B_col_inc,
2368 unsigned int B_row_size,
2369 unsigned int B_col_size,
2370 unsigned int B_internal_rows,
2371 unsigned int B_internal_cols,
// Same description for the result matrix C.
2374 unsigned int C_row_start,
2375 unsigned int C_col_start,
2376 unsigned int C_row_inc,
2377 unsigned int C_col_inc,
2378 unsigned int C_row_size,
2379 unsigned int C_col_size,
2380 unsigned int C_internal_rows,
2381 unsigned int C_internal_cols)
// Shared tiles of 272 elements, indexed with a row stride of (block_size + 1).
// Presumably block_size == 16 (16 * 17 == 272) — TODO confirm.
2384 __shared__ NumericT bufA[272];
2385 __shared__ NumericT bufB[272];
// First A tile (column position picked by row_block_id) and the step to the next
// tile, which moves block_size rows down in A.
2392 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2393 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
// First B tile (column position picked by col_block_id) and the step to the next
// tile, which moves block_size rows down in B.
2394 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2395 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of block_size-sized tiles needed to cover A_row_size, rounded up.
2396 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread element offsets inside the current A and B tiles.
2398 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2399 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Row starts inside the padded shared tiles.
2401 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2402 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Guarded loads; both tiles are stored at [row_thread_id][col_thread_id] and
// out-of-range positions are filled with 0.
2407 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2408 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive shared elements.
2410 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2411 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2412 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2413 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2414 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2415 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2416 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2417 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2418 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2419 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2420 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2421 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2422 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2423 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2424 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2425 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2426 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2427 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Write-back is limited to threads inside the result extents; the old value of C
// is read only when beta != 0.
2432 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2433 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Fragment of a tiled matrix-matrix product kernel, templated on the scalar type NumericT.
// The visible code accumulates products of staged A and B entries in Csub and finally stores
// alpha * Csub (+ beta * old C) into C.
// NOTE(review): the kernel name line, the alpha / A / B / beta / C parameters, the tile loop header
// and any barrier calls are not part of this listing. Judging from the indexing below this is
// presumably matrix_matrix_col_row_row_prod_TT_kernel (C column-major, A and B row-major, both read
// transposed) -- confirm against the full header.
2438 template<
typename NumericT>
// Sub-matrix description of A: first row/column, row/column stride, logical size, internal dimensions.
2442 unsigned int A_row_start,
2443 unsigned int A_col_start,
2444 unsigned int A_row_inc,
2445 unsigned int A_col_inc,
2446 unsigned int A_row_size,
2447 unsigned int A_col_size,
2448 unsigned int A_internal_rows,
2449 unsigned int A_internal_cols,
// Same description for B.
2451 unsigned int B_row_start,
2452 unsigned int B_col_start,
2453 unsigned int B_row_inc,
2454 unsigned int B_col_inc,
2455 unsigned int B_row_size,
2456 unsigned int B_col_size,
2457 unsigned int B_internal_rows,
2458 unsigned int B_internal_cols,
// Same description for the result C.
2461 unsigned int C_row_start,
2462 unsigned int C_col_start,
2463 unsigned int C_row_inc,
2464 unsigned int C_col_inc,
2465 unsigned int C_row_size,
2466 unsigned int C_col_size,
2467 unsigned int C_internal_rows,
2468 unsigned int C_internal_cols)
// Shared-memory staging tiles for A and B. 272 = 16 * 17; rows are addressed below with a stride of
// (block_size + 1), so this is presumably a 16 x 16 tile with one padding entry per row
// (block_size is defined outside this listing -- TODO confirm).
2471 __shared__ NumericT bufA[272];
2472 __shared__ NumericT bufB[272];
// A is addressed with A_internal_cols as the row stride; the block index advances along A's columns.
2479 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
// Offset of block_size rows of A; presumably added to aBegin once per tile iteration (loop not visible).
2480 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
// B is addressed with B_internal_cols as the row stride; the block index advances along B's rows.
2481 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
// Number of tiles along the summation dimension: A_row_size / block_size, rounded up.
2483 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
2485 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2486 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Start positions of this thread's rows inside the padded shared tiles.
2488 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2489 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Stage one entry of A and one of B per thread; positions outside the logical matrix range are
// stored as 0 so they add nothing to the sum. `block` is presumably the tile-loop counter.
2494 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2495 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive entries of this thread's bufA row
// and bufB row.
2497 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2498 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2499 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2500 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2501 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2502 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2503 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2504 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2505 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2506 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2507 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2508 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2509 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2510 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2511 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2512 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2513 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2514 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Store only for threads whose global (x, y) index lies inside the result range
// (x bounded by A_col_size, y bounded by B_row_size).
2519 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
// C is addressed with C_internal_rows as the column stride. When beta == 0 the old C entry is not read.
2520 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
// Fragment of a tiled matrix-matrix product kernel, templated on the scalar type NumericT.
// The visible code accumulates products of staged A and B entries in Csub and finally stores
// alpha * Csub (+ beta * old C) into C.
// NOTE(review): the kernel name line, the alpha / A / B / beta / C parameters, the tile loop header
// and any barrier calls are not part of this listing. Judging from the indexing below this is
// presumably matrix_matrix_row_row_row_prod_AA_kernel (C, A and B row-major, neither operand
// transposed) -- confirm against the full header.
2534 template<
typename NumericT>
// Sub-matrix description of A: first row/column, row/column stride, logical size, internal dimensions.
2538 unsigned int A_row_start,
2539 unsigned int A_col_start,
2540 unsigned int A_row_inc,
2541 unsigned int A_col_inc,
2542 unsigned int A_row_size,
2543 unsigned int A_col_size,
2544 unsigned int A_internal_rows,
2545 unsigned int A_internal_cols,
// Same description for B.
2547 unsigned int B_row_start,
2548 unsigned int B_col_start,
2549 unsigned int B_row_inc,
2550 unsigned int B_col_inc,
2551 unsigned int B_row_size,
2552 unsigned int B_col_size,
2553 unsigned int B_internal_rows,
2554 unsigned int B_internal_cols,
// Same description for the result C.
2557 unsigned int C_row_start,
2558 unsigned int C_col_start,
2559 unsigned int C_row_inc,
2560 unsigned int C_col_inc,
2561 unsigned int C_row_size,
2562 unsigned int C_col_size,
2563 unsigned int C_internal_rows,
2564 unsigned int C_internal_cols)
// Shared-memory staging tiles for A and B. 272 = 16 * 17; rows are addressed below with a stride of
// (block_size + 1), so this is presumably a 16 x 16 tile with one padding entry per row
// (block_size is defined outside this listing -- TODO confirm).
2567 __shared__ NumericT bufA[272];
2568 __shared__ NumericT bufB[272];
// A is addressed with A_internal_cols as the row stride; the block index advances along A's rows.
2575 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
// B is addressed with B_internal_cols as the row stride; the block index advances along B's columns.
2577 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
// Offset of block_size rows of B; presumably added to bBegin once per tile iteration (loop not visible).
2578 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of tiles along the summation dimension: A_col_size / block_size, rounded up.
2579 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
2581 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2582 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Start positions of this thread's rows inside the padded shared tiles.
2584 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2585 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Stage one entry of A and one of B per thread; positions outside the logical matrix range are
// stored as 0 so they add nothing to the sum. `block` is presumably the tile-loop counter.
2590 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2591 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive entries of this thread's bufA row
// and bufB row.
2593 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2594 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2595 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2596 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2597 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2598 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2599 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2600 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2601 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2602 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2603 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2604 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2605 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2606 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2607 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2608 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2609 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2610 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Store only for threads whose global (x, y) index lies inside the result range
// (x bounded by A_row_size, y bounded by B_col_size).
2615 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
// C is addressed with C_internal_cols as the row stride. When beta == 0 the old C entry is not read.
2616 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Fragment of a tiled matrix-matrix product kernel, templated on the scalar type NumericT.
// The visible code accumulates products of staged A and B entries in Csub and finally stores
// alpha * Csub (+ beta * old C) into C.
// NOTE(review): the kernel name line, the alpha / A / B / beta / C parameters, the aStep / bStep
// definitions, the tile loop header and any barrier calls are not part of this listing. Judging from
// the indexing below this is presumably matrix_matrix_row_row_row_prod_AT_kernel (C, A and B
// row-major, B read transposed) -- confirm against the full header.
2621 template<
typename NumericT>
// Sub-matrix description of A: first row/column, row/column stride, logical size, internal dimensions.
2625 unsigned int A_row_start,
2626 unsigned int A_col_start,
2627 unsigned int A_row_inc,
2628 unsigned int A_col_inc,
2629 unsigned int A_row_size,
2630 unsigned int A_col_size,
2631 unsigned int A_internal_rows,
2632 unsigned int A_internal_cols,
// Same description for B.
2634 unsigned int B_row_start,
2635 unsigned int B_col_start,
2636 unsigned int B_row_inc,
2637 unsigned int B_col_inc,
2638 unsigned int B_row_size,
2639 unsigned int B_col_size,
2640 unsigned int B_internal_rows,
2641 unsigned int B_internal_cols,
// Same description for the result C.
2644 unsigned int C_row_start,
2645 unsigned int C_col_start,
2646 unsigned int C_row_inc,
2647 unsigned int C_col_inc,
2648 unsigned int C_row_size,
2649 unsigned int C_col_size,
2650 unsigned int C_internal_rows,
2651 unsigned int C_internal_cols)
// Shared-memory staging tiles for A and B. 272 = 16 * 17; rows are addressed below with a stride of
// (block_size + 1), so this is presumably a 16 x 16 tile with one padding entry per row
// (block_size is defined outside this listing -- TODO confirm).
2654 __shared__ NumericT bufA[272];
2655 __shared__ NumericT bufB[272];
// A is addressed with A_internal_cols as the row stride; the block index advances along A's rows.
2662 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
// B is addressed with B_internal_cols as the row stride; the block index advances along B's rows.
2664 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
// Number of tiles along the summation dimension: A_col_size / block_size, rounded up.
2666 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
2668 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2669 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Start positions of this thread's rows inside the padded shared tiles.
2671 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2672 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Stage one entry of A and one of B per thread; positions outside the logical matrix range are
// stored as 0 so they add nothing to the sum. `block` is presumably the tile-loop counter.
2677 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2678 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive entries of this thread's bufA row
// and bufB row.
2680 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2681 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2682 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2683 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2684 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2685 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2686 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2687 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2688 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2689 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2690 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2691 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2692 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2693 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2694 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2695 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2696 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2697 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Store only for threads whose global (x, y) index lies inside the result range
// (x bounded by A_row_size, y bounded by B_row_size).
2702 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
// C is addressed with C_internal_cols as the row stride. When beta == 0 the old C entry is not read.
2703 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Fragment of a tiled matrix-matrix product kernel, templated on the scalar type NumericT.
// The visible code accumulates products of staged A and B entries in Csub and finally stores
// alpha * Csub (+ beta * old C) into C.
// NOTE(review): the kernel name line, the alpha / A / B / beta / C parameters, the tile loop header
// and any barrier calls are not part of this listing. Judging from the indexing below this is
// presumably matrix_matrix_row_row_row_prod_TA_kernel (C, A and B row-major, A read transposed)
// -- confirm against the full header.
2708 template<
typename NumericT>
// Sub-matrix description of A: first row/column, row/column stride, logical size, internal dimensions.
2712 unsigned int A_row_start,
2713 unsigned int A_col_start,
2714 unsigned int A_row_inc,
2715 unsigned int A_col_inc,
2716 unsigned int A_row_size,
2717 unsigned int A_col_size,
2718 unsigned int A_internal_rows,
2719 unsigned int A_internal_cols,
// Same description for B.
2721 unsigned int B_row_start,
2722 unsigned int B_col_start,
2723 unsigned int B_row_inc,
2724 unsigned int B_col_inc,
2725 unsigned int B_row_size,
2726 unsigned int B_col_size,
2727 unsigned int B_internal_rows,
2728 unsigned int B_internal_cols,
// Same description for the result C.
2731 unsigned int C_row_start,
2732 unsigned int C_col_start,
2733 unsigned int C_row_inc,
2734 unsigned int C_col_inc,
2735 unsigned int C_row_size,
2736 unsigned int C_col_size,
2737 unsigned int C_internal_rows,
2738 unsigned int C_internal_cols)
// Shared-memory staging tiles for A and B. 272 = 16 * 17; rows are addressed below with a stride of
// (block_size + 1), so this is presumably a 16 x 16 tile with one padding entry per row
// (block_size is defined outside this listing -- TODO confirm).
2741 __shared__ NumericT bufA[272];
2742 __shared__ NumericT bufB[272];
// A is addressed with A_internal_cols as the row stride; the block index advances along A's columns.
2749 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
// Offset of block_size rows of A; presumably added to aBegin once per tile iteration (loop not visible).
2750 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
// B is addressed with B_internal_cols as the row stride; the block index advances along B's columns.
2751 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
// Offset of block_size rows of B; presumably added to bBegin once per tile iteration (loop not visible).
2752 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of tiles along the summation dimension: A_row_size / block_size, rounded up.
2753 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
2755 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2756 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Start positions of this thread's rows inside the padded shared tiles.
2758 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2759 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Stage one entry of A and one of B per thread; positions outside the logical matrix range are
// stored as 0 so they add nothing to the sum. `block` is presumably the tile-loop counter.
2764 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2765 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive entries of this thread's bufA row
// and bufB row.
2767 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2768 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2769 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2770 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2771 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2772 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2773 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2774 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2775 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2776 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2777 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2778 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2779 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2780 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2781 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2782 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2783 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2784 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Store only for threads whose global (x, y) index lies inside the result range
// (x bounded by A_col_size, y bounded by B_col_size).
2789 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
// C is addressed with C_internal_cols as the row stride. When beta == 0 the old C entry is not read.
2790 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
// Fragment of a tiled matrix-matrix product kernel, templated on the scalar type NumericT.
// The visible code accumulates products of staged A and B entries in Csub and finally stores
// alpha * Csub (+ beta * old C) into C.
// NOTE(review): the kernel name line, the alpha / A / B / beta / C parameters, the tile loop header
// and any barrier calls are not part of this listing. Judging from the indexing below this is
// presumably matrix_matrix_row_row_row_prod_TT_kernel (C, A and B row-major, A and B both read
// transposed) -- confirm against the full header.
2795 template<
typename NumericT>
// Sub-matrix description of A: first row/column, row/column stride, logical size, internal dimensions.
2799 unsigned int A_row_start,
2800 unsigned int A_col_start,
2801 unsigned int A_row_inc,
2802 unsigned int A_col_inc,
2803 unsigned int A_row_size,
2804 unsigned int A_col_size,
2805 unsigned int A_internal_rows,
2806 unsigned int A_internal_cols,
// Same description for B.
2808 unsigned int B_row_start,
2809 unsigned int B_col_start,
2810 unsigned int B_row_inc,
2811 unsigned int B_col_inc,
2812 unsigned int B_row_size,
2813 unsigned int B_col_size,
2814 unsigned int B_internal_rows,
2815 unsigned int B_internal_cols,
// Same description for the result C.
2818 unsigned int C_row_start,
2819 unsigned int C_col_start,
2820 unsigned int C_row_inc,
2821 unsigned int C_col_inc,
2822 unsigned int C_row_size,
2823 unsigned int C_col_size,
2824 unsigned int C_internal_rows,
2825 unsigned int C_internal_cols)
// Shared-memory staging tiles for A and B. 272 = 16 * 17; rows are addressed below with a stride of
// (block_size + 1), so this is presumably a 16 x 16 tile with one padding entry per row
// (block_size is defined outside this listing -- TODO confirm).
2828 __shared__ NumericT bufA[272];
2829 __shared__ NumericT bufB[272];
// A is addressed with A_internal_cols as the row stride; the block index advances along A's columns.
2836 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
// Offset of block_size rows of A; presumably added to aBegin once per tile iteration (loop not visible).
2837 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
// B is addressed with B_internal_cols as the row stride; the block index advances along B's rows.
2838 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
// Number of tiles along the summation dimension: A_row_size / block_size, rounded up.
2840 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
// Per-thread offsets inside the current tile of A and B.
2842 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2843 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
// Start positions of this thread's rows inside the padded shared tiles.
2845 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2846 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
// Stage one entry of A and one of B per thread; positions outside the logical matrix range are
// stored as 0 so they add nothing to the sum. `block` is presumably the tile-loop counter.
2851 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2852 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Sixteen unrolled multiply-accumulate steps over consecutive entries of this thread's bufA row
// and bufB row.
2854 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2855 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2856 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2857 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2858 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2859 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2860 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2861 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2862 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2863 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2864 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2865 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2866 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2867 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2868 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2869 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2870 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2871 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Store only for threads whose global (x, y) index lies inside the result range
// (x bounded by A_col_size, y bounded by B_row_size).
2876 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
// C is addressed with C_internal_cols as the row stride. When beta == 0 the old C entry is not read.
2877 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
__global__ void matrix_matrix_col_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
__global__ void matrix_matrix_col_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_col_col_col_prod_AT_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `col_col_col` and `AT` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_col_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_row_row_row_prod_TT_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `row_row_row` and `TT` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_row_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_row_col_row_prod_AA_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `row_col_row` and `AA` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_row_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_row_col_col_prod_AA_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `row_col_col` and `AA` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_row_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_col_col_row_prod_TT_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `col_col_row` and `TT` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_col_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_row_row_col_prod_TT_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `row_row_col` and `TT` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_row_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_row_row_row_prod_AT_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `row_row_row` and `AT` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_row_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_col_row_col_prod_AT_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `col_row_col` and `AT` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_col_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_col_row_row_prod_TT_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `col_row_row` and `TT` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_col_col_col_prod_TA_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `col_col_col` and `TA` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_col_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_col_row_col_prod_AA_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `col_row_col` and `AA` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_col_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_row_col_row_prod_TA_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `row_col_row` and `TA` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_row_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// Signature of the kernel entry point `matrix_matrix_row_col_row_prod_AT_kernel` (declared `__global__`, returns void).
// Parameters, as declared:
//   alpha, beta - scalars of type NumericT
//   A, B        - pointers to const NumericT (the kernel cannot write through them)
//   C           - pointer to non-const NumericT
//   X_row_start, X_col_start, X_row_inc, X_col_inc, X_row_size, X_col_size, X_internal_rows, X_internal_cols
//               - eight unsigned int descriptors for each of X = A, B, C; presumably offset, stride,
//                 logical extent and allocated extent of the operand - TODO confirm against the body.
// NOTE(review): the `row_col_row` and `AT` tags in the name presumably encode the operands' storage layouts and
//               which operands are transposed; the body is not visible here, so confirm before relying on it.
// NOTE(review): NumericT is not declared on this line; presumably supplied by a preceding `template<typename NumericT>` - confirm.
__global__ void matrix_matrix_row_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)