ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
matrix_operations_prod.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
28 namespace viennacl
29 {
30 namespace linalg
31 {
32 namespace cuda
33 {
34 
// matrix-matrix multiplication C = A * B
// matrix layouts: C...col_major, A...col_major, B...col_major
//
// Computes C = alpha * A * B + beta * C on the sub-matrix ranges described by
// the (start, inc, size, internal-size) quadruples for each operand.
// Expects a 16x16 thread block (block_size is hard-coded to 16) and a grid of
// ceil(A_row_size/16) x ceil(B_col_size/16) blocks -- TODO confirm at launch site.
// Shared tiles are padded to 16x17 (272 elements) to avoid bank conflicts.
// NOTE(review): the __global__ signature line was lost in extraction; the kernel
// name is reconstructed from the ViennaCL naming scheme -- verify against upstream.
template<typename NumericT>
__global__ void matrix_matrix_col_col_col_prod_AA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16x17-padded shared tiles (the +1 column breaks bank conflicts on column access)
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // first element of this block's row-panel of A (col-major: column stride is A_internal_rows)
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;   // advance A by 16 columns
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep = block_size * B_row_inc;                     // advance B by 16 rows
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;  // tiles along the k dimension
  NumericT Csub = 0;
  // per-thread offset inside the current 16x16 tile of A and B
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // stage one tile of A and B into shared memory, zero-padding past the matrix edges
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // fully unrolled 16-term dot product over the staged tiles
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();   // tiles fully consumed before the next iteration overwrites them
    aBegin += aStep;
    bBegin += bStep;
  }
  // guarded write-back (grid tail); beta == 0 skips reading possibly uninitialized C
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}
121 
// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...col_major, A...col_major, B...col_major
//
// Computes C = alpha * A * B^T + beta * C on the sub-matrix ranges described by
// the (start, inc, size, internal-size) quadruples for each operand.
// Expects a 16x16 thread block (block_size is hard-coded to 16) and a grid of
// ceil(A_row_size/16) x ceil(B_row_size/16) blocks -- TODO confirm at launch site.
// Shared tiles are padded to 16x17 (272 elements) to avoid bank conflicts.
// NOTE(review): the __global__ signature line was lost in extraction; the kernel
// name is reconstructed from the ViennaCL naming scheme -- verify against upstream.
template<typename NumericT>
__global__ void matrix_matrix_col_col_col_prod_AT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16x17-padded shared tiles (the +1 column breaks bank conflicts on column access)
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;   // advance A by 16 columns
  // B is accessed transposed: walk rows of B along blockIdx.y, columns along k
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;   // advance B by 16 columns
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;  // tiles along the k dimension
  NumericT Csub = 0;
  // per-thread offset inside the current 16x16 tile of A and B
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // stage one tile of A and one (transposed-access) tile of B, zero-padded at the edges
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // fully unrolled 16-term dot product over the staged tiles
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();   // tiles fully consumed before the next iteration overwrites them
    aBegin += aStep;
    bBegin += bStep;
  }
  // guarded write-back (grid tail); note the column bound is B_row_size since B is transposed
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}
208 
// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...col_major, A...col_major, B...col_major
//
// Computes C = alpha * A^T * B + beta * C on the sub-matrix ranges described by
// the (start, inc, size, internal-size) quadruples for each operand.
// Expects a 16x16 thread block (block_size is hard-coded to 16) and a grid of
// ceil(A_col_size/16) x ceil(B_col_size/16) blocks -- TODO confirm at launch site.
// Shared tiles are padded to 16x17 (272 elements) to avoid bank conflicts.
// NOTE(review): the __global__ signature line was lost in extraction; the kernel
// name is reconstructed from the ViennaCL naming scheme -- verify against upstream.
template<typename NumericT>
__global__ void matrix_matrix_col_col_col_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16x17-padded shared tiles (the +1 column breaks bank conflicts on column access)
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is accessed transposed: walk columns of A along blockIdx.x, rows along k
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
  vcl_size_t aStep = block_size * A_row_inc;                     // advance A by 16 rows
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep = block_size * B_row_inc;                     // advance B by 16 rows
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;  // k dimension is A's rows here
  NumericT Csub = 0;
  // per-thread offset inside the current 16x16 tile of A and B
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // stage one (transposed-access) tile of A and one tile of B, zero-padded at the edges
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // fully unrolled 16-term dot product over the staged tiles
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();   // tiles fully consumed before the next iteration overwrites them
    aBegin += aStep;
    bBegin += bStep;
  }
  // guarded write-back (grid tail); note the row bound is A_col_size since A is transposed
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}
295 
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...col_major, A...col_major, B...col_major
//
// Computes C = alpha * A^T * B^T + beta * C on the sub-matrix ranges described by
// the (start, inc, size, internal-size) quadruples for each operand.
// Expects a 16x16 thread block (block_size is hard-coded to 16) and a grid of
// ceil(A_col_size/16) x ceil(B_row_size/16) blocks -- TODO confirm at launch site.
// Shared tiles are padded to 16x17 (272 elements) to avoid bank conflicts.
// NOTE(review): the __global__ signature line was lost in extraction; the kernel
// name is reconstructed from the ViennaCL naming scheme -- verify against upstream.
template<typename NumericT>
__global__ void matrix_matrix_col_col_col_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16x17-padded shared tiles (the +1 column breaks bank conflicts on column access)
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // both operands are accessed transposed
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
  vcl_size_t aStep = block_size * A_row_inc;                     // advance A by 16 rows
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;   // advance B by 16 columns
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;  // k dimension is A's rows here
  NumericT Csub = 0;
  // per-thread offset inside the current 16x16 tile of A and B
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // stage one transposed-access tile of A and of B, zero-padded at the edges
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // fully unrolled 16-term dot product over the staged tiles
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();   // tiles fully consumed before the next iteration overwrites them
    aBegin += aStep;
    bBegin += bStep;
  }
  // guarded write-back (grid tail); bounds use the transposed extents of A and B
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}
382 
383 
384 
386 
387 
388 
389 
// matrix-matrix multiplication C = A * B
// matrix layouts: C...row_major, A...col_major, B...col_major
//
// Computes C = alpha * A * B + beta * C on the sub-matrix ranges described by
// the (start, inc, size, internal-size) quadruples for each operand; differs
// from the all-col-major variant only in the row-major addressing of C.
// Expects a 16x16 thread block (block_size is hard-coded to 16) and a grid of
// ceil(A_row_size/16) x ceil(B_col_size/16) blocks -- TODO confirm at launch site.
// Shared tiles are padded to 16x17 (272 elements) to avoid bank conflicts.
// NOTE(review): the __global__ signature line was lost in extraction; the kernel
// name is reconstructed from the ViennaCL naming scheme -- verify against upstream.
template<typename NumericT>
__global__ void matrix_matrix_row_col_col_prod_AA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16x17-padded shared tiles (the +1 column breaks bank conflicts on column access)
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;   // advance A by 16 columns
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep = block_size * B_row_inc;                     // advance B by 16 rows
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;  // tiles along the k dimension
  NumericT Csub = 0;
  // per-thread offset inside the current 16x16 tile of A and B
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // stage one tile of A and B into shared memory, zero-padding past the matrix edges
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // fully unrolled 16-term dot product over the staged tiles
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();   // tiles fully consumed before the next iteration overwrites them
    aBegin += aStep;
    bBegin += bStep;
  }
  // guarded write-back; C is row-major here (row index scaled by C_internal_cols)
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
476 
// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...row_major, A...col_major, B...col_major
//
// Computes C = alpha * A * B^T + beta * C on the sub-matrix ranges described by
// the (start, inc, size, internal-size) quadruples for each operand; differs
// from the all-col-major variant only in the row-major addressing of C.
// Expects a 16x16 thread block (block_size is hard-coded to 16) and a grid of
// ceil(A_row_size/16) x ceil(B_row_size/16) blocks -- TODO confirm at launch site.
// Shared tiles are padded to 16x17 (272 elements) to avoid bank conflicts.
// NOTE(review): the __global__ signature line was lost in extraction; the kernel
// name is reconstructed from the ViennaCL naming scheme -- verify against upstream.
template<typename NumericT>
__global__ void matrix_matrix_row_col_col_prod_AT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16x17-padded shared tiles (the +1 column breaks bank conflicts on column access)
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;   // advance A by 16 columns
  // B is accessed transposed: walk rows of B along blockIdx.y, columns along k
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;   // advance B by 16 columns
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;  // tiles along the k dimension
  NumericT Csub = 0;
  // per-thread offset inside the current 16x16 tile of A and B
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // stage one tile of A and one (transposed-access) tile of B, zero-padded at the edges
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // fully unrolled 16-term dot product over the staged tiles
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();   // tiles fully consumed before the next iteration overwrites them
    aBegin += aStep;
    bBegin += bStep;
  }
  // guarded write-back; C is row-major, column bound is B_row_size since B is transposed
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
563 
// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...row_major, A...col_major, B...col_major
//
// Computes C = alpha * A^T * B + beta * C on the sub-matrix ranges described by
// the (start, inc, size, internal-size) quadruples for each operand; differs
// from the all-col-major variant only in the row-major addressing of C.
// Expects a 16x16 thread block (block_size is hard-coded to 16) and a grid of
// ceil(A_col_size/16) x ceil(B_col_size/16) blocks -- TODO confirm at launch site.
// Shared tiles are padded to 16x17 (272 elements) to avoid bank conflicts.
// NOTE(review): the __global__ signature line was lost in extraction; the kernel
// name is reconstructed from the ViennaCL naming scheme -- verify against upstream.
template<typename NumericT>
__global__ void matrix_matrix_row_col_col_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16x17-padded shared tiles (the +1 column breaks bank conflicts on column access)
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is accessed transposed: walk columns of A along blockIdx.x, rows along k
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
  vcl_size_t aStep = block_size * A_row_inc;                     // advance A by 16 rows
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep = block_size * B_row_inc;                     // advance B by 16 rows
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;  // k dimension is A's rows here
  NumericT Csub = 0;
  // per-thread offset inside the current 16x16 tile of A and B
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // stage one (transposed-access) tile of A and one tile of B, zero-padded at the edges
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // fully unrolled 16-term dot product over the staged tiles
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();   // tiles fully consumed before the next iteration overwrites them
    aBegin += aStep;
    bBegin += bStep;
  }
  // guarded write-back; C is row-major, row bound is A_col_size since A is transposed
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
650 
651 // matrix-matrix multiplication C = A^T * B^T
652 // matrix layouts: C...row_major, A...col_major, B...col_major
// Tiled GEMM kernel computing C = alpha * A^T * B^T + beta * C with 16x16
// thread blocks: each loop iteration stages one 16x16 tile of each operand
// in shared memory, and each thread accumulates exactly one element of C.
// The *_start / *_inc / *_size parameters select a strided sub-matrix view;
// the *_internal_* parameters are the padded storage (leading) dimensions
// of the underlying buffers.
// NOTE(review): the doxygen listing this chunk was extracted from dropped
// the "__global__ void ...(" signature line (listing line 654), so the
// kernel's name is not visible here; all code below is verbatim.
653 template<typename NumericT>
 655  NumericT alpha,
 656  const NumericT * A,
 657  unsigned int A_row_start,
 658  unsigned int A_col_start,
 659  unsigned int A_row_inc,
 660  unsigned int A_col_inc,
 661  unsigned int A_row_size,
 662  unsigned int A_col_size,
 663  unsigned int A_internal_rows,
 664  unsigned int A_internal_cols,
 665  const NumericT * B,
 666  unsigned int B_row_start,
 667  unsigned int B_col_start,
 668  unsigned int B_row_inc,
 669  unsigned int B_col_inc,
 670  unsigned int B_row_size,
 671  unsigned int B_col_size,
 672  unsigned int B_internal_rows,
 673  unsigned int B_internal_cols,
 674  NumericT beta,
 675  NumericT * C,
 676  unsigned int C_row_start,
 677  unsigned int C_col_start,
 678  unsigned int C_row_inc,
 679  unsigned int C_col_inc,
 680  unsigned int C_row_size,
 681  unsigned int C_col_size,
 682  unsigned int C_internal_rows,
 683  unsigned int C_internal_cols)
684 {
685 
// 272 = 16 * 17: tiles are stored with a row stride of block_size + 1 so
// that strided (column) accesses do not all hit the same shared-memory bank.
686  __shared__ NumericT bufA[272];
687  __shared__ NumericT bufB[272];
688 
689  vcl_size_t block_size = 16;//get_local_size(0);
690  vcl_size_t row_block_id = blockIdx.x;
691  vcl_size_t col_block_id = blockIdx.y;
692  vcl_size_t row_thread_id = threadIdx.x;
693  vcl_size_t col_thread_id = threadIdx.y;
// Linear offsets of this block's first tile inside A and B, plus the step
// that advances each of them by one tile along the reduction dimension.
694  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
695  vcl_size_t aStep = block_size * A_row_inc;
696  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
697  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Number of 16-wide tiles covering the reduction dimension (A's rows here,
// because A enters the product transposed).
698  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
699  NumericT Csub = 0;
// Per-thread offset of the single element this thread loads from each tile.
700  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
701  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
702 
// Start offsets of this thread's row within the padded (stride-17) tiles.
703  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
704  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
705  for (vcl_size_t block = 0;
706  block < block_num;
707  ++block)
708  {
// Stage the current tiles; elements outside the sub-matrix are zero-filled,
// so partially covered edge tiles contribute nothing extra to the sum.
709  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
710  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// All tile loads must be visible before any thread reads them.
711  __syncthreads();
712  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
713  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled 16-term inner product over the staged tiles.
714  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
715  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
716  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
717  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
718  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
719  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
720  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
721  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
722  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
723  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
724  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
725  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
726  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
727  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
728  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
729  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Everyone must finish reading before the tiles are overwritten next pass.
730  __syncthreads();
731  aBegin += aStep;
732  bBegin += bStep;
733  }
// Only threads mapping to a valid result element write back (row-major C).
// The exact beta == 0 test is deliberate: it skips reading C entirely, so
// C may be uninitialized when beta is zero.
734  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
735  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
736 }
737 
738 
739 
740 
742 
743 
744 
745 
746 // matrix-matrix multiplication C = A * B
747 // matrix layouts: C...col_major, A...col_major, B...row_major
// Tiled GEMM kernel computing C = alpha * A * B + beta * C with 16x16
// thread blocks: each loop iteration stages one 16x16 tile of each operand
// in shared memory, and each thread accumulates exactly one element of C.
// The *_start / *_inc / *_size parameters select a strided sub-matrix view;
// the *_internal_* parameters are the padded storage (leading) dimensions
// of the underlying buffers.
// NOTE(review): the doxygen listing this chunk was extracted from dropped
// the "__global__ void ...(" signature line (listing line 749), so the
// kernel's name is not visible here; all code below is verbatim.
748 template<typename NumericT>
 750  NumericT alpha,
 751  const NumericT * A,
 752  unsigned int A_row_start,
 753  unsigned int A_col_start,
 754  unsigned int A_row_inc,
 755  unsigned int A_col_inc,
 756  unsigned int A_row_size,
 757  unsigned int A_col_size,
 758  unsigned int A_internal_rows,
 759  unsigned int A_internal_cols,
 760  const NumericT * B,
 761  unsigned int B_row_start,
 762  unsigned int B_col_start,
 763  unsigned int B_row_inc,
 764  unsigned int B_col_inc,
 765  unsigned int B_row_size,
 766  unsigned int B_col_size,
 767  unsigned int B_internal_rows,
 768  unsigned int B_internal_cols,
 769  NumericT beta,
 770  NumericT * C,
 771  unsigned int C_row_start,
 772  unsigned int C_col_start,
 773  unsigned int C_row_inc,
 774  unsigned int C_col_inc,
 775  unsigned int C_row_size,
 776  unsigned int C_col_size,
 777  unsigned int C_internal_rows,
 778  unsigned int C_internal_cols)
779 {
780 
// 272 = 16 * 17: tiles are stored with a row stride of block_size + 1 so
// that strided (column) accesses do not all hit the same shared-memory bank.
781  __shared__ NumericT bufA[272];
782  __shared__ NumericT bufB[272];
783 
784  vcl_size_t block_size = 16;//get_local_size(0);
785  vcl_size_t row_block_id = blockIdx.x;
786  vcl_size_t col_block_id = blockIdx.y;
787  vcl_size_t row_thread_id = threadIdx.x;
788  vcl_size_t col_thread_id = threadIdx.y;
// Linear offsets of this block's first tile inside A and B, plus the step
// that advances each of them by one tile along the reduction dimension.
789  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
790  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
791  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
792  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of 16-wide tiles covering the reduction dimension (A's columns
// here, because A enters the product untransposed).
793  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
794  NumericT Csub = 0;
// Per-thread offset of the single element this thread loads from each tile.
795  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
796  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
797 
// Start offsets of this thread's row within the padded (stride-17) tiles.
798  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
799  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
800  for (vcl_size_t block = 0;
801  block < block_num;
802  ++block)
803  {
// Stage the current tiles; elements outside the sub-matrix are zero-filled,
// so partially covered edge tiles contribute nothing extra to the sum.
804  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
805  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// All tile loads must be visible before any thread reads them.
806  __syncthreads();
807  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
808  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled 16-term inner product over the staged tiles.
809  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
810  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
811  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
812  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
813  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
814  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
815  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
816  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
817  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
818  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
819  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
820  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
821  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
822  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
823  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
824  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Everyone must finish reading before the tiles are overwritten next pass.
825  __syncthreads();
826  aBegin += aStep;
827  bBegin += bStep;
828  }
// Only threads mapping to a valid result element write back (col-major C).
// The exact beta == 0 test is deliberate: it skips reading C entirely, so
// C may be uninitialized when beta is zero.
829  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
830  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
831 }
832 
833 // matrix-matrix multiplication C = A * B^T
834 // matrix layouts: C...col_major, A...col_major, B...row_major
// Tiled GEMM kernel computing C = alpha * A * B^T + beta * C with 16x16
// thread blocks: each loop iteration stages one 16x16 tile of each operand
// in shared memory, and each thread accumulates exactly one element of C.
// The *_start / *_inc / *_size parameters select a strided sub-matrix view;
// the *_internal_* parameters are the padded storage (leading) dimensions
// of the underlying buffers.
// NOTE(review): the doxygen listing this chunk was extracted from dropped
// the "__global__ void ...(" signature line (listing line 836), so the
// kernel's name is not visible here; all code below is verbatim.
835 template<typename NumericT>
 837  NumericT alpha,
 838  const NumericT * A,
 839  unsigned int A_row_start,
 840  unsigned int A_col_start,
 841  unsigned int A_row_inc,
 842  unsigned int A_col_inc,
 843  unsigned int A_row_size,
 844  unsigned int A_col_size,
 845  unsigned int A_internal_rows,
 846  unsigned int A_internal_cols,
 847  const NumericT * B,
 848  unsigned int B_row_start,
 849  unsigned int B_col_start,
 850  unsigned int B_row_inc,
 851  unsigned int B_col_inc,
 852  unsigned int B_row_size,
 853  unsigned int B_col_size,
 854  unsigned int B_internal_rows,
 855  unsigned int B_internal_cols,
 856  NumericT beta,
 857  NumericT * C,
 858  unsigned int C_row_start,
 859  unsigned int C_col_start,
 860  unsigned int C_row_inc,
 861  unsigned int C_col_inc,
 862  unsigned int C_row_size,
 863  unsigned int C_col_size,
 864  unsigned int C_internal_rows,
 865  unsigned int C_internal_cols)
866 {
867 
// 272 = 16 * 17: tiles are stored with a row stride of block_size + 1 so
// that strided (column) accesses do not all hit the same shared-memory bank.
868  __shared__ NumericT bufA[272];
869  __shared__ NumericT bufB[272];
870 
871  vcl_size_t block_size = 16;//get_local_size(0);
872  vcl_size_t row_block_id = blockIdx.x;
873  vcl_size_t col_block_id = blockIdx.y;
874  vcl_size_t row_thread_id = threadIdx.x;
875  vcl_size_t col_thread_id = threadIdx.y;
// Linear offsets of this block's first tile inside A and B, plus the step
// that advances each of them by one tile along the reduction dimension.
876  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
877  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
878  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
879  vcl_size_t bStep = block_size * B_col_inc;
// Number of 16-wide tiles covering the reduction dimension (A's columns
// here, because A enters the product untransposed).
880  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
881  NumericT Csub = 0;
// Per-thread offset of the single element this thread loads from each tile.
882  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
883  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
884 
// Start offsets of this thread's row within the padded (stride-17) tiles.
885  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
886  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
887  for (vcl_size_t block = 0;
888  block < block_num;
889  ++block)
890  {
// Stage the current tiles; elements outside the sub-matrix are zero-filled,
// so partially covered edge tiles contribute nothing extra to the sum.
891  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
892  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// All tile loads must be visible before any thread reads them.
893  __syncthreads();
894  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
895  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled 16-term inner product over the staged tiles.
896  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
897  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
898  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
899  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
900  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
901  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
902  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
903  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
904  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
905  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
906  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
907  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
908  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
909  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
910  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
911  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Everyone must finish reading before the tiles are overwritten next pass.
912  __syncthreads();
913  aBegin += aStep;
914  bBegin += bStep;
915  }
// Only threads mapping to a valid result element write back (col-major C).
// The exact beta == 0 test is deliberate: it skips reading C entirely, so
// C may be uninitialized when beta is zero.
916  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
917  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
918 }
919 
920 // matrix-matrix multiplication C = A^T * B
921 // matrix layouts: C...col_major, A...col_major, B...row_major
// Tiled GEMM kernel computing C = alpha * A^T * B + beta * C with 16x16
// thread blocks: each loop iteration stages one 16x16 tile of each operand
// in shared memory, and each thread accumulates exactly one element of C.
// The *_start / *_inc / *_size parameters select a strided sub-matrix view;
// the *_internal_* parameters are the padded storage (leading) dimensions
// of the underlying buffers.
// NOTE(review): the doxygen listing this chunk was extracted from dropped
// the "__global__ void ...(" signature line (listing line 923), so the
// kernel's name is not visible here; all code below is verbatim.
922 template<typename NumericT>
 924  NumericT alpha,
 925  const NumericT * A,
 926  unsigned int A_row_start,
 927  unsigned int A_col_start,
 928  unsigned int A_row_inc,
 929  unsigned int A_col_inc,
 930  unsigned int A_row_size,
 931  unsigned int A_col_size,
 932  unsigned int A_internal_rows,
 933  unsigned int A_internal_cols,
 934  const NumericT * B,
 935  unsigned int B_row_start,
 936  unsigned int B_col_start,
 937  unsigned int B_row_inc,
 938  unsigned int B_col_inc,
 939  unsigned int B_row_size,
 940  unsigned int B_col_size,
 941  unsigned int B_internal_rows,
 942  unsigned int B_internal_cols,
 943  NumericT beta,
 944  NumericT * C,
 945  unsigned int C_row_start,
 946  unsigned int C_col_start,
 947  unsigned int C_row_inc,
 948  unsigned int C_col_inc,
 949  unsigned int C_row_size,
 950  unsigned int C_col_size,
 951  unsigned int C_internal_rows,
 952  unsigned int C_internal_cols)
953 {
954 
// 272 = 16 * 17: tiles are stored with a row stride of block_size + 1 so
// that strided (column) accesses do not all hit the same shared-memory bank.
955  __shared__ NumericT bufA[272];
956  __shared__ NumericT bufB[272];
957 
958  vcl_size_t block_size = 16;//get_local_size(0);
959  vcl_size_t row_block_id = blockIdx.x;
960  vcl_size_t col_block_id = blockIdx.y;
961  vcl_size_t row_thread_id = threadIdx.x;
962  vcl_size_t col_thread_id = threadIdx.y;
// Linear offsets of this block's first tile inside A and B, plus the step
// that advances each of them by one tile along the reduction dimension.
963  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
964  vcl_size_t aStep = block_size * A_row_inc;
965  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
966  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of 16-wide tiles covering the reduction dimension (A's rows here,
// because A enters the product transposed).
967  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
968  NumericT Csub = 0;
// Per-thread offset of the single element this thread loads from each tile.
969  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
970  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
971 
// Start offsets of this thread's row within the padded (stride-17) tiles.
972  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
973  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
974  for (vcl_size_t block = 0;
975  block < block_num;
976  ++block)
977  {
// Stage the current tiles; elements outside the sub-matrix are zero-filled,
// so partially covered edge tiles contribute nothing extra to the sum.
978  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
979  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// All tile loads must be visible before any thread reads them.
980  __syncthreads();
981  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
982  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled 16-term inner product over the staged tiles.
983  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
984  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
985  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
986  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
987  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
988  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
989  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
990  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
991  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
992  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
993  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
994  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
995  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
996  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
997  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
998  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Everyone must finish reading before the tiles are overwritten next pass.
999  __syncthreads();
1000  aBegin += aStep;
1001  bBegin += bStep;
1002  }
// Only threads mapping to a valid result element write back (col-major C).
// The exact beta == 0 test is deliberate: it skips reading C entirely, so
// C may be uninitialized when beta is zero.
1003  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1004  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1005 }
1006 
1007 // matrix-matrix multiplication C = A^T * B^T
1008 // matrix layouts: C...col_major, A...col_major, B...row_major
// Tiled GEMM kernel computing C = alpha * A^T * B^T + beta * C with 16x16
// thread blocks: each loop iteration stages one 16x16 tile of each operand
// in shared memory, and each thread accumulates exactly one element of C.
// The *_start / *_inc / *_size parameters select a strided sub-matrix view;
// the *_internal_* parameters are the padded storage (leading) dimensions
// of the underlying buffers.
// NOTE(review): the doxygen listing this chunk was extracted from dropped
// the "__global__ void ...(" signature line (listing line 1010), so the
// kernel's name is not visible here; all code below is verbatim.
1009 template<typename NumericT>
 1011  NumericT alpha,
 1012  const NumericT * A,
 1013  unsigned int A_row_start,
 1014  unsigned int A_col_start,
 1015  unsigned int A_row_inc,
 1016  unsigned int A_col_inc,
 1017  unsigned int A_row_size,
 1018  unsigned int A_col_size,
 1019  unsigned int A_internal_rows,
 1020  unsigned int A_internal_cols,
 1021  const NumericT * B,
 1022  unsigned int B_row_start,
 1023  unsigned int B_col_start,
 1024  unsigned int B_row_inc,
 1025  unsigned int B_col_inc,
 1026  unsigned int B_row_size,
 1027  unsigned int B_col_size,
 1028  unsigned int B_internal_rows,
 1029  unsigned int B_internal_cols,
 1030  NumericT beta,
 1031  NumericT * C,
 1032  unsigned int C_row_start,
 1033  unsigned int C_col_start,
 1034  unsigned int C_row_inc,
 1035  unsigned int C_col_inc,
 1036  unsigned int C_row_size,
 1037  unsigned int C_col_size,
 1038  unsigned int C_internal_rows,
 1039  unsigned int C_internal_cols)
1040 {
1041 
// 272 = 16 * 17: tiles are stored with a row stride of block_size + 1 so
// that strided (column) accesses do not all hit the same shared-memory bank.
1042  __shared__ NumericT bufA[272];
1043  __shared__ NumericT bufB[272];
1044 
1045  vcl_size_t block_size = 16;//get_local_size(0);
1046  vcl_size_t row_block_id = blockIdx.x;
1047  vcl_size_t col_block_id = blockIdx.y;
1048  vcl_size_t row_thread_id = threadIdx.x;
1049  vcl_size_t col_thread_id = threadIdx.y;
// Linear offsets of this block's first tile inside A and B, plus the step
// that advances each of them by one tile along the reduction dimension.
1050  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1051  vcl_size_t aStep = block_size * A_row_inc;
1052  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1053  vcl_size_t bStep = block_size * B_col_inc;
// Number of 16-wide tiles covering the reduction dimension (A's rows here,
// because A enters the product transposed).
1054  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1055  NumericT Csub = 0;
// Per-thread offset of the single element this thread loads from each tile.
1056  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1057  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1058 
// Start offsets of this thread's row within the padded (stride-17) tiles.
1059  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1060  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1061  for (vcl_size_t block = 0;
1062  block < block_num;
1063  ++block)
1064  {
// Stage the current tiles; elements outside the sub-matrix are zero-filled,
// so partially covered edge tiles contribute nothing extra to the sum.
1065  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1066  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// All tile loads must be visible before any thread reads them.
1067  __syncthreads();
1068  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1069  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled 16-term inner product over the staged tiles.
1070  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1071  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1072  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1073  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1074  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1075  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1076  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1077  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1078  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1079  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1080  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1081  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1082  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1083  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1084  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1085  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Everyone must finish reading before the tiles are overwritten next pass.
1086  __syncthreads();
1087  aBegin += aStep;
1088  bBegin += bStep;
1089  }
// Only threads mapping to a valid result element write back (col-major C).
// The exact beta == 0 test is deliberate: it skips reading C entirely, so
// C may be uninitialized when beta is zero.
1090  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1091  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1092 }
1093 
1094 
1095 
1097 
1098 
1099 
1100 
1101 // matrix-matrix multiplication C = A * B
1102 // matrix layouts: C...row_major, A...col_major, B...row_major
// Tiled GEMM kernel computing C = alpha * A * B + beta * C with 16x16
// thread blocks: each loop iteration stages one 16x16 tile of each operand
// in shared memory, and each thread accumulates exactly one element of C.
// The *_start / *_inc / *_size parameters select a strided sub-matrix view;
// the *_internal_* parameters are the padded storage (leading) dimensions
// of the underlying buffers.
// NOTE(review): the doxygen listing this chunk was extracted from dropped
// the "__global__ void ...(" signature line (listing line 1104), so the
// kernel's name is not visible here; all code below is verbatim.
1103 template<typename NumericT>
 1105  NumericT alpha,
 1106  const NumericT * A,
 1107  unsigned int A_row_start,
 1108  unsigned int A_col_start,
 1109  unsigned int A_row_inc,
 1110  unsigned int A_col_inc,
 1111  unsigned int A_row_size,
 1112  unsigned int A_col_size,
 1113  unsigned int A_internal_rows,
 1114  unsigned int A_internal_cols,
 1115  const NumericT * B,
 1116  unsigned int B_row_start,
 1117  unsigned int B_col_start,
 1118  unsigned int B_row_inc,
 1119  unsigned int B_col_inc,
 1120  unsigned int B_row_size,
 1121  unsigned int B_col_size,
 1122  unsigned int B_internal_rows,
 1123  unsigned int B_internal_cols,
 1124  NumericT beta,
 1125  NumericT * C,
 1126  unsigned int C_row_start,
 1127  unsigned int C_col_start,
 1128  unsigned int C_row_inc,
 1129  unsigned int C_col_inc,
 1130  unsigned int C_row_size,
 1131  unsigned int C_col_size,
 1132  unsigned int C_internal_rows,
 1133  unsigned int C_internal_cols)
1134 {
1135 
// 272 = 16 * 17: tiles are stored with a row stride of block_size + 1 so
// that strided (column) accesses do not all hit the same shared-memory bank.
1136  __shared__ NumericT bufA[272];
1137  __shared__ NumericT bufB[272];
1138 
1139  vcl_size_t block_size = 16;//get_local_size(0);
1140  vcl_size_t row_block_id = blockIdx.x;
1141  vcl_size_t col_block_id = blockIdx.y;
1142  vcl_size_t row_thread_id = threadIdx.x;
1143  vcl_size_t col_thread_id = threadIdx.y;
// Linear offsets of this block's first tile inside A and B, plus the step
// that advances each of them by one tile along the reduction dimension.
1144  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
1145  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
1146  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
1147  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of 16-wide tiles covering the reduction dimension (A's columns
// here, because A enters the product untransposed).
1148  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1149  NumericT Csub = 0;
// Per-thread offset of the single element this thread loads from each tile.
1150  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1151  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1152 
// Start offsets of this thread's row within the padded (stride-17) tiles.
1153  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1154  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1155  for (vcl_size_t block = 0;
1156  block < block_num;
1157  ++block)
1158  {
// Stage the current tiles; elements outside the sub-matrix are zero-filled,
// so partially covered edge tiles contribute nothing extra to the sum.
1159  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1160  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// All tile loads must be visible before any thread reads them.
1161  __syncthreads();
1162  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1163  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled 16-term inner product over the staged tiles.
1164  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1165  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1166  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1167  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1168  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1169  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1170  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1171  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1172  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1173  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1174  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1175  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1176  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1177  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1178  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1179  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Everyone must finish reading before the tiles are overwritten next pass.
1180  __syncthreads();
1181  aBegin += aStep;
1182  bBegin += bStep;
1183  }
// Only threads mapping to a valid result element write back (row-major C).
// The exact beta == 0 test is deliberate: it skips reading C entirely, so
// C may be uninitialized when beta is zero.
1184  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1185  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1186 }
1187 
1188 // matrix-matrix multiplication C = A * B^T
1189 // matrix layouts: C...row_major, A...col_major, B...row_major
// Tiled GEMM kernel computing C = alpha * A * B^T + beta * C with 16x16
// thread blocks: each loop iteration stages one 16x16 tile of each operand
// in shared memory, and each thread accumulates exactly one element of C.
// The *_start / *_inc / *_size parameters select a strided sub-matrix view;
// the *_internal_* parameters are the padded storage (leading) dimensions
// of the underlying buffers.
// NOTE(review): the doxygen listing this chunk was extracted from dropped
// the "__global__ void ...(" signature line (listing line 1191), so the
// kernel's name is not visible here; all code below is verbatim.
1190 template<typename NumericT>
 1192  NumericT alpha,
 1193  const NumericT * A,
 1194  unsigned int A_row_start,
 1195  unsigned int A_col_start,
 1196  unsigned int A_row_inc,
 1197  unsigned int A_col_inc,
 1198  unsigned int A_row_size,
 1199  unsigned int A_col_size,
 1200  unsigned int A_internal_rows,
 1201  unsigned int A_internal_cols,
 1202  const NumericT * B,
 1203  unsigned int B_row_start,
 1204  unsigned int B_col_start,
 1205  unsigned int B_row_inc,
 1206  unsigned int B_col_inc,
 1207  unsigned int B_row_size,
 1208  unsigned int B_col_size,
 1209  unsigned int B_internal_rows,
 1210  unsigned int B_internal_cols,
 1211  NumericT beta,
 1212  NumericT * C,
 1213  unsigned int C_row_start,
 1214  unsigned int C_col_start,
 1215  unsigned int C_row_inc,
 1216  unsigned int C_col_inc,
 1217  unsigned int C_row_size,
 1218  unsigned int C_col_size,
 1219  unsigned int C_internal_rows,
 1220  unsigned int C_internal_cols)
1221 {
1222 
// 272 = 16 * 17: tiles are stored with a row stride of block_size + 1 so
// that strided (column) accesses do not all hit the same shared-memory bank.
1223  __shared__ NumericT bufA[272];
1224  __shared__ NumericT bufB[272];
1225 
1226  vcl_size_t block_size = 16;//get_local_size(0);
1227  vcl_size_t row_block_id = blockIdx.x;
1228  vcl_size_t col_block_id = blockIdx.y;
1229  vcl_size_t row_thread_id = threadIdx.x;
1230  vcl_size_t col_thread_id = threadIdx.y;
// Linear offsets of this block's first tile inside A and B, plus the step
// that advances each of them by one tile along the reduction dimension.
1231  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
1232  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
1233  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1234  vcl_size_t bStep = block_size * B_col_inc;
// Number of 16-wide tiles covering the reduction dimension (A's columns
// here, because A enters the product untransposed).
1235  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1236  NumericT Csub = 0;
// Per-thread offset of the single element this thread loads from each tile.
1237  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1238  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1239 
// Start offsets of this thread's row within the padded (stride-17) tiles.
1240  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1241  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1242  for (vcl_size_t block = 0;
1243  block < block_num;
1244  ++block)
1245  {
// Stage the current tiles; elements outside the sub-matrix are zero-filled,
// so partially covered edge tiles contribute nothing extra to the sum.
1246  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1247  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// All tile loads must be visible before any thread reads them.
1248  __syncthreads();
1249  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1250  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled 16-term inner product over the staged tiles.
1251  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1252  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1253  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1254  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1255  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1256  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1257  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1258  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1259  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1260  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1261  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1262  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1263  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1264  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1265  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1266  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Everyone must finish reading before the tiles are overwritten next pass.
1267  __syncthreads();
1268  aBegin += aStep;
1269  bBegin += bStep;
1270  }
// Only threads mapping to a valid result element write back (row-major C).
// The exact beta == 0 test is deliberate: it skips reading C entirely, so
// C may be uninitialized when beta is zero.
1271  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1272  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1273 }
1274 
1275 // matrix-matrix multiplication C = A^T * B
1276 // matrix layouts: C...row_major, A...col_major, B...row_major
// Tiled GEMM kernel computing C = alpha * A^T * B + beta * C with 16x16
// thread blocks: each loop iteration stages one 16x16 tile of each operand
// in shared memory, and each thread accumulates exactly one element of C.
// The *_start / *_inc / *_size parameters select a strided sub-matrix view;
// the *_internal_* parameters are the padded storage (leading) dimensions
// of the underlying buffers.
// NOTE(review): the doxygen listing this chunk was extracted from dropped
// the "__global__ void ...(" signature line (listing line 1278), so the
// kernel's name is not visible here; all code below is verbatim.
1277 template<typename NumericT>
 1279  NumericT alpha,
 1280  const NumericT * A,
 1281  unsigned int A_row_start,
 1282  unsigned int A_col_start,
 1283  unsigned int A_row_inc,
 1284  unsigned int A_col_inc,
 1285  unsigned int A_row_size,
 1286  unsigned int A_col_size,
 1287  unsigned int A_internal_rows,
 1288  unsigned int A_internal_cols,
 1289  const NumericT * B,
 1290  unsigned int B_row_start,
 1291  unsigned int B_col_start,
 1292  unsigned int B_row_inc,
 1293  unsigned int B_col_inc,
 1294  unsigned int B_row_size,
 1295  unsigned int B_col_size,
 1296  unsigned int B_internal_rows,
 1297  unsigned int B_internal_cols,
 1298  NumericT beta,
 1299  NumericT * C,
 1300  unsigned int C_row_start,
 1301  unsigned int C_col_start,
 1302  unsigned int C_row_inc,
 1303  unsigned int C_col_inc,
 1304  unsigned int C_row_size,
 1305  unsigned int C_col_size,
 1306  unsigned int C_internal_rows,
 1307  unsigned int C_internal_cols)
1308 {
1309 
// 272 = 16 * 17: tiles are stored with a row stride of block_size + 1 so
// that strided (column) accesses do not all hit the same shared-memory bank.
1310  __shared__ NumericT bufA[272];
1311  __shared__ NumericT bufB[272];
1312 
1313  vcl_size_t block_size = 16;//get_local_size(0);
1314  vcl_size_t row_block_id = blockIdx.x;
1315  vcl_size_t col_block_id = blockIdx.y;
1316  vcl_size_t row_thread_id = threadIdx.x;
1317  vcl_size_t col_thread_id = threadIdx.y;
// Linear offsets of this block's first tile inside A and B, plus the step
// that advances each of them by one tile along the reduction dimension.
1318  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1319  vcl_size_t aStep = block_size * A_row_inc;
1320  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
1321  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
// Number of 16-wide tiles covering the reduction dimension (A's rows here,
// because A enters the product transposed).
1322  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1323  NumericT Csub = 0;
// Per-thread offset of the single element this thread loads from each tile.
1324  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1325  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1326 
// Start offsets of this thread's row within the padded (stride-17) tiles.
1327  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1328  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1329  for (vcl_size_t block = 0;
1330  block < block_num;
1331  ++block)
1332  {
// Stage the current tiles; elements outside the sub-matrix are zero-filled,
// so partially covered edge tiles contribute nothing extra to the sum.
1333  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1334  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// All tile loads must be visible before any thread reads them.
1335  __syncthreads();
1336  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1337  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled 16-term inner product over the staged tiles.
1338  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1339  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1340  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1341  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1342  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1343  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1344  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1345  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1346  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1347  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1348  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1349  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1350  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1351  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1352  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1353  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Everyone must finish reading before the tiles are overwritten next pass.
1354  __syncthreads();
1355  aBegin += aStep;
1356  bBegin += bStep;
1357  }
// Only threads mapping to a valid result element write back (row-major C).
// The exact beta == 0 test is deliberate: it skips reading C entirely, so
// C may be uninitialized when beta is zero.
1358  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1359  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1360 }
1361 
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...row_major, A...col_major, B...row_major
//
// Computes C = alpha * A^T * B^T + beta * C with one 16x16 thread block per
// 16x16 tile of the (A_col_size x B_row_size) result.  Each loop iteration
// stages a 16x16 tile of A and of B in shared memory (row stride padded to
// 17 entries -> 16*17 = 272, which avoids shared-memory bank conflicts on
// the transposed accesses), then every thread accumulates a 16-wide inner
// product from the staged tiles.
//
// NOTE(review): the "__global__ void ...(" signature line was lost in the
// extracted listing; the kernel name below is reconstructed from ViennaCL's
// matrix_matrix_<layoutA>_<layoutB>_<layoutC>_prod_<transA><transB>_kernel
// convention -- confirm against the upstream matrix_operations_prod.hpp.
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  __shared__ NumericT bufA[272];   // 16x17 (padded) tile of A
  __shared__ NumericT bufB[272];   // 16x17 (padded) tile of B

  vcl_size_t block_size = 16; // tile edge; kernel assumes blockDim == (16, 16)
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is col_major (row stride A_internal_rows); A^T row-block -> A column-block.
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
  vcl_size_t aStep  = block_size * A_row_inc;
  // B is row_major (row stride B_internal_cols); B^T column-block -> B row-block.
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep  = block_size * B_col_inc;
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; // tiles along the k-dimension
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  // Padded (16 + 1) row strides into the shared-memory tiles.
  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage the tiles; out-of-range entries are zero-filled so that partial
    // border tiles contribute nothing to the inner product.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads(); // tiles fully staged before any thread reads them

    // 16-step inner product over the staged tiles.
    const NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    const NumericT * bufBptr = bufB + col_thread_id_times_block_size;
#pragma unroll
    for (int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads(); // keep tiles alive until every thread is done reading
    aBegin += aStep;
    bBegin += bStep;
  }

  // Guarded write-back (border blocks only partially cover C); C is row_major.
  // beta == 0 is special-cased so an uninitialized C is never read.
  vcl_size_t row = blockIdx.x * blockDim.x + threadIdx.x;
  vcl_size_t col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < A_col_size && col < B_row_size)
  {
    vcl_size_t idx = (row * C_row_inc + C_row_start) * C_internal_cols + col * C_col_inc + C_col_start;
    C[idx] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[idx];
  }
}
1448 
1449 
1450 
1451 
1452 
1454 
1455 
1456 
1457 
1458 
1459 
// matrix-matrix multiplication C = A * B
// matrix layouts: C...col_major, A...row_major, B...col_major
//
// Computes C = alpha * A * B + beta * C with one 16x16 thread block per
// 16x16 tile of the (A_row_size x B_col_size) result.  Each loop iteration
// stages a 16x16 tile of A and of B in shared memory (row stride padded to
// 17 entries -> 16*17 = 272, avoiding shared-memory bank conflicts), then
// every thread accumulates a 16-wide inner product from the staged tiles.
//
// NOTE(review): the "__global__ void ...(" signature line was lost in the
// extracted listing; the kernel name below is reconstructed from ViennaCL's
// matrix_matrix_<layoutA>_<layoutB>_<layoutC>_prod_<transA><transB>_kernel
// convention -- confirm against the upstream matrix_operations_prod.hpp.
template<typename NumericT>
__global__ void matrix_matrix_row_col_col_prod_AA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  __shared__ NumericT bufA[272];   // 16x17 (padded) tile of A
  __shared__ NumericT bufB[272];   // 16x17 (padded) tile of B

  vcl_size_t block_size = 16; // tile edge; kernel assumes blockDim == (16, 16)
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is row_major (row stride A_internal_cols): walk a row block along its columns.
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep  = block_size * A_col_inc;
  // B is col_major (column stride B_internal_rows): walk a column block down its rows.
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep  = block_size * B_row_inc;
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; // tiles along the k-dimension
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Padded (16 + 1) row strides into the shared-memory tiles.
  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage the tiles; out-of-range entries are zero-filled so that partial
    // border tiles contribute nothing to the inner product.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads(); // tiles fully staged before any thread reads them

    // 16-step inner product over the staged tiles.
    const NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    const NumericT * bufBptr = bufB + col_thread_id_times_block_size;
#pragma unroll
    for (int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads(); // keep tiles alive until every thread is done reading
    aBegin += aStep;
    bBegin += bStep;
  }

  // Guarded write-back (border blocks only partially cover C); C is col_major.
  // beta == 0 is special-cased so an uninitialized C is never read.
  vcl_size_t row = blockIdx.x * blockDim.x + threadIdx.x;
  vcl_size_t col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < A_row_size && col < B_col_size)
  {
    vcl_size_t idx = row * C_row_inc + C_row_start + (col * C_col_inc + C_col_start) * C_internal_rows;
    C[idx] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[idx];
  }
}
1546 
// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...col_major, A...row_major, B...col_major
//
// Computes C = alpha * A * B^T + beta * C with one 16x16 thread block per
// 16x16 tile of the (A_row_size x B_row_size) result.  Each loop iteration
// stages a 16x16 tile of A and of B in shared memory (row stride padded to
// 17 entries -> 16*17 = 272, avoiding shared-memory bank conflicts), then
// every thread accumulates a 16-wide inner product from the staged tiles.
//
// NOTE(review): the "__global__ void ...(" signature line was lost in the
// extracted listing; the kernel name below is reconstructed from ViennaCL's
// matrix_matrix_<layoutA>_<layoutB>_<layoutC>_prod_<transA><transB>_kernel
// convention -- confirm against the upstream matrix_operations_prod.hpp.
template<typename NumericT>
__global__ void matrix_matrix_row_col_col_prod_AT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  __shared__ NumericT bufA[272];   // 16x17 (padded) tile of A
  __shared__ NumericT bufB[272];   // 16x17 (padded) tile of B

  vcl_size_t block_size = 16; // tile edge; kernel assumes blockDim == (16, 16)
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is row_major (row stride A_internal_cols): walk a row block along its columns.
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep  = block_size * A_col_inc;
  // B is col_major; B^T column-block -> B row-block, advancing along B's columns.
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; // tiles along the k-dimension
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Padded (16 + 1) row strides into the shared-memory tiles.
  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage the tiles; out-of-range entries are zero-filled so that partial
    // border tiles contribute nothing to the inner product.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads(); // tiles fully staged before any thread reads them

    // 16-step inner product over the staged tiles.
    const NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    const NumericT * bufBptr = bufB + col_thread_id_times_block_size;
#pragma unroll
    for (int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads(); // keep tiles alive until every thread is done reading
    aBegin += aStep;
    bBegin += bStep;
  }

  // Guarded write-back (border blocks only partially cover C); C is col_major.
  // beta == 0 is special-cased so an uninitialized C is never read.
  vcl_size_t row = blockIdx.x * blockDim.x + threadIdx.x;
  vcl_size_t col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < A_row_size && col < B_row_size)
  {
    vcl_size_t idx = row * C_row_inc + C_row_start + (col * C_col_inc + C_col_start) * C_internal_rows;
    C[idx] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[idx];
  }
}
1633 
// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...col_major, A...row_major, B...col_major
//
// Computes C = alpha * A^T * B + beta * C with one 16x16 thread block per
// 16x16 tile of the (A_col_size x B_col_size) result.  Each loop iteration
// stages a 16x16 tile of A and of B in shared memory (row stride padded to
// 17 entries -> 16*17 = 272, avoiding shared-memory bank conflicts), then
// every thread accumulates a 16-wide inner product from the staged tiles.
//
// NOTE(review): the "__global__ void ...(" signature line was lost in the
// extracted listing; the kernel name below is reconstructed from ViennaCL's
// matrix_matrix_<layoutA>_<layoutB>_<layoutC>_prod_<transA><transB>_kernel
// convention -- confirm against the upstream matrix_operations_prod.hpp.
template<typename NumericT>
__global__ void matrix_matrix_row_col_col_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  __shared__ NumericT bufA[272];   // 16x17 (padded) tile of A
  __shared__ NumericT bufB[272];   // 16x17 (padded) tile of B

  vcl_size_t block_size = 16; // tile edge; kernel assumes blockDim == (16, 16)
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is row_major; A^T row-block -> A column-block, advancing down A's rows.
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  // B is col_major (column stride B_internal_rows): walk a column block down its rows.
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep  = block_size * B_row_inc;
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; // tiles along the k-dimension
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Padded (16 + 1) row strides into the shared-memory tiles.
  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage the tiles; out-of-range entries are zero-filled so that partial
    // border tiles contribute nothing to the inner product.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads(); // tiles fully staged before any thread reads them

    // 16-step inner product over the staged tiles.
    const NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    const NumericT * bufBptr = bufB + col_thread_id_times_block_size;
#pragma unroll
    for (int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads(); // keep tiles alive until every thread is done reading
    aBegin += aStep;
    bBegin += bStep;
  }

  // Guarded write-back (border blocks only partially cover C); C is col_major.
  // beta == 0 is special-cased so an uninitialized C is never read.
  vcl_size_t row = blockIdx.x * blockDim.x + threadIdx.x;
  vcl_size_t col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < A_col_size && col < B_col_size)
  {
    vcl_size_t idx = row * C_row_inc + C_row_start + (col * C_col_inc + C_col_start) * C_internal_rows;
    C[idx] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[idx];
  }
}
1720 
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...col_major, A...row_major, B...col_major
//
// Computes C = alpha * A^T * B^T + beta * C with one 16x16 thread block per
// 16x16 tile of the (A_col_size x B_row_size) result.  Each loop iteration
// stages a 16x16 tile of A and of B in shared memory (row stride padded to
// 17 entries -> 16*17 = 272, avoiding shared-memory bank conflicts), then
// every thread accumulates a 16-wide inner product from the staged tiles.
//
// NOTE(review): the "__global__ void ...(" signature line was lost in the
// extracted listing; the kernel name below is reconstructed from ViennaCL's
// matrix_matrix_<layoutA>_<layoutB>_<layoutC>_prod_<transA><transB>_kernel
// convention -- confirm against the upstream matrix_operations_prod.hpp.
template<typename NumericT>
__global__ void matrix_matrix_row_col_col_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  __shared__ NumericT bufA[272];   // 16x17 (padded) tile of A
  __shared__ NumericT bufB[272];   // 16x17 (padded) tile of B

  vcl_size_t block_size = 16; // tile edge; kernel assumes blockDim == (16, 16)
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is row_major; A^T row-block -> A column-block, advancing down A's rows.
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  // B is col_major; B^T column-block -> B row-block, advancing along B's columns.
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; // tiles along the k-dimension
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Padded (16 + 1) row strides into the shared-memory tiles.
  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage the tiles; out-of-range entries are zero-filled so that partial
    // border tiles contribute nothing to the inner product.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads(); // tiles fully staged before any thread reads them

    // 16-step inner product over the staged tiles.
    const NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    const NumericT * bufBptr = bufB + col_thread_id_times_block_size;
#pragma unroll
    for (int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads(); // keep tiles alive until every thread is done reading
    aBegin += aStep;
    bBegin += bStep;
  }

  // Guarded write-back (border blocks only partially cover C); C is col_major.
  // beta == 0 is special-cased so an uninitialized C is never read.
  vcl_size_t row = blockIdx.x * blockDim.x + threadIdx.x;
  vcl_size_t col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < A_col_size && col < B_row_size)
  {
    vcl_size_t idx = row * C_row_inc + C_row_start + (col * C_col_inc + C_col_start) * C_internal_rows;
    C[idx] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[idx];
  }
}
1807 
1808 
1809 
1810 
1812 
1813 
1814 
1815 
// matrix-matrix multiplication C = A * B
// matrix layouts: C...row_major, A...row_major, B...col_major
//
// Computes C = alpha * A * B + beta * C with one 16x16 thread block per
// 16x16 tile of the (A_row_size x B_col_size) result.  Each loop iteration
// stages a 16x16 tile of A and of B in shared memory (row stride padded to
// 17 entries -> 16*17 = 272, avoiding shared-memory bank conflicts), then
// every thread accumulates a 16-wide inner product from the staged tiles.
//
// NOTE(review): the "__global__ void ...(" signature line was lost in the
// extracted listing; the kernel name below is reconstructed from ViennaCL's
// matrix_matrix_<layoutA>_<layoutB>_<layoutC>_prod_<transA><transB>_kernel
// convention -- confirm against the upstream matrix_operations_prod.hpp.
template<typename NumericT>
__global__ void matrix_matrix_row_col_row_prod_AA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  __shared__ NumericT bufA[272];   // 16x17 (padded) tile of A
  __shared__ NumericT bufB[272];   // 16x17 (padded) tile of B

  vcl_size_t block_size = 16; // tile edge; kernel assumes blockDim == (16, 16)
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is row_major (row stride A_internal_cols): walk a row block along its columns.
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep  = block_size * A_col_inc;
  // B is col_major (column stride B_internal_rows): walk a column block down its rows.
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep  = block_size * B_row_inc;
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; // tiles along the k-dimension
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Padded (16 + 1) row strides into the shared-memory tiles.
  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage the tiles; out-of-range entries are zero-filled so that partial
    // border tiles contribute nothing to the inner product.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads(); // tiles fully staged before any thread reads them

    // 16-step inner product over the staged tiles.
    const NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    const NumericT * bufBptr = bufB + col_thread_id_times_block_size;
#pragma unroll
    for (int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads(); // keep tiles alive until every thread is done reading
    aBegin += aStep;
    bBegin += bStep;
  }

  // Guarded write-back (border blocks only partially cover C); C is row_major.
  // beta == 0 is special-cased so an uninitialized C is never read.
  vcl_size_t row = blockIdx.x * blockDim.x + threadIdx.x;
  vcl_size_t col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < A_row_size && col < B_col_size)
  {
    vcl_size_t idx = (row * C_row_inc + C_row_start) * C_internal_cols + col * C_col_inc + C_col_start;
    C[idx] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[idx];
  }
}
1902 
// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...row_major, A...row_major, B...col_major
//
// Computes C = alpha * A * B^T + beta * C with one 16x16 thread block per
// 16x16 tile of the (A_row_size x B_row_size) result.  Each loop iteration
// stages a 16x16 tile of A and of B in shared memory (row stride padded to
// 17 entries -> 16*17 = 272, avoiding shared-memory bank conflicts), then
// every thread accumulates a 16-wide inner product from the staged tiles.
//
// NOTE(review): the "__global__ void ...(" signature line was lost in the
// extracted listing; the kernel name below is reconstructed from ViennaCL's
// matrix_matrix_<layoutA>_<layoutB>_<layoutC>_prod_<transA><transB>_kernel
// convention -- confirm against the upstream matrix_operations_prod.hpp.
template<typename NumericT>
__global__ void matrix_matrix_row_col_row_prod_AT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  __shared__ NumericT bufA[272];   // 16x17 (padded) tile of A
  __shared__ NumericT bufB[272];   // 16x17 (padded) tile of B

  vcl_size_t block_size = 16; // tile edge; kernel assumes blockDim == (16, 16)
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is row_major (row stride A_internal_cols): walk a row block along its columns.
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep  = block_size * A_col_inc;
  // B is col_major; B^T column-block -> B row-block, advancing along B's columns.
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; // tiles along the k-dimension
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Padded (16 + 1) row strides into the shared-memory tiles.
  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage the tiles; out-of-range entries are zero-filled so that partial
    // border tiles contribute nothing to the inner product.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads(); // tiles fully staged before any thread reads them

    // 16-step inner product over the staged tiles.
    const NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    const NumericT * bufBptr = bufB + col_thread_id_times_block_size;
#pragma unroll
    for (int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads(); // keep tiles alive until every thread is done reading
    aBegin += aStep;
    bBegin += bStep;
  }

  // Guarded write-back (border blocks only partially cover C); C is row_major.
  // beta == 0 is special-cased so an uninitialized C is never read.
  vcl_size_t row = blockIdx.x * blockDim.x + threadIdx.x;
  vcl_size_t col = blockIdx.y * blockDim.y + threadIdx.y;
  if (row < A_row_size && col < B_row_size)
  {
    vcl_size_t idx = (row * C_row_inc + C_row_start) * C_internal_cols + col * C_col_inc + C_col_start;
    C[idx] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[idx];
  }
}
1989 
// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...row_major, A...row_major, B...col_major
//
// Tiled GEMM kernel: each 16x16 thread block accumulates one 16x16 tile of
// C = alpha * A^T * B + beta * C.  Requires blockDim == (16, 16): block_size
// is hard-coded below and the inner product is unrolled exactly 16 times.
// Shared tiles use a row stride of block_size + 1 = 17 (16 * 17 = 272
// entries); the padding column avoids shared-memory bank conflicts.
// NOTE(review): the declarator line was lost in extraction; the name is
// reconstructed from this file's <C>_<A>_<B>_prod_<tA><tB> naming scheme
// - confirm against upstream ViennaCL.
template<typename NumericT>
__global__ void matrix_matrix_row_row_col_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ NumericT bufA[272];  // 16 x 17 tile of A (one padding column)
  __shared__ NumericT bufB[272];  // 16 x 17 tile of B (one padding column)

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // Linear offset of this block's first tile in A/B, and the increment that
  // advances one tile along the reduction dimension (rows of A here).
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep = block_size * B_row_inc;
  // ceil-div: number of 16-wide tiles covering the reduction dimension
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the single element this thread stages per tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage one tile of each operand in shared memory; out-of-range entries
    // are zero-padded so the unrolled inner product below needs no guards.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();  // tiles fully staged before any thread reads them
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term inner product over the staged tiles.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();  // all reads done before the tiles are overwritten
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back with GEMM scaling; the beta == 0 path avoids reading
  // (possibly uninitialized) C.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2076 
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...row_major, A...row_major, B...col_major
//
// Tiled GEMM kernel: each 16x16 thread block accumulates one 16x16 tile of
// C = alpha * A^T * B^T + beta * C.  Requires blockDim == (16, 16):
// block_size is hard-coded below and the inner product is unrolled exactly
// 16 times.  Shared tiles use a row stride of block_size + 1 = 17
// (16 * 17 = 272 entries); the padding column avoids bank conflicts.
// NOTE(review): the declarator line was lost in extraction; the name is
// reconstructed from this file's <C>_<A>_<B>_prod_<tA><tB> naming scheme
// - confirm against upstream ViennaCL.
template<typename NumericT>
__global__ void matrix_matrix_row_row_col_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ NumericT bufA[272];  // 16 x 17 tile of A (one padding column)
  __shared__ NumericT bufB[272];  // 16 x 17 tile of B (one padding column)

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // Linear offset of this block's first tile in A/B, and the increment that
  // advances one tile along the reduction dimension (rows of A here).
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
  // ceil-div: number of 16-wide tiles covering the reduction dimension
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the single element this thread stages per tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage one tile of each operand in shared memory; out-of-range entries
    // are zero-padded so the unrolled inner product below needs no guards.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();  // tiles fully staged before any thread reads them
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term inner product over the staged tiles.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();  // all reads done before the tiles are overwritten
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back with GEMM scaling; the beta == 0 path avoids reading
  // (possibly uninitialized) C.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2163 
2164 
2165 
2166 
2167 
2169 
2170 
2171 
2172 
2173 
2174 
// matrix-matrix multiplication C = A * B
// matrix layouts: C...col_major, A...row_major, B...row_major
//
// Tiled GEMM kernel: each 16x16 thread block accumulates one 16x16 tile of
// C = alpha * A * B + beta * C.  Requires blockDim == (16, 16): block_size
// is hard-coded below and the inner product is unrolled exactly 16 times.
// Shared tiles use a row stride of block_size + 1 = 17 (16 * 17 = 272
// entries); the padding column avoids shared-memory bank conflicts.
// NOTE(review): the declarator line was lost in extraction; the name is
// reconstructed from this file's <C>_<A>_<B>_prod_<tA><tB> naming scheme
// - confirm against upstream ViennaCL.
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_AA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ NumericT bufA[272];  // 16 x 17 tile of A (one padding column)
  __shared__ NumericT bufB[272];  // 16 x 17 tile of B (one padding column)

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // Linear offset of this block's first tile in A/B, and the increment that
  // advances one tile along the reduction dimension (columns of A here).
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep = block_size * A_col_inc;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
  // ceil-div: number of 16-wide tiles covering the reduction dimension
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the single element this thread stages per tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage one tile of each operand in shared memory; out-of-range entries
    // are zero-padded so the unrolled inner product below needs no guards.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();  // tiles fully staged before any thread reads them
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term inner product over the staged tiles.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();  // all reads done before the tiles are overwritten
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back (column-major C) with GEMM scaling; the beta == 0 path avoids
  // reading (possibly uninitialized) C.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}
2261 
// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...col_major, A...row_major, B...row_major
//
// Tiled GEMM kernel: each 16x16 thread block accumulates one 16x16 tile of
// C = alpha * A * B^T + beta * C.  Requires blockDim == (16, 16): block_size
// is hard-coded below and the inner product is unrolled exactly 16 times.
// Shared tiles use a row stride of block_size + 1 = 17 (16 * 17 = 272
// entries); the padding column avoids shared-memory bank conflicts.
// NOTE(review): the declarator line was lost in extraction; the name is
// reconstructed from this file's <C>_<A>_<B>_prod_<tA><tB> naming scheme
// - confirm against upstream ViennaCL.
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_AT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ NumericT bufA[272];  // 16 x 17 tile of A (one padding column)
  __shared__ NumericT bufB[272];  // 16 x 17 tile of B (one padding column)

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // Linear offset of this block's first tile in A/B, and the increment that
  // advances one tile along the reduction dimension (columns of A here).
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep = block_size * A_col_inc;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep = block_size * B_col_inc;
  // ceil-div: number of 16-wide tiles covering the reduction dimension
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the single element this thread stages per tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage one tile of each operand in shared memory; out-of-range entries
    // are zero-padded so the unrolled inner product below needs no guards.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();  // tiles fully staged before any thread reads them
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term inner product over the staged tiles.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();  // all reads done before the tiles are overwritten
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back (column-major C) with GEMM scaling; the beta == 0 path avoids
  // reading (possibly uninitialized) C.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}
2348 
// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...col_major, A...row_major, B...row_major
//
// Tiled GEMM kernel: each 16x16 thread block accumulates one 16x16 tile of
// C = alpha * A^T * B + beta * C.  Requires blockDim == (16, 16): block_size
// is hard-coded below and the inner product is unrolled exactly 16 times.
// Shared tiles use a row stride of block_size + 1 = 17 (16 * 17 = 272
// entries); the padding column avoids shared-memory bank conflicts.
// NOTE(review): the declarator line was lost in extraction; the name is
// reconstructed from this file's <C>_<A>_<B>_prod_<tA><tB> naming scheme
// - confirm against upstream ViennaCL.
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ NumericT bufA[272];  // 16 x 17 tile of A (one padding column)
  __shared__ NumericT bufB[272];  // 16 x 17 tile of B (one padding column)

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // Linear offset of this block's first tile in A/B, and the increment that
  // advances one tile along the reduction dimension (rows of A here).
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
  // ceil-div: number of 16-wide tiles covering the reduction dimension
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the single element this thread stages per tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage one tile of each operand in shared memory; out-of-range entries
    // are zero-padded so the unrolled inner product below needs no guards.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();  // tiles fully staged before any thread reads them
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term inner product over the staged tiles.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();  // all reads done before the tiles are overwritten
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back (column-major C) with GEMM scaling; the beta == 0 path avoids
  // reading (possibly uninitialized) C.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}
2435 
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...col_major, A...row_major, B...row_major
//
// Tiled GEMM kernel: each 16x16 thread block accumulates one 16x16 tile of
// C = alpha * A^T * B^T + beta * C.  Requires blockDim == (16, 16):
// block_size is hard-coded below and the inner product is unrolled exactly
// 16 times.  Shared tiles use a row stride of block_size + 1 = 17
// (16 * 17 = 272 entries); the padding column avoids bank conflicts.
// NOTE(review): the declarator line was lost in extraction; the name is
// reconstructed from this file's <C>_<A>_<B>_prod_<tA><tB> naming scheme
// - confirm against upstream ViennaCL.
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ NumericT bufA[272];  // 16 x 17 tile of A (one padding column)
  __shared__ NumericT bufB[272];  // 16 x 17 tile of B (one padding column)

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // Linear offset of this block's first tile in A/B, and the increment that
  // advances one tile along the reduction dimension (rows of A here).
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep = block_size * B_col_inc;
  // ceil-div: number of 16-wide tiles covering the reduction dimension
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the single element this thread stages per tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage one tile of each operand in shared memory; out-of-range entries
    // are zero-padded so the unrolled inner product below needs no guards.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();  // tiles fully staged before any thread reads them
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term inner product over the staged tiles.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();  // all reads done before the tiles are overwritten
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back (column-major C) with GEMM scaling; the beta == 0 path avoids
  // reading (possibly uninitialized) C.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}
2522 
2523 
2524 
2525 
2526 
2528 
2529 
2530 
2531 
// matrix-matrix multiplication C = A * B
// matrix layouts: C...row_major, A...row_major, B...row_major
//
// Tiled GEMM kernel: each 16x16 thread block accumulates one 16x16 tile of
// C = alpha * A * B + beta * C.  Requires blockDim == (16, 16): block_size
// is hard-coded below and the inner product is unrolled exactly 16 times.
// Shared tiles use a row stride of block_size + 1 = 17 (16 * 17 = 272
// entries); the padding column avoids shared-memory bank conflicts.
// NOTE(review): the declarator line was lost in extraction; the name is
// reconstructed from this file's <C>_<A>_<B>_prod_<tA><tB> naming scheme
// - confirm against upstream ViennaCL.
template<typename NumericT>
__global__ void matrix_matrix_row_row_row_prod_AA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ NumericT bufA[272];  // 16 x 17 tile of A (one padding column)
  __shared__ NumericT bufB[272];  // 16 x 17 tile of B (one padding column)

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // Linear offset of this block's first tile in A/B, and the increment that
  // advances one tile along the reduction dimension (columns of A here).
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep = block_size * A_col_inc;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
  // ceil-div: number of 16-wide tiles covering the reduction dimension
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the single element this thread stages per tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage one tile of each operand in shared memory; out-of-range entries
    // are zero-padded so the unrolled inner product below needs no guards.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();  // tiles fully staged before any thread reads them
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term inner product over the staged tiles.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();  // all reads done before the tiles are overwritten
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back (row-major C) with GEMM scaling; the beta == 0 path avoids
  // reading (possibly uninitialized) C.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2618 
// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...row_major, A...row_major, B...row_major
//
// Computes C = alpha * A * B^T + beta * C. When beta == 0 the old value of
// C is never read, so C may be uninitialized in that case.
// Launch configuration: one 16x16 thread block per 16x16 tile of C; thread
// (threadIdx.x, threadIdx.y) produces C(row, col) with
//   row = blockIdx.x * blockDim.x + threadIdx.x   (row of A),
//   col = blockIdx.y * blockDim.y + threadIdx.y   (row of B, i.e. column of B^T).
// NOTE(review): kernel name reconstructed from the documentation index; the
// signature line was lost in extraction — confirm against the upstream file.
template<typename NumericT>
__global__ void matrix_matrix_row_row_row_prod_AT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16 x 17 tiles (272 entries): each logical row of a tile is stored with a
  // stride of block_size + 1, the extra padding entry reducing shared-memory
  // bank conflicts on the strided accesses below.
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is traversed along its rows, B along its rows as well (B enters
  // transposed, so a row of B is a column of B^T).
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep = block_size * A_col_inc;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep = block_size * B_col_inc;
  // Number of 16-wide slabs along the reduction dimension (columns of A).
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
           block < block_num;
         ++block)
  {
    // Stage one 16x16 slab of A and of B into shared memory; out-of-range
    // entries are zero-filled so the unrolled dot product below needs no guards.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term partial dot product for this slab.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    // Second barrier: no thread may overwrite the tile before all have
    // finished reading it.
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  // Guarded scatter of the result; out-of-range threads write nothing.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2705 
// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...row_major, A...row_major, B...row_major
//
// Computes C = alpha * A^T * B + beta * C. When beta == 0 the old value of
// C is never read, so C may be uninitialized in that case.
// Launch configuration: one 16x16 thread block per 16x16 tile of C; thread
// (threadIdx.x, threadIdx.y) produces C(row, col) with
//   row = blockIdx.x * blockDim.x + threadIdx.x   (column of A, i.e. row of A^T),
//   col = blockIdx.y * blockDim.y + threadIdx.y   (column of B).
// NOTE(review): kernel name reconstructed from the documentation index; the
// signature line was lost in extraction — confirm against the upstream file.
template<typename NumericT>
__global__ void matrix_matrix_row_row_row_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16 x 17 tiles (272 entries): each logical row of a tile is stored with a
  // stride of block_size + 1, the extra padding entry reducing shared-memory
  // bank conflicts on the strided accesses below.
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A enters transposed: tile origin moves along A's columns, and the per-slab
  // step advances block_size rows down the (row-major) storage of A.
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
  // Number of 16-wide slabs along the reduction dimension (rows of A).
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
           block < block_num;
         ++block)
  {
    // Stage one 16x16 slab of A (transposed in shared memory) and of B;
    // out-of-range entries are zero-filled so the unrolled dot product below
    // needs no guards.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term partial dot product for this slab.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    // Second barrier: no thread may overwrite the tile before all have
    // finished reading it.
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  // Guarded scatter of the result; out-of-range threads write nothing.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2792 
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...row_major, A...row_major, B...row_major
//
// Computes C = alpha * A^T * B^T + beta * C. When beta == 0 the old value of
// C is never read, so C may be uninitialized in that case.
// Launch configuration: one 16x16 thread block per 16x16 tile of C; thread
// (threadIdx.x, threadIdx.y) produces C(row, col) with
//   row = blockIdx.x * blockDim.x + threadIdx.x   (column of A, i.e. row of A^T),
//   col = blockIdx.y * blockDim.y + threadIdx.y   (row of B, i.e. column of B^T).
// NOTE(review): kernel name reconstructed from the documentation index; the
// signature line was lost in extraction — confirm against the upstream file.
template<typename NumericT>
__global__ void matrix_matrix_row_row_row_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // 16 x 17 tiles (272 entries): each logical row of a tile is stored with a
  // stride of block_size + 1, the extra padding entry reducing shared-memory
  // bank conflicts on the strided accesses below.
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A enters transposed: tile origin moves along A's columns, and the per-slab
  // step advances block_size rows down the (row-major) storage of A.
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
  // B also enters transposed: the tile origin moves along B's rows.
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep = block_size * B_col_inc;
  // Number of 16-wide slabs along the reduction dimension (rows of A).
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
           block < block_num;
         ++block)
  {
    // Stage one 16x16 slab of A (transposed in shared memory) and of B;
    // out-of-range entries are zero-filled so the unrolled dot product below
    // needs no guards.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    // Fully unrolled 16-term partial dot product for this slab.
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    // Second barrier: no thread may overwrite the tile before all have
    // finished reading it.
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  // Guarded scatter of the result; out-of-range threads write nothing.
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2879 
2880 
2881 } // namespace cuda
2882 } //namespace linalg
2883 } //namespace viennacl
2884 
2885 
2886 #endif
__global__ void matrix_matrix_col_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
__global__ void matrix_matrix_col_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
std::size_t vcl_size_t
Definition: forwards.h:74
__global__ void matrix_matrix_row_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// GEMM-style kernel: parameter list (alpha, A..., B..., beta, C...) matches C = alpha * op(A) * op(B) + beta * C.
// Per this file's kernel-name convention: layouts C...col_major, A...row_major, B...col_major; suffix _AT
// presumably means A as-is and B transposed — confirm against the kernel body (not visible in this chunk).
// Start/inc/size parameters describe the submatrix ranges; internal_rows/cols are the padded storage dimensions.
__global__ void matrix_matrix_col_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// GEMM-style kernel: parameter list (alpha, A..., B..., beta, C...) matches C = alpha * op(A) * op(B) + beta * C.
// Per this file's kernel-name convention: layouts C...col_major, A...row_major, B...row_major; suffix _TT
// presumably means both A and B transposed — confirm against the kernel body (not visible in this chunk).
// Start/inc/size parameters describe the submatrix ranges; internal_rows/cols are the padded storage dimensions.
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// GEMM-style kernel: parameter list (alpha, A..., B..., beta, C...) matches C = alpha * op(A) * op(B) + beta * C.
// Per this file's kernel-name convention (layouts documented at the top of the file for the col_col_col case):
// C...col_major, A...col_major, B...col_major; suffix _TA presumably means A transposed and B as-is —
// confirm against the kernel body, which is not visible in this chunk.
__global__ void matrix_matrix_col_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// GEMM-style kernel: parameter list (alpha, A..., B..., beta, C...) matches C = alpha * op(A) * op(B) + beta * C.
// Per this file's kernel-name convention: layouts C...col_major, A...row_major, B...col_major; suffix _AA
// presumably means neither operand transposed (plain C = A * B, as documented for the file's first kernel) —
// confirm against the kernel body, which is not visible in this chunk.
__global__ void matrix_matrix_col_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// GEMM-style kernel: parameter list (alpha, A..., B..., beta, C...) matches C = alpha * op(A) * op(B) + beta * C.
// Per this file's kernel-name convention: layouts C...row_major, A...col_major, B...row_major; suffix _TA
// presumably means A transposed and B as-is — confirm against the kernel body (not visible in this chunk).
// Start/inc/size parameters describe the submatrix ranges; internal_rows/cols are the padded storage dimensions.
__global__ void matrix_matrix_row_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
// GEMM-style kernel: parameter list (alpha, A..., B..., beta, C...) matches C = alpha * op(A) * op(B) + beta * C.
// Per this file's kernel-name convention: layouts C...row_major, A...col_major, B...row_major; suffix _AT
// presumably means A as-is and B transposed — confirm against the kernel body (not visible in this chunk).
// Start/inc/size parameters describe the submatrix ranges; internal_rows/cols are the padded storage dimensions.
__global__ void matrix_matrix_row_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)