ViennaCL - The Vienna Computing Library  1.6.0
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
matrix_operations_row.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
26 namespace viennacl
27 {
28 namespace linalg
29 {
30 namespace cuda
31 {
32 
33 inline __device__ unsigned int row_major_index(unsigned int row,unsigned int col, unsigned int /*num_rows*/,unsigned int num_cols)
34 {
35  return row*num_cols+col;
36 }
37 inline __device__ unsigned int col_major_index(unsigned int row,unsigned int col, unsigned int num_rows, unsigned int /*num_cols*/)
38 {
39  return row+col*num_rows;
40 }
41 //Matrix transpose kernel
42 template<typename NumericT>
43 __global__ void trans_kernel(
44  const NumericT * A,
45  unsigned int A_start1, unsigned int A_start2,
46  unsigned int A_internal_size1, unsigned int A_internal_size2,
47  unsigned int A_size1, unsigned int A_size2,
48  unsigned int A_stride1, unsigned int A_stride2,
49 
50  NumericT * B,
51  unsigned int B_start1, unsigned int B_start2,
52  unsigned int B_internal_size1, unsigned int B_internal_size2,
53  unsigned int B_stride1, unsigned int B_stride2,
54  bool data_major)
55 {
56  unsigned int size = A_internal_size2*A_internal_size1;
57  for(unsigned int i = blockIdx.x; i<size/gridDim.x; i+=gridDim.x)
58  {
59  unsigned int matrix_index = i * blockDim.x + threadIdx.x;
60 
61  unsigned int row = matrix_index / A_internal_size2;
62  unsigned int col = matrix_index % A_internal_size2;
63  if (row < A_size1 && col < A_size2)
64  {
65  if(data_major)
66  {
67  unsigned int pos = row_major_index(A_start1 + A_stride1 * row,
68  A_start2 + A_stride2 * col,
69  A_internal_size1, A_internal_size2);
70  unsigned int new_pos = row_major_index(B_start2 + B_stride2 * col,
71  B_start1 + B_stride1 * row,
72  B_internal_size1, B_internal_size2);
73  B[new_pos] = A[pos];
74 
75  } else
76  {
77  unsigned int pos = col_major_index(A_start1 + A_stride1 * row,
78  A_start2 + A_stride2 * col,
79  A_internal_size1, A_internal_size2);
80  unsigned int new_pos = col_major_index(B_start2 + B_stride2 * col,
81  B_start1 + B_stride1 * row,
82  B_internal_size1, B_internal_size2);
83  B[new_pos] = A[pos];
84 
85  }
86  }
87  }
88 }
89 
90 //
91 // am
92 //
93 
94 // alpha on CPU
95 template<typename NumericT>
96 __global__ void am_row_kernel(
97  NumericT * A,
98  unsigned int A_start1, unsigned int A_start2,
99  unsigned int A_inc1, unsigned int A_inc2,
100  unsigned int A_size1, unsigned int A_size2,
101  unsigned int A_internal_size1, unsigned int A_internal_size2,
102 
103  NumericT fac2,
104  unsigned int options2,
105  const NumericT * B,
106  unsigned int B_start1, unsigned int B_start2,
107  unsigned int B_inc1, unsigned int B_inc2,
108  unsigned int B_internal_size1, unsigned int B_internal_size2)
109 {
110  NumericT alpha = fac2;
111  if (options2 & (1 << 0))
112  alpha = -alpha;
113 
114  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
115  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
116 
117  if (options2 & (1 << 1))
118  {
119  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
120  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
121  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
122  }
123  else
124  {
125  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
126  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
127  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
128  }
129 }
130 
131 // alpha on GPU
132 template<typename NumericT>
133 __global__ void am_row_kernel(
134  NumericT * A,
135  unsigned int A_start1, unsigned int A_start2,
136  unsigned int A_inc1, unsigned int A_inc2,
137  unsigned int A_size1, unsigned int A_size2,
138  unsigned int A_internal_size1, unsigned int A_internal_size2,
139 
140  const NumericT * fac2,
141  unsigned int options2,
142  const NumericT * B,
143  unsigned int B_start1, unsigned int B_start2,
144  unsigned int B_inc1, unsigned int B_inc2,
145  unsigned int B_internal_size1, unsigned int B_internal_size2)
146 {
147  NumericT alpha = *fac2;
148  if (options2 & (1 << 0))
149  alpha = -alpha;
150 
151  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
152  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
153 
154  if (options2 & (1 << 1))
155  {
156  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
157  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
158  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
159  }
160  else
161  {
162  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
163  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
164  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
165  }
166 }
167 
168 
169 //
170 // ambm
171 //
172 
173 // alpha and beta on CPU
174 template<typename NumericT>
175 __global__ void ambm_row_kernel(
176  NumericT * A,
177  unsigned int A_start1, unsigned int A_start2,
178  unsigned int A_inc1, unsigned int A_inc2,
179  unsigned int A_size1, unsigned int A_size2,
180  unsigned int A_internal_size1, unsigned int A_internal_size2,
181 
182  NumericT fac2,
183  unsigned int options2,
184  const NumericT * B,
185  unsigned int B_start1, unsigned int B_start2,
186  unsigned int B_inc1, unsigned int B_inc2,
187  unsigned int B_internal_size1, unsigned int B_internal_size2,
188 
189  NumericT fac3,
190  unsigned int options3,
191  const NumericT * C,
192  unsigned int C_start1, unsigned int C_start2,
193  unsigned int C_inc1, unsigned int C_inc2,
194  unsigned int C_internal_size1, unsigned int C_internal_size2)
195 {
196  NumericT alpha = fac2;
197  if (options2 & (1 << 0))
198  alpha = -alpha;
199 
200  NumericT beta = fac3;
201  if (options3 & (1 << 0))
202  beta = -beta;
203 
204  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
205  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
206 
207  if (options2 & (1 << 1))
208  {
209  if (options3 & (1 << 1))
210  {
211  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
212  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
213  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
214  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
215  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
216  }
217  else
218  {
219  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
220  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
221  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
222  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
223  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
224  }
225  }
226  else
227  {
228  if (options3 & (1 << 1))
229  {
230  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
231  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
232  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
233  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
234  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
235  }
236  else
237  {
238  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
239  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
240  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
241  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
242  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
243  }
244  }
245 }
246 
247 
248 // alpha on CPU, beta on GPU
249 template<typename NumericT>
250 __global__ void ambm_row_kernel(
251  NumericT * A,
252  unsigned int A_start1, unsigned int A_start2,
253  unsigned int A_inc1, unsigned int A_inc2,
254  unsigned int A_size1, unsigned int A_size2,
255  unsigned int A_internal_size1, unsigned int A_internal_size2,
256 
257  NumericT fac2,
258  unsigned int options2,
259  const NumericT * B,
260  unsigned int B_start1, unsigned int B_start2,
261  unsigned int B_inc1, unsigned int B_inc2,
262  unsigned int B_internal_size1, unsigned int B_internal_size2,
263 
264  const NumericT * fac3,
265  unsigned int options3,
266  const NumericT * C,
267  unsigned int C_start1, unsigned int C_start2,
268  unsigned int C_inc1, unsigned int C_inc2,
269  unsigned int C_internal_size1, unsigned int C_internal_size2)
270 {
271  NumericT alpha = fac2;
272  if (options2 & (1 << 0))
273  alpha = -alpha;
274 
275  NumericT beta = *fac3;
276  if (options3 & (1 << 0))
277  beta = -beta;
278 
279  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
280  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
281 
282  if (options2 & (1 << 1))
283  {
284  if (options3 & (1 << 1))
285  {
286  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
287  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
288  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
289  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
290  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
291  }
292  else
293  {
294  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
295  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
296  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
297  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
298  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
299  }
300  }
301  else
302  {
303  if (options3 & (1 << 1))
304  {
305  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
306  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
307  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
308  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
309  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
310  }
311  else
312  {
313  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
314  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
315  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
316  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
317  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
318  }
319  }
320 }
321 
322 // alpha on GPU, beta on CPU
323 template<typename NumericT>
324 __global__ void ambm_row_kernel(
325  NumericT * A,
326  unsigned int A_start1, unsigned int A_start2,
327  unsigned int A_inc1, unsigned int A_inc2,
328  unsigned int A_size1, unsigned int A_size2,
329  unsigned int A_internal_size1, unsigned int A_internal_size2,
330 
331  const NumericT * fac2,
332  unsigned int options2,
333  const NumericT * B,
334  unsigned int B_start1, unsigned int B_start2,
335  unsigned int B_inc1, unsigned int B_inc2,
336  unsigned int B_internal_size1, unsigned int B_internal_size2,
337 
338  NumericT fac3,
339  unsigned int options3,
340  const NumericT * C,
341  unsigned int C_start1, unsigned int C_start2,
342  unsigned int C_inc1, unsigned int C_inc2,
343  unsigned int C_internal_size1, unsigned int C_internal_size2)
344 {
345  NumericT alpha = *fac2;
346  if (options2 & (1 << 0))
347  alpha = -alpha;
348 
349  NumericT beta = fac3;
350  if (options3 & (1 << 0))
351  beta = -beta;
352 
353  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
354  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
355 
356  if (options2 & (1 << 1))
357  {
358  if (options3 & (1 << 1))
359  {
360  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
361  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
362  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
363  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
364  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
365  }
366  else
367  {
368  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
369  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
370  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
371  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
372  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
373  }
374  }
375  else
376  {
377  if (options3 & (1 << 1))
378  {
379  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
380  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
381  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
382  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
383  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
384  }
385  else
386  {
387  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
388  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
389  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
390  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
391  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
392  }
393  }
394 }
395 
396 
397 // alpha and beta on GPU
398 template<typename NumericT>
399 __global__ void ambm_row_kernel(
400  NumericT * A,
401  unsigned int A_start1, unsigned int A_start2,
402  unsigned int A_inc1, unsigned int A_inc2,
403  unsigned int A_size1, unsigned int A_size2,
404  unsigned int A_internal_size1, unsigned int A_internal_size2,
405 
406  const NumericT * fac2,
407  unsigned int options2,
408  const NumericT * B,
409  unsigned int B_start1, unsigned int B_start2,
410  unsigned int B_inc1, unsigned int B_inc2,
411  unsigned int B_internal_size1, unsigned int B_internal_size2,
412 
413  const NumericT * fac3,
414  unsigned int options3,
415  const NumericT * C,
416  unsigned int C_start1, unsigned int C_start2,
417  unsigned int C_inc1, unsigned int C_inc2,
418  unsigned int C_internal_size1, unsigned int C_internal_size2)
419 {
420  NumericT alpha = *fac2;
421  if (options2 & (1 << 0))
422  alpha = -alpha;
423 
424  NumericT beta = *fac3;
425  if (options3 & (1 << 0))
426  beta = -beta;
427 
428  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
429  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
430 
431  if (options2 & (1 << 1))
432  {
433  if (options3 & (1 << 1))
434  {
435  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
436  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
437  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
438  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
439  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
440  }
441  else
442  {
443  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
444  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
445  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
446  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
447  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
448  }
449  }
450  else
451  {
452  if (options3 & (1 << 1))
453  {
454  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
455  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
456  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
457  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
458  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
459  }
460  else
461  {
462  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
463  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
464  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
465  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
466  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
467  }
468  }
469 }
470 
471 
472 //
473 // ambm_m
474 //
475 
476 // alpha and beta on CPU
477 template<typename NumericT>
478 __global__ void ambm_m_row_kernel(
479  NumericT * A,
480  unsigned int A_start1, unsigned int A_start2,
481  unsigned int A_inc1, unsigned int A_inc2,
482  unsigned int A_size1, unsigned int A_size2,
483  unsigned int A_internal_size1, unsigned int A_internal_size2,
484 
485  NumericT fac2,
486  unsigned int options2,
487  const NumericT * B,
488  unsigned int B_start1, unsigned int B_start2,
489  unsigned int B_inc1, unsigned int B_inc2,
490  unsigned int B_internal_size1, unsigned int B_internal_size2,
491 
492  NumericT fac3,
493  unsigned int options3,
494  const NumericT * C,
495  unsigned int C_start1, unsigned int C_start2,
496  unsigned int C_inc1, unsigned int C_inc2,
497  unsigned int C_internal_size1, unsigned int C_internal_size2)
498 {
499  NumericT alpha = fac2;
500  if (options2 & (1 << 0))
501  alpha = -alpha;
502 
503  NumericT beta = fac3;
504  if (options3 & (1 << 0))
505  beta = -beta;
506 
507  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
508  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
509 
510  if (options2 & (1 << 1))
511  {
512  if (options3 & (1 << 1))
513  {
514  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
515  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
516  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
517  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
518  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
519  }
520  else
521  {
522  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
523  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
524  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
525  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
526  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
527  }
528  }
529  else
530  {
531  if (options3 & (1 << 1))
532  {
533  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
534  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
535  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
536  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
537  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
538  }
539  else
540  {
541  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
542  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
543  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
544  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
545  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
546  }
547  }
548 }
549 
550 
551 // alpha on CPU, beta on GPU
552 template<typename NumericT>
553 __global__ void ambm_m_row_kernel(
554  NumericT * A,
555  unsigned int A_start1, unsigned int A_start2,
556  unsigned int A_inc1, unsigned int A_inc2,
557  unsigned int A_size1, unsigned int A_size2,
558  unsigned int A_internal_size1, unsigned int A_internal_size2,
559 
560  NumericT fac2,
561  unsigned int options2,
562  const NumericT * B,
563  unsigned int B_start1, unsigned int B_start2,
564  unsigned int B_inc1, unsigned int B_inc2,
565  unsigned int B_internal_size1, unsigned int B_internal_size2,
566 
567  const NumericT * fac3,
568  unsigned int options3,
569  const NumericT * C,
570  unsigned int C_start1, unsigned int C_start2,
571  unsigned int C_inc1, unsigned int C_inc2,
572  unsigned int C_internal_size1, unsigned int C_internal_size2)
573 {
574  NumericT alpha = fac2;
575  if (options2 & (1 << 0))
576  alpha = -alpha;
577 
578  NumericT beta = *fac3;
579  if (options3 & (1 << 0))
580  beta = -beta;
581 
582  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
583  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
584 
585  if (options2 & (1 << 1))
586  {
587  if (options3 & (1 << 1))
588  {
589  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
590  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
591  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
592  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
593  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
594  }
595  else
596  {
597  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
598  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
599  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
600  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
601  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
602  }
603  }
604  else
605  {
606  if (options3 & (1 << 1))
607  {
608  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
609  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
610  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
611  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
612  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
613  }
614  else
615  {
616  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
617  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
618  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
619  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
620  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
621  }
622  }
623 }
624 
625 // alpha on GPU, beta on CPU
626 template<typename NumericT>
627 __global__ void ambm_m_row_kernel(
628  NumericT * A,
629  unsigned int A_start1, unsigned int A_start2,
630  unsigned int A_inc1, unsigned int A_inc2,
631  unsigned int A_size1, unsigned int A_size2,
632  unsigned int A_internal_size1, unsigned int A_internal_size2,
633 
634  const NumericT * fac2,
635  unsigned int options2,
636  const NumericT * B,
637  unsigned int B_start1, unsigned int B_start2,
638  unsigned int B_inc1, unsigned int B_inc2,
639  unsigned int B_internal_size1, unsigned int B_internal_size2,
640 
641  NumericT fac3,
642  unsigned int options3,
643  const NumericT * C,
644  unsigned int C_start1, unsigned int C_start2,
645  unsigned int C_inc1, unsigned int C_inc2,
646  unsigned int C_internal_size1, unsigned int C_internal_size2)
647 {
648  NumericT alpha = *fac2;
649  if (options2 & (1 << 0))
650  alpha = -alpha;
651 
652  NumericT beta = fac3;
653  if (options3 & (1 << 0))
654  beta = -beta;
655 
656  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
657  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
658 
659  if (options2 & (1 << 1))
660  {
661  if (options3 & (1 << 1))
662  {
663  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
664  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
665  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
666  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
667  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
668  }
669  else
670  {
671  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
672  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
673  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
674  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
675  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
676  }
677  }
678  else
679  {
680  if (options3 & (1 << 1))
681  {
682  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
683  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
684  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
685  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
686  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
687  }
688  else
689  {
690  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
691  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
692  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
693  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
694  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
695  }
696  }
697 }
698 
699 
700 // alpha and beta on GPU
701 template<typename NumericT>
702 __global__ void ambm_m_row_kernel(
703  NumericT * A,
704  unsigned int A_start1, unsigned int A_start2,
705  unsigned int A_inc1, unsigned int A_inc2,
706  unsigned int A_size1, unsigned int A_size2,
707  unsigned int A_internal_size1, unsigned int A_internal_size2,
708 
709  const NumericT * fac2,
710  unsigned int options2,
711  const NumericT * B,
712  unsigned int B_start1, unsigned int B_start2,
713  unsigned int B_inc1, unsigned int B_inc2,
714  unsigned int B_internal_size1, unsigned int B_internal_size2,
715 
716  const NumericT * fac3,
717  unsigned int options3,
718  const NumericT * C,
719  unsigned int C_start1, unsigned int C_start2,
720  unsigned int C_inc1, unsigned int C_inc2,
721  unsigned int C_internal_size1, unsigned int C_internal_size2)
722 {
723  NumericT alpha = *fac2;
724  if (options2 & (1 << 0))
725  alpha = -alpha;
726 
727  NumericT beta = *fac3;
728  if (options3 & (1 << 0))
729  beta = -beta;
730 
731  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
732  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
733 
734  if (options2 & (1 << 1))
735  {
736  if (options3 & (1 << 1))
737  {
738  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
739  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
740  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
741  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
742  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
743  }
744  else
745  {
746  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
747  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
748  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
749  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
750  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
751  }
752  }
753  else
754  {
755  if (options3 & (1 << 1))
756  {
757  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
758  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
759  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
760  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
761  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
762  }
763  else
764  {
765  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
766  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
767  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
768  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
769  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
770  }
771  }
772 }
773 
774 //
775 // assignments
776 //
777 
778 template<typename NumericT>
779 __global__ void matrix_row_assign_kernel(
780  NumericT * A,
781  unsigned int A_start1, unsigned int A_start2,
782  unsigned int A_inc1, unsigned int A_inc2,
783  unsigned int A_size1, unsigned int A_size2,
784  unsigned int A_internal_size1, unsigned int A_internal_size2,
785  NumericT alpha)
786 {
787  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
788  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
789 
790  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
791  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
792  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
793 }
794 
795 
796 template<typename NumericT>
798  NumericT * A,
799  unsigned int A_start1, unsigned int A_start2,
800  unsigned int A_inc1, unsigned int A_inc2,
801  unsigned int A_size1, unsigned int A_size2,
802  unsigned int A_internal_size1, unsigned int A_internal_size2,
803  NumericT alpha)
804 {
805  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
806 
807  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
808  A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] = alpha;
809 }
810 
811 //
812 // binary element-wise operations
813 //
814 
815 template<typename NumericT>
816 __global__ void element_op_row_kernel(
817  NumericT * A,
818  unsigned int A_start1, unsigned int A_start2,
819  unsigned int A_inc1, unsigned int A_inc2,
820  unsigned int A_size1, unsigned int A_size2,
821  unsigned int A_internal_size1, unsigned int A_internal_size2,
822 
823  const NumericT * B,
824  unsigned int B_start1, unsigned int B_start2,
825  unsigned int B_inc1, unsigned int B_inc2,
826  unsigned int B_internal_size1, unsigned int B_internal_size2,
827 
828  const NumericT * C,
829  unsigned int C_start1, unsigned int C_start2,
830  unsigned int C_inc1, unsigned int C_inc2,
831  unsigned int C_internal_size1, unsigned int C_internal_size2,
832 
833  unsigned int op_type) //0: product, 1: division, 2: pow
834 {
835  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
836  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
837 
838  if (op_type == 2)
839  {
840  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
841  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
842  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
843  = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
844  C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
845  }
846  else if (op_type == 1)
847  {
848  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
849  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
850  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
851  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
852  / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
853  }
854  else if (op_type == 0)
855  {
856  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
857  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
858  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
859  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
860  * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
861  }
862 }
863 
864 template<typename NumericT>
866  NumericT * A,
867  unsigned int A_start1, unsigned int A_start2,
868  unsigned int A_inc1, unsigned int A_inc2,
869  unsigned int A_size1, unsigned int A_size2,
870  unsigned int A_internal_size1, unsigned int A_internal_size2,
871 
872  const NumericT * B,
873  unsigned int B_start1, unsigned int B_start2,
874  unsigned int B_inc1, unsigned int B_inc2,
875  unsigned int B_internal_size1, unsigned int B_internal_size2,
876 
877  const NumericT * C,
878  unsigned int C_start1, unsigned int C_start2,
879  unsigned int C_inc1, unsigned int C_inc2,
880  unsigned int C_internal_size1, unsigned int C_internal_size2,
881 
882  unsigned int op_type) //0: product, 1: division, 2: pow
883 {
884  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
885  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
886 
887  if (op_type == 1)
888  {
889  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
890  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
891  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
892  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
893  / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
894  }
895  else if (op_type == 0)
896  {
897  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
898  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
899  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
900  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
901  * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
902  }
903 }
904 
905 //
906 // unary element-wise operations
907 //
908 
909 // abs
910 template<typename NumericT>
912  NumericT * A,
913  unsigned int A_start1, unsigned int A_start2,
914  unsigned int A_inc1, unsigned int A_inc2,
915  unsigned int A_size1, unsigned int A_size2,
916  unsigned int A_internal_size1, unsigned int A_internal_size2,
917 
918  const NumericT * B,
919  unsigned int B_start1, unsigned int B_start2,
920  unsigned int B_inc1, unsigned int B_inc2,
921  unsigned int B_internal_size1, unsigned int B_internal_size2)
922 {
923  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
924  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
925 
926  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
927  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
928  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
929 }
930 
931 
932 // acos
933 template<typename NumericT>
935  NumericT * A,
936  unsigned int A_start1, unsigned int A_start2,
937  unsigned int A_inc1, unsigned int A_inc2,
938  unsigned int A_size1, unsigned int A_size2,
939  unsigned int A_internal_size1, unsigned int A_internal_size2,
940 
941  const NumericT * B,
942  unsigned int B_start1, unsigned int B_start2,
943  unsigned int B_inc1, unsigned int B_inc2,
944  unsigned int B_internal_size1, unsigned int B_internal_size2)
945 {
946  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
947  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
948 
949  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
950  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
951  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
952 }
953 
954 
955 // asin
956 template<typename NumericT>
958  NumericT * A,
959  unsigned int A_start1, unsigned int A_start2,
960  unsigned int A_inc1, unsigned int A_inc2,
961  unsigned int A_size1, unsigned int A_size2,
962  unsigned int A_internal_size1, unsigned int A_internal_size2,
963 
964  const NumericT * B,
965  unsigned int B_start1, unsigned int B_start2,
966  unsigned int B_inc1, unsigned int B_inc2,
967  unsigned int B_internal_size1, unsigned int B_internal_size2)
968 {
969  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
970  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
971 
972  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
973  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
974  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
975 }
976 
977 
978 // atan
979 template<typename NumericT>
981  NumericT * A,
982  unsigned int A_start1, unsigned int A_start2,
983  unsigned int A_inc1, unsigned int A_inc2,
984  unsigned int A_size1, unsigned int A_size2,
985  unsigned int A_internal_size1, unsigned int A_internal_size2,
986 
987  const NumericT * B,
988  unsigned int B_start1, unsigned int B_start2,
989  unsigned int B_inc1, unsigned int B_inc2,
990  unsigned int B_internal_size1, unsigned int B_internal_size2)
991 {
992  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
993  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
994 
995  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
996  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
997  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
998 }
999 
1000 
1001 // ceil
1002 template<typename NumericT>
1004  NumericT * A,
1005  unsigned int A_start1, unsigned int A_start2,
1006  unsigned int A_inc1, unsigned int A_inc2,
1007  unsigned int A_size1, unsigned int A_size2,
1008  unsigned int A_internal_size1, unsigned int A_internal_size2,
1009 
1010  const NumericT * B,
1011  unsigned int B_start1, unsigned int B_start2,
1012  unsigned int B_inc1, unsigned int B_inc2,
1013  unsigned int B_internal_size1, unsigned int B_internal_size2)
1014 {
1015  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1016  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1017 
1018  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1019  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1020  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1021 }
1022 
1023 
1024 // cos
1025 template<typename NumericT>
1027  NumericT * A,
1028  unsigned int A_start1, unsigned int A_start2,
1029  unsigned int A_inc1, unsigned int A_inc2,
1030  unsigned int A_size1, unsigned int A_size2,
1031  unsigned int A_internal_size1, unsigned int A_internal_size2,
1032 
1033  const NumericT * B,
1034  unsigned int B_start1, unsigned int B_start2,
1035  unsigned int B_inc1, unsigned int B_inc2,
1036  unsigned int B_internal_size1, unsigned int B_internal_size2)
1037 {
1038  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1039  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1040 
1041  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1042  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1043  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1044 }
1045 
1046 
1047 // cosh
1048 template<typename NumericT>
1050  NumericT * A,
1051  unsigned int A_start1, unsigned int A_start2,
1052  unsigned int A_inc1, unsigned int A_inc2,
1053  unsigned int A_size1, unsigned int A_size2,
1054  unsigned int A_internal_size1, unsigned int A_internal_size2,
1055 
1056  const NumericT * B,
1057  unsigned int B_start1, unsigned int B_start2,
1058  unsigned int B_inc1, unsigned int B_inc2,
1059  unsigned int B_internal_size1, unsigned int B_internal_size2)
1060 {
1061  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1062  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1063 
1064  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1065  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1066  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1067 }
1068 
1069 
1070 // exp
1071 template<typename NumericT>
1073  NumericT * A,
1074  unsigned int A_start1, unsigned int A_start2,
1075  unsigned int A_inc1, unsigned int A_inc2,
1076  unsigned int A_size1, unsigned int A_size2,
1077  unsigned int A_internal_size1, unsigned int A_internal_size2,
1078 
1079  const NumericT * B,
1080  unsigned int B_start1, unsigned int B_start2,
1081  unsigned int B_inc1, unsigned int B_inc2,
1082  unsigned int B_internal_size1, unsigned int B_internal_size2)
1083 {
1084  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1085  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1086 
1087  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1088  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1089  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1090 }
1091 
1092 
1093 // fabs
1094 template<typename NumericT>
1096  NumericT * A,
1097  unsigned int A_start1, unsigned int A_start2,
1098  unsigned int A_inc1, unsigned int A_inc2,
1099  unsigned int A_size1, unsigned int A_size2,
1100  unsigned int A_internal_size1, unsigned int A_internal_size2,
1101 
1102  const NumericT * B,
1103  unsigned int B_start1, unsigned int B_start2,
1104  unsigned int B_inc1, unsigned int B_inc2,
1105  unsigned int B_internal_size1, unsigned int B_internal_size2)
1106 {
1107  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1108  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1109 
1110  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1111  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1112  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1113 }
1114 
1115 
1116 // floor
1117 template<typename NumericT>
1119  NumericT * A,
1120  unsigned int A_start1, unsigned int A_start2,
1121  unsigned int A_inc1, unsigned int A_inc2,
1122  unsigned int A_size1, unsigned int A_size2,
1123  unsigned int A_internal_size1, unsigned int A_internal_size2,
1124 
1125  const NumericT * B,
1126  unsigned int B_start1, unsigned int B_start2,
1127  unsigned int B_inc1, unsigned int B_inc2,
1128  unsigned int B_internal_size1, unsigned int B_internal_size2)
1129 {
1130  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1131  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1132 
1133  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1134  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1135  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1136 }
1137 
1138 
1139 // log
1140 template<typename NumericT>
1142  NumericT * A,
1143  unsigned int A_start1, unsigned int A_start2,
1144  unsigned int A_inc1, unsigned int A_inc2,
1145  unsigned int A_size1, unsigned int A_size2,
1146  unsigned int A_internal_size1, unsigned int A_internal_size2,
1147 
1148  const NumericT * B,
1149  unsigned int B_start1, unsigned int B_start2,
1150  unsigned int B_inc1, unsigned int B_inc2,
1151  unsigned int B_internal_size1, unsigned int B_internal_size2)
1152 {
1153  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1154  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1155 
1156  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1157  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1158  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1159 }
1160 
1161 
1162 // log10
1163 template<typename NumericT>
1165  NumericT * A,
1166  unsigned int A_start1, unsigned int A_start2,
1167  unsigned int A_inc1, unsigned int A_inc2,
1168  unsigned int A_size1, unsigned int A_size2,
1169  unsigned int A_internal_size1, unsigned int A_internal_size2,
1170 
1171  const NumericT * B,
1172  unsigned int B_start1, unsigned int B_start2,
1173  unsigned int B_inc1, unsigned int B_inc2,
1174  unsigned int B_internal_size1, unsigned int B_internal_size2)
1175 {
1176  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1177  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1178 
1179  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1180  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1181  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1182 }
1183 
1184 
1185 // sin
1186 template<typename NumericT>
1188  NumericT * A,
1189  unsigned int A_start1, unsigned int A_start2,
1190  unsigned int A_inc1, unsigned int A_inc2,
1191  unsigned int A_size1, unsigned int A_size2,
1192  unsigned int A_internal_size1, unsigned int A_internal_size2,
1193 
1194  const NumericT * B,
1195  unsigned int B_start1, unsigned int B_start2,
1196  unsigned int B_inc1, unsigned int B_inc2,
1197  unsigned int B_internal_size1, unsigned int B_internal_size2)
1198 {
1199  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1200  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1201 
1202  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1203  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1204  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1205 }
1206 
1207 
1208 // sinh
1209 template<typename NumericT>
1211  NumericT * A,
1212  unsigned int A_start1, unsigned int A_start2,
1213  unsigned int A_inc1, unsigned int A_inc2,
1214  unsigned int A_size1, unsigned int A_size2,
1215  unsigned int A_internal_size1, unsigned int A_internal_size2,
1216 
1217  const NumericT * B,
1218  unsigned int B_start1, unsigned int B_start2,
1219  unsigned int B_inc1, unsigned int B_inc2,
1220  unsigned int B_internal_size1, unsigned int B_internal_size2)
1221 {
1222  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1223  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1224 
1225  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1226  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1227  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1228 }
1229 
1230 
1231 // sqrt
1232 template<typename NumericT>
1234  NumericT * A,
1235  unsigned int A_start1, unsigned int A_start2,
1236  unsigned int A_inc1, unsigned int A_inc2,
1237  unsigned int A_size1, unsigned int A_size2,
1238  unsigned int A_internal_size1, unsigned int A_internal_size2,
1239 
1240  const NumericT * B,
1241  unsigned int B_start1, unsigned int B_start2,
1242  unsigned int B_inc1, unsigned int B_inc2,
1243  unsigned int B_internal_size1, unsigned int B_internal_size2)
1244 {
1245  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1246  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1247 
1248  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1249  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1250  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1251 }
1252 
1253 
1254 // tan
1255 template<typename NumericT>
1257  NumericT * A,
1258  unsigned int A_start1, unsigned int A_start2,
1259  unsigned int A_inc1, unsigned int A_inc2,
1260  unsigned int A_size1, unsigned int A_size2,
1261  unsigned int A_internal_size1, unsigned int A_internal_size2,
1262 
1263  const NumericT * B,
1264  unsigned int B_start1, unsigned int B_start2,
1265  unsigned int B_inc1, unsigned int B_inc2,
1266  unsigned int B_internal_size1, unsigned int B_internal_size2)
1267 {
1268  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1269  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1270 
1271  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1272  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1273  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1274 }
1275 
1276 
1277 // tanh
1278 template<typename NumericT>
1280  NumericT * A,
1281  unsigned int A_start1, unsigned int A_start2,
1282  unsigned int A_inc1, unsigned int A_inc2,
1283  unsigned int A_size1, unsigned int A_size2,
1284  unsigned int A_internal_size1, unsigned int A_internal_size2,
1285 
1286  const NumericT * B,
1287  unsigned int B_start1, unsigned int B_start2,
1288  unsigned int B_inc1, unsigned int B_inc2,
1289  unsigned int B_internal_size1, unsigned int B_internal_size2)
1290 {
1291  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1292  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1293 
1294  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1295  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1296  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1297 }
1298 
1299 
1300 
1301 //
1302 // matrix-vector product
1303 //
1304 
1305 template<typename NumericT>
1306 __global__ void vec_mul_row_kernel(
1307  const NumericT * A,
1308  unsigned int A_row_start,
1309  unsigned int A_col_start,
1310  unsigned int A_row_inc,
1311  unsigned int A_col_inc,
1312  unsigned int A_row_size,
1313  unsigned int A_col_size,
1314  unsigned int A_internal_rows,
1315  unsigned int A_internal_cols,
1316  const NumericT * v,
1317  unsigned int v_start,
1318  unsigned int v_inc,
1319  unsigned int v_size,
1320  NumericT * result,
1321  unsigned int result_start,
1322  unsigned int result_inc,
1323  unsigned int result_size)
1324 {
1325  __shared__ NumericT work[128];
1326 
1327  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1328  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1329  unsigned int lid = threadIdx.x;
1330 
1331  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
1332  {
1333  NumericT dot_prod = 0;
1334  for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
1335  dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
1336  work[lid] = dot_prod;
1337 
1338  for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
1339  __syncthreads();
1340  if (lid < stride)
1341  work[lid] += work[lid+stride];
1342  }
1343 
1344  if (lid == 0)
1345  result[row * result_inc + result_start] = work[0];
1346  }
1347 }
1348 
1349 
1350 template<typename NumericT>
1352  const NumericT * A,
1353  unsigned int A_row_start,
1354  unsigned int A_col_start,
1355  unsigned int A_row_inc,
1356  unsigned int A_col_inc,
1357  unsigned int A_row_size,
1358  unsigned int A_col_size,
1359  unsigned int A_internal_rows,
1360  unsigned int A_internal_cols,
1361  const NumericT * v,
1362  unsigned int v_start,
1363  unsigned int v_inc,
1364  unsigned int v_size,
1365  NumericT * result,
1366  unsigned int result_start,
1367  unsigned int result_inc,
1368  unsigned int result_size)
1369 {
1370  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
1371  {
1372  NumericT dot_prod = 0;
1373  for (unsigned int col = 0; col < A_row_size; ++col)
1374  dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
1375  result[row * result_inc + result_start] = dot_prod;
1376  }
1377 }
1378 
1379 
1380 //
1381 // matrix-matrix products
1382 //
1383 
1384 
1385 
1386 
1387 //
1388 // scaled rank-1-update
1389 //
1390 
1391 // alpha on CPU
1392 template<typename NumericT>
1394  NumericT * A,
1395  unsigned int A_start1, unsigned int A_start2,
1396  unsigned int A_inc1, unsigned int A_inc2,
1397  unsigned int A_size1, unsigned int A_size2,
1398  unsigned int A_internal_size1, unsigned int A_internal_size2,
1399 
1400  NumericT val,
1401  unsigned int options2,
1402 
1403  const NumericT * vec1,
1404  unsigned int start1,
1405  unsigned int inc1,
1406  unsigned int size1,
1407 
1408  const NumericT * vec2,
1409  unsigned int start2,
1410  unsigned int inc2,
1411  unsigned int size2)
1412 {
1413  NumericT alpha = val;
1414  if (options2 & (1 << 0))
1415  alpha = -alpha;
1416  if (options2 & (1 << 1))
1417  alpha = NumericT(1) / alpha;
1418 
1419  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1420  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1421 
1422  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1423  {
1424  NumericT tmp = alpha * vec1[row * inc1 + start1];
1425  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1426  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
1427  }
1428 }
1429 
1430 
1431 // alpha on GPU
1432 template<typename NumericT>
1434  NumericT * A,
1435  unsigned int A_start1, unsigned int A_start2,
1436  unsigned int A_inc1, unsigned int A_inc2,
1437  unsigned int A_size1, unsigned int A_size2,
1438  unsigned int A_internal_size1, unsigned int A_internal_size2,
1439 
1440  const NumericT * val,
1441  unsigned int options2,
1442 
1443  const NumericT * vec1,
1444  unsigned int start1,
1445  unsigned int inc1,
1446  unsigned int size1,
1447 
1448  const NumericT * vec2,
1449  unsigned int start2,
1450  unsigned int inc2,
1451  unsigned int size2)
1452 {
1453  NumericT alpha = *val;
1454  if (options2 & (1 << 0))
1455  alpha = -alpha;
1456  if (options2 & (1 << 1))
1457  alpha = NumericT(1) / alpha;
1458 
1459  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1460  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1461 
1462  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1463  {
1464  NumericT tmp = alpha * vec1[row * inc1 + start1];
1465  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1466  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
1467  }
1468 }
1469 
1470 
1471 
1472 } // namespace cuda
1473 } //namespace linalg
1474 } //namespace viennacl
1475 
1476 
1477 #endif
__global__ void element_op_int_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
__global__ void matrix_row_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
__global__ void ambm_m_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
__global__ void matrix_row_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
__global__ void trans_vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
__global__ void element_op_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:853
__global__ void scaled_rank1_update_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void trans_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_internal_size1, unsigned int A_internal_size2, unsigned int A_size1, unsigned int A_size2, unsigned int A_stride1, unsigned int A_stride2, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_internal_size1, unsigned int B_internal_size2, unsigned int B_stride1, unsigned int B_stride2, bool data_major)
__global__ void matrix_row_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__device__ unsigned int row_major_index(unsigned int row, unsigned int col, unsigned int, unsigned int num_cols)
__global__ void matrix_row_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void matrix_row_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__device__ unsigned int col_major_index(unsigned int row, unsigned int col, unsigned int num_rows, unsigned int)
__global__ void matrix_row_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)