ViennaCL - The Vienna Computing Library  1.6.1
Free open-source GPU-accelerated linear algebra and solver library.
matrix_operations_row.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
26 namespace viennacl
27 {
28 namespace linalg
29 {
30 namespace cuda
31 {
32 
33 //Matrix transpose kernel
34 template<typename NumericT>
35 __global__ void trans_kernel(
36  const NumericT * A,
37  unsigned int A_start1, unsigned int A_start2,
38  unsigned int A_internal_size1, unsigned int A_internal_size2,
39  unsigned int A_size1, unsigned int A_size2,
40  unsigned int A_stride1, unsigned int A_stride2,
41 
42  NumericT * B,
43  unsigned int B_start1, unsigned int B_start2,
44  unsigned int B_internal_size1, unsigned int B_internal_size2,
45  unsigned int B_stride1, unsigned int B_stride2,
46  bool data_major)
47 {
48  for(unsigned int row = blockIdx.x; row<A_size1; row+=gridDim.x)
49  {
50  for(unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x)
51  {
52  if(data_major)
53  B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
54  else
55  B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1];
56  }
57  }
58 }
59 
60 //
61 // am
62 //
63 
64 // alpha on CPU
65 template<typename NumericT>
66 __global__ void am_row_kernel(
67  NumericT * A,
68  unsigned int A_start1, unsigned int A_start2,
69  unsigned int A_inc1, unsigned int A_inc2,
70  unsigned int A_size1, unsigned int A_size2,
71  unsigned int A_internal_size1, unsigned int A_internal_size2,
72 
73  NumericT fac2,
74  unsigned int options2,
75  const NumericT * B,
76  unsigned int B_start1, unsigned int B_start2,
77  unsigned int B_inc1, unsigned int B_inc2,
78  unsigned int B_internal_size1, unsigned int B_internal_size2)
79 {
80  NumericT alpha = fac2;
81  if (options2 & (1 << 0))
82  alpha = -alpha;
83 
84  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
85  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
86 
87  if (options2 & (1 << 1))
88  {
89  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
90  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
91  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
92  }
93  else
94  {
95  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
96  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
97  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
98  }
99 }
100 
101 // alpha on GPU
102 template<typename NumericT>
103 __global__ void am_row_kernel(
104  NumericT * A,
105  unsigned int A_start1, unsigned int A_start2,
106  unsigned int A_inc1, unsigned int A_inc2,
107  unsigned int A_size1, unsigned int A_size2,
108  unsigned int A_internal_size1, unsigned int A_internal_size2,
109 
110  const NumericT * fac2,
111  unsigned int options2,
112  const NumericT * B,
113  unsigned int B_start1, unsigned int B_start2,
114  unsigned int B_inc1, unsigned int B_inc2,
115  unsigned int B_internal_size1, unsigned int B_internal_size2)
116 {
117  NumericT alpha = *fac2;
118  if (options2 & (1 << 0))
119  alpha = -alpha;
120 
121  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
122  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
123 
124  if (options2 & (1 << 1))
125  {
126  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
127  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
128  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
129  }
130  else
131  {
132  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
133  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
134  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
135  }
136 }
137 
138 
139 //
140 // ambm
141 //
142 
143 // alpha and beta on CPU
144 template<typename NumericT>
145 __global__ void ambm_row_kernel(
146  NumericT * A,
147  unsigned int A_start1, unsigned int A_start2,
148  unsigned int A_inc1, unsigned int A_inc2,
149  unsigned int A_size1, unsigned int A_size2,
150  unsigned int A_internal_size1, unsigned int A_internal_size2,
151 
152  NumericT fac2,
153  unsigned int options2,
154  const NumericT * B,
155  unsigned int B_start1, unsigned int B_start2,
156  unsigned int B_inc1, unsigned int B_inc2,
157  unsigned int B_internal_size1, unsigned int B_internal_size2,
158 
159  NumericT fac3,
160  unsigned int options3,
161  const NumericT * C,
162  unsigned int C_start1, unsigned int C_start2,
163  unsigned int C_inc1, unsigned int C_inc2,
164  unsigned int C_internal_size1, unsigned int C_internal_size2)
165 {
166  NumericT alpha = fac2;
167  if (options2 & (1 << 0))
168  alpha = -alpha;
169 
170  NumericT beta = fac3;
171  if (options3 & (1 << 0))
172  beta = -beta;
173 
174  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
175  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
176 
177  if (options2 & (1 << 1))
178  {
179  if (options3 & (1 << 1))
180  {
181  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
182  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
183  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
184  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
185  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
186  }
187  else
188  {
189  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
190  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
191  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
192  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
193  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
194  }
195  }
196  else
197  {
198  if (options3 & (1 << 1))
199  {
200  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
201  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
202  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
203  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
204  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
205  }
206  else
207  {
208  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
209  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
210  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
211  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
212  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
213  }
214  }
215 }
216 
217 
218 // alpha on CPU, beta on GPU
219 template<typename NumericT>
220 __global__ void ambm_row_kernel(
221  NumericT * A,
222  unsigned int A_start1, unsigned int A_start2,
223  unsigned int A_inc1, unsigned int A_inc2,
224  unsigned int A_size1, unsigned int A_size2,
225  unsigned int A_internal_size1, unsigned int A_internal_size2,
226 
227  NumericT fac2,
228  unsigned int options2,
229  const NumericT * B,
230  unsigned int B_start1, unsigned int B_start2,
231  unsigned int B_inc1, unsigned int B_inc2,
232  unsigned int B_internal_size1, unsigned int B_internal_size2,
233 
234  const NumericT * fac3,
235  unsigned int options3,
236  const NumericT * C,
237  unsigned int C_start1, unsigned int C_start2,
238  unsigned int C_inc1, unsigned int C_inc2,
239  unsigned int C_internal_size1, unsigned int C_internal_size2)
240 {
241  NumericT alpha = fac2;
242  if (options2 & (1 << 0))
243  alpha = -alpha;
244 
245  NumericT beta = *fac3;
246  if (options3 & (1 << 0))
247  beta = -beta;
248 
249  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
250  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
251 
252  if (options2 & (1 << 1))
253  {
254  if (options3 & (1 << 1))
255  {
256  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
257  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
258  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
259  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
260  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
261  }
262  else
263  {
264  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
265  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
266  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
267  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
268  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
269  }
270  }
271  else
272  {
273  if (options3 & (1 << 1))
274  {
275  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
276  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
277  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
278  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
279  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
280  }
281  else
282  {
283  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
284  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
285  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
286  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
287  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
288  }
289  }
290 }
291 
292 // alpha on GPU, beta on CPU
293 template<typename NumericT>
294 __global__ void ambm_row_kernel(
295  NumericT * A,
296  unsigned int A_start1, unsigned int A_start2,
297  unsigned int A_inc1, unsigned int A_inc2,
298  unsigned int A_size1, unsigned int A_size2,
299  unsigned int A_internal_size1, unsigned int A_internal_size2,
300 
301  const NumericT * fac2,
302  unsigned int options2,
303  const NumericT * B,
304  unsigned int B_start1, unsigned int B_start2,
305  unsigned int B_inc1, unsigned int B_inc2,
306  unsigned int B_internal_size1, unsigned int B_internal_size2,
307 
308  NumericT fac3,
309  unsigned int options3,
310  const NumericT * C,
311  unsigned int C_start1, unsigned int C_start2,
312  unsigned int C_inc1, unsigned int C_inc2,
313  unsigned int C_internal_size1, unsigned int C_internal_size2)
314 {
315  NumericT alpha = *fac2;
316  if (options2 & (1 << 0))
317  alpha = -alpha;
318 
319  NumericT beta = fac3;
320  if (options3 & (1 << 0))
321  beta = -beta;
322 
323  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
324  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
325 
326  if (options2 & (1 << 1))
327  {
328  if (options3 & (1 << 1))
329  {
330  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
331  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
332  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
333  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
334  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
335  }
336  else
337  {
338  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
339  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
340  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
341  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
342  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
343  }
344  }
345  else
346  {
347  if (options3 & (1 << 1))
348  {
349  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
350  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
351  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
352  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
353  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
354  }
355  else
356  {
357  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
358  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
359  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
360  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
361  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
362  }
363  }
364 }
365 
366 
367 // alpha and beta on GPU
368 template<typename NumericT>
369 __global__ void ambm_row_kernel(
370  NumericT * A,
371  unsigned int A_start1, unsigned int A_start2,
372  unsigned int A_inc1, unsigned int A_inc2,
373  unsigned int A_size1, unsigned int A_size2,
374  unsigned int A_internal_size1, unsigned int A_internal_size2,
375 
376  const NumericT * fac2,
377  unsigned int options2,
378  const NumericT * B,
379  unsigned int B_start1, unsigned int B_start2,
380  unsigned int B_inc1, unsigned int B_inc2,
381  unsigned int B_internal_size1, unsigned int B_internal_size2,
382 
383  const NumericT * fac3,
384  unsigned int options3,
385  const NumericT * C,
386  unsigned int C_start1, unsigned int C_start2,
387  unsigned int C_inc1, unsigned int C_inc2,
388  unsigned int C_internal_size1, unsigned int C_internal_size2)
389 {
390  NumericT alpha = *fac2;
391  if (options2 & (1 << 0))
392  alpha = -alpha;
393 
394  NumericT beta = *fac3;
395  if (options3 & (1 << 0))
396  beta = -beta;
397 
398  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
399  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
400 
401  if (options2 & (1 << 1))
402  {
403  if (options3 & (1 << 1))
404  {
405  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
406  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
407  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
408  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
409  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
410  }
411  else
412  {
413  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
414  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
415  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
416  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
417  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
418  }
419  }
420  else
421  {
422  if (options3 & (1 << 1))
423  {
424  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
425  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
426  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
427  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
428  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
429  }
430  else
431  {
432  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
433  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
434  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
435  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
436  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
437  }
438  }
439 }
440 
441 
442 //
443 // ambm_m
444 //
445 
446 // alpha and beta on CPU
447 template<typename NumericT>
448 __global__ void ambm_m_row_kernel(
449  NumericT * A,
450  unsigned int A_start1, unsigned int A_start2,
451  unsigned int A_inc1, unsigned int A_inc2,
452  unsigned int A_size1, unsigned int A_size2,
453  unsigned int A_internal_size1, unsigned int A_internal_size2,
454 
455  NumericT fac2,
456  unsigned int options2,
457  const NumericT * B,
458  unsigned int B_start1, unsigned int B_start2,
459  unsigned int B_inc1, unsigned int B_inc2,
460  unsigned int B_internal_size1, unsigned int B_internal_size2,
461 
462  NumericT fac3,
463  unsigned int options3,
464  const NumericT * C,
465  unsigned int C_start1, unsigned int C_start2,
466  unsigned int C_inc1, unsigned int C_inc2,
467  unsigned int C_internal_size1, unsigned int C_internal_size2)
468 {
469  NumericT alpha = fac2;
470  if (options2 & (1 << 0))
471  alpha = -alpha;
472 
473  NumericT beta = fac3;
474  if (options3 & (1 << 0))
475  beta = -beta;
476 
477  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
478  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
479 
480  if (options2 & (1 << 1))
481  {
482  if (options3 & (1 << 1))
483  {
484  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
485  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
486  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
487  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
488  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
489  }
490  else
491  {
492  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
493  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
494  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
495  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
496  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
497  }
498  }
499  else
500  {
501  if (options3 & (1 << 1))
502  {
503  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
504  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
505  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
506  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
507  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
508  }
509  else
510  {
511  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
512  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
513  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
514  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
515  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
516  }
517  }
518 }
519 
520 
521 // alpha on CPU, beta on GPU
522 template<typename NumericT>
523 __global__ void ambm_m_row_kernel(
524  NumericT * A,
525  unsigned int A_start1, unsigned int A_start2,
526  unsigned int A_inc1, unsigned int A_inc2,
527  unsigned int A_size1, unsigned int A_size2,
528  unsigned int A_internal_size1, unsigned int A_internal_size2,
529 
530  NumericT fac2,
531  unsigned int options2,
532  const NumericT * B,
533  unsigned int B_start1, unsigned int B_start2,
534  unsigned int B_inc1, unsigned int B_inc2,
535  unsigned int B_internal_size1, unsigned int B_internal_size2,
536 
537  const NumericT * fac3,
538  unsigned int options3,
539  const NumericT * C,
540  unsigned int C_start1, unsigned int C_start2,
541  unsigned int C_inc1, unsigned int C_inc2,
542  unsigned int C_internal_size1, unsigned int C_internal_size2)
543 {
544  NumericT alpha = fac2;
545  if (options2 & (1 << 0))
546  alpha = -alpha;
547 
548  NumericT beta = *fac3;
549  if (options3 & (1 << 0))
550  beta = -beta;
551 
552  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
553  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
554 
555  if (options2 & (1 << 1))
556  {
557  if (options3 & (1 << 1))
558  {
559  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
560  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
561  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
562  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
563  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
564  }
565  else
566  {
567  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
568  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
569  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
570  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
571  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
572  }
573  }
574  else
575  {
576  if (options3 & (1 << 1))
577  {
578  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
579  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
580  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
581  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
582  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
583  }
584  else
585  {
586  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
587  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
588  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
589  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
590  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
591  }
592  }
593 }
594 
595 // alpha on GPU, beta on CPU
596 template<typename NumericT>
597 __global__ void ambm_m_row_kernel(
598  NumericT * A,
599  unsigned int A_start1, unsigned int A_start2,
600  unsigned int A_inc1, unsigned int A_inc2,
601  unsigned int A_size1, unsigned int A_size2,
602  unsigned int A_internal_size1, unsigned int A_internal_size2,
603 
604  const NumericT * fac2,
605  unsigned int options2,
606  const NumericT * B,
607  unsigned int B_start1, unsigned int B_start2,
608  unsigned int B_inc1, unsigned int B_inc2,
609  unsigned int B_internal_size1, unsigned int B_internal_size2,
610 
611  NumericT fac3,
612  unsigned int options3,
613  const NumericT * C,
614  unsigned int C_start1, unsigned int C_start2,
615  unsigned int C_inc1, unsigned int C_inc2,
616  unsigned int C_internal_size1, unsigned int C_internal_size2)
617 {
618  NumericT alpha = *fac2;
619  if (options2 & (1 << 0))
620  alpha = -alpha;
621 
622  NumericT beta = fac3;
623  if (options3 & (1 << 0))
624  beta = -beta;
625 
626  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
627  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
628 
629  if (options2 & (1 << 1))
630  {
631  if (options3 & (1 << 1))
632  {
633  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
634  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
635  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
636  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
637  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
638  }
639  else
640  {
641  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
642  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
643  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
644  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
645  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
646  }
647  }
648  else
649  {
650  if (options3 & (1 << 1))
651  {
652  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
653  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
654  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
655  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
656  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
657  }
658  else
659  {
660  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
661  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
662  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
663  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
664  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
665  }
666  }
667 }
668 
669 
670 // alpha and beta on GPU
671 template<typename NumericT>
672 __global__ void ambm_m_row_kernel(
673  NumericT * A,
674  unsigned int A_start1, unsigned int A_start2,
675  unsigned int A_inc1, unsigned int A_inc2,
676  unsigned int A_size1, unsigned int A_size2,
677  unsigned int A_internal_size1, unsigned int A_internal_size2,
678 
679  const NumericT * fac2,
680  unsigned int options2,
681  const NumericT * B,
682  unsigned int B_start1, unsigned int B_start2,
683  unsigned int B_inc1, unsigned int B_inc2,
684  unsigned int B_internal_size1, unsigned int B_internal_size2,
685 
686  const NumericT * fac3,
687  unsigned int options3,
688  const NumericT * C,
689  unsigned int C_start1, unsigned int C_start2,
690  unsigned int C_inc1, unsigned int C_inc2,
691  unsigned int C_internal_size1, unsigned int C_internal_size2)
692 {
693  NumericT alpha = *fac2;
694  if (options2 & (1 << 0))
695  alpha = -alpha;
696 
697  NumericT beta = *fac3;
698  if (options3 & (1 << 0))
699  beta = -beta;
700 
701  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
702  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
703 
704  if (options2 & (1 << 1))
705  {
706  if (options3 & (1 << 1))
707  {
708  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
709  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
710  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
711  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
712  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
713  }
714  else
715  {
716  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
717  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
718  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
719  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
720  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
721  }
722  }
723  else
724  {
725  if (options3 & (1 << 1))
726  {
727  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
728  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
729  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
730  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
731  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
732  }
733  else
734  {
735  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
736  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
737  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
738  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
739  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
740  }
741  }
742 }
743 
744 //
745 // assignments
746 //
747 
748 template<typename NumericT>
749 __global__ void matrix_row_assign_kernel(
750  NumericT * A,
751  unsigned int A_start1, unsigned int A_start2,
752  unsigned int A_inc1, unsigned int A_inc2,
753  unsigned int A_size1, unsigned int A_size2,
754  unsigned int A_internal_size1, unsigned int A_internal_size2,
755  NumericT alpha)
756 {
757  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
758  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
759 
760  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
761  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
762  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
763 }
764 
765 
766 template<typename NumericT>
768  NumericT * A,
769  unsigned int A_start1, unsigned int A_start2,
770  unsigned int A_inc1, unsigned int A_inc2,
771  unsigned int A_size1, unsigned int A_size2,
772  unsigned int A_internal_size1, unsigned int A_internal_size2,
773  NumericT alpha)
774 {
775  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
776 
777  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
778  A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] = alpha;
779 }
780 
781 //
782 // binary element-wise operations
783 //
784 
785 template<typename NumericT>
786 __global__ void element_op_row_kernel(
787  NumericT * A,
788  unsigned int A_start1, unsigned int A_start2,
789  unsigned int A_inc1, unsigned int A_inc2,
790  unsigned int A_size1, unsigned int A_size2,
791  unsigned int A_internal_size1, unsigned int A_internal_size2,
792 
793  const NumericT * B,
794  unsigned int B_start1, unsigned int B_start2,
795  unsigned int B_inc1, unsigned int B_inc2,
796  unsigned int B_internal_size1, unsigned int B_internal_size2,
797 
798  const NumericT * C,
799  unsigned int C_start1, unsigned int C_start2,
800  unsigned int C_inc1, unsigned int C_inc2,
801  unsigned int C_internal_size1, unsigned int C_internal_size2,
802 
803  unsigned int op_type) //0: product, 1: division, 2: pow
804 {
805  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
806  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
807 
808  if (op_type == 2)
809  {
810  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
811  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
812  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
813  = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
814  C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
815  }
816  else if (op_type == 1)
817  {
818  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
819  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
820  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
821  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
822  / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
823  }
824  else if (op_type == 0)
825  {
826  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
827  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
828  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
829  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
830  * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
831  }
832 }
833 
834 template<typename NumericT>
836  NumericT * A,
837  unsigned int A_start1, unsigned int A_start2,
838  unsigned int A_inc1, unsigned int A_inc2,
839  unsigned int A_size1, unsigned int A_size2,
840  unsigned int A_internal_size1, unsigned int A_internal_size2,
841 
842  const NumericT * B,
843  unsigned int B_start1, unsigned int B_start2,
844  unsigned int B_inc1, unsigned int B_inc2,
845  unsigned int B_internal_size1, unsigned int B_internal_size2,
846 
847  const NumericT * C,
848  unsigned int C_start1, unsigned int C_start2,
849  unsigned int C_inc1, unsigned int C_inc2,
850  unsigned int C_internal_size1, unsigned int C_internal_size2,
851 
852  unsigned int op_type) //0: product, 1: division, 2: pow
853 {
854  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
855  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
856 
857  if (op_type == 1)
858  {
859  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
860  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
861  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
862  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
863  / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
864  }
865  else if (op_type == 0)
866  {
867  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
868  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
869  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
870  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
871  * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
872  }
873 }
874 
875 //
876 // unary element-wise operations
877 //
878 
879 // abs
880 template<typename NumericT>
882  NumericT * A,
883  unsigned int A_start1, unsigned int A_start2,
884  unsigned int A_inc1, unsigned int A_inc2,
885  unsigned int A_size1, unsigned int A_size2,
886  unsigned int A_internal_size1, unsigned int A_internal_size2,
887 
888  const NumericT * B,
889  unsigned int B_start1, unsigned int B_start2,
890  unsigned int B_inc1, unsigned int B_inc2,
891  unsigned int B_internal_size1, unsigned int B_internal_size2)
892 {
893  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
894  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
895 
896  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
897  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
898  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
899 }
900 
901 
902 // acos
903 template<typename NumericT>
905  NumericT * A,
906  unsigned int A_start1, unsigned int A_start2,
907  unsigned int A_inc1, unsigned int A_inc2,
908  unsigned int A_size1, unsigned int A_size2,
909  unsigned int A_internal_size1, unsigned int A_internal_size2,
910 
911  const NumericT * B,
912  unsigned int B_start1, unsigned int B_start2,
913  unsigned int B_inc1, unsigned int B_inc2,
914  unsigned int B_internal_size1, unsigned int B_internal_size2)
915 {
916  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
917  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
918 
919  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
920  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
921  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
922 }
923 
924 
925 // asin
926 template<typename NumericT>
928  NumericT * A,
929  unsigned int A_start1, unsigned int A_start2,
930  unsigned int A_inc1, unsigned int A_inc2,
931  unsigned int A_size1, unsigned int A_size2,
932  unsigned int A_internal_size1, unsigned int A_internal_size2,
933 
934  const NumericT * B,
935  unsigned int B_start1, unsigned int B_start2,
936  unsigned int B_inc1, unsigned int B_inc2,
937  unsigned int B_internal_size1, unsigned int B_internal_size2)
938 {
939  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
940  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
941 
942  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
943  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
944  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
945 }
946 
947 
948 // atan
949 template<typename NumericT>
951  NumericT * A,
952  unsigned int A_start1, unsigned int A_start2,
953  unsigned int A_inc1, unsigned int A_inc2,
954  unsigned int A_size1, unsigned int A_size2,
955  unsigned int A_internal_size1, unsigned int A_internal_size2,
956 
957  const NumericT * B,
958  unsigned int B_start1, unsigned int B_start2,
959  unsigned int B_inc1, unsigned int B_inc2,
960  unsigned int B_internal_size1, unsigned int B_internal_size2)
961 {
962  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
963  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
964 
965  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
966  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
967  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
968 }
969 
970 
971 // ceil
972 template<typename NumericT>
974  NumericT * A,
975  unsigned int A_start1, unsigned int A_start2,
976  unsigned int A_inc1, unsigned int A_inc2,
977  unsigned int A_size1, unsigned int A_size2,
978  unsigned int A_internal_size1, unsigned int A_internal_size2,
979 
980  const NumericT * B,
981  unsigned int B_start1, unsigned int B_start2,
982  unsigned int B_inc1, unsigned int B_inc2,
983  unsigned int B_internal_size1, unsigned int B_internal_size2)
984 {
985  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
986  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
987 
988  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
989  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
990  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
991 }
992 
993 
994 // cos
995 template<typename NumericT>
997  NumericT * A,
998  unsigned int A_start1, unsigned int A_start2,
999  unsigned int A_inc1, unsigned int A_inc2,
1000  unsigned int A_size1, unsigned int A_size2,
1001  unsigned int A_internal_size1, unsigned int A_internal_size2,
1002 
1003  const NumericT * B,
1004  unsigned int B_start1, unsigned int B_start2,
1005  unsigned int B_inc1, unsigned int B_inc2,
1006  unsigned int B_internal_size1, unsigned int B_internal_size2)
1007 {
1008  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1009  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1010 
1011  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1012  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1013  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1014 }
1015 
1016 
1017 // cosh
1018 template<typename NumericT>
1020  NumericT * A,
1021  unsigned int A_start1, unsigned int A_start2,
1022  unsigned int A_inc1, unsigned int A_inc2,
1023  unsigned int A_size1, unsigned int A_size2,
1024  unsigned int A_internal_size1, unsigned int A_internal_size2,
1025 
1026  const NumericT * B,
1027  unsigned int B_start1, unsigned int B_start2,
1028  unsigned int B_inc1, unsigned int B_inc2,
1029  unsigned int B_internal_size1, unsigned int B_internal_size2)
1030 {
1031  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1032  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1033 
1034  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1035  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1036  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1037 }
1038 
1039 
1040 // exp
1041 template<typename NumericT>
1043  NumericT * A,
1044  unsigned int A_start1, unsigned int A_start2,
1045  unsigned int A_inc1, unsigned int A_inc2,
1046  unsigned int A_size1, unsigned int A_size2,
1047  unsigned int A_internal_size1, unsigned int A_internal_size2,
1048 
1049  const NumericT * B,
1050  unsigned int B_start1, unsigned int B_start2,
1051  unsigned int B_inc1, unsigned int B_inc2,
1052  unsigned int B_internal_size1, unsigned int B_internal_size2)
1053 {
1054  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1055  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1056 
1057  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1058  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1059  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1060 }
1061 
1062 
1063 // fabs
1064 template<typename NumericT>
1066  NumericT * A,
1067  unsigned int A_start1, unsigned int A_start2,
1068  unsigned int A_inc1, unsigned int A_inc2,
1069  unsigned int A_size1, unsigned int A_size2,
1070  unsigned int A_internal_size1, unsigned int A_internal_size2,
1071 
1072  const NumericT * B,
1073  unsigned int B_start1, unsigned int B_start2,
1074  unsigned int B_inc1, unsigned int B_inc2,
1075  unsigned int B_internal_size1, unsigned int B_internal_size2)
1076 {
1077  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1078  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1079 
1080  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1081  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1082  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1083 }
1084 
1085 
1086 // floor
1087 template<typename NumericT>
1089  NumericT * A,
1090  unsigned int A_start1, unsigned int A_start2,
1091  unsigned int A_inc1, unsigned int A_inc2,
1092  unsigned int A_size1, unsigned int A_size2,
1093  unsigned int A_internal_size1, unsigned int A_internal_size2,
1094 
1095  const NumericT * B,
1096  unsigned int B_start1, unsigned int B_start2,
1097  unsigned int B_inc1, unsigned int B_inc2,
1098  unsigned int B_internal_size1, unsigned int B_internal_size2)
1099 {
1100  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1101  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1102 
1103  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1104  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1105  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1106 }
1107 
1108 
1109 // log
1110 template<typename NumericT>
1112  NumericT * A,
1113  unsigned int A_start1, unsigned int A_start2,
1114  unsigned int A_inc1, unsigned int A_inc2,
1115  unsigned int A_size1, unsigned int A_size2,
1116  unsigned int A_internal_size1, unsigned int A_internal_size2,
1117 
1118  const NumericT * B,
1119  unsigned int B_start1, unsigned int B_start2,
1120  unsigned int B_inc1, unsigned int B_inc2,
1121  unsigned int B_internal_size1, unsigned int B_internal_size2)
1122 {
1123  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1124  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1125 
1126  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1127  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1128  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1129 }
1130 
1131 
1132 // log10
1133 template<typename NumericT>
1135  NumericT * A,
1136  unsigned int A_start1, unsigned int A_start2,
1137  unsigned int A_inc1, unsigned int A_inc2,
1138  unsigned int A_size1, unsigned int A_size2,
1139  unsigned int A_internal_size1, unsigned int A_internal_size2,
1140 
1141  const NumericT * B,
1142  unsigned int B_start1, unsigned int B_start2,
1143  unsigned int B_inc1, unsigned int B_inc2,
1144  unsigned int B_internal_size1, unsigned int B_internal_size2)
1145 {
1146  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1147  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1148 
1149  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1150  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1151  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1152 }
1153 
1154 
1155 // sin
1156 template<typename NumericT>
1158  NumericT * A,
1159  unsigned int A_start1, unsigned int A_start2,
1160  unsigned int A_inc1, unsigned int A_inc2,
1161  unsigned int A_size1, unsigned int A_size2,
1162  unsigned int A_internal_size1, unsigned int A_internal_size2,
1163 
1164  const NumericT * B,
1165  unsigned int B_start1, unsigned int B_start2,
1166  unsigned int B_inc1, unsigned int B_inc2,
1167  unsigned int B_internal_size1, unsigned int B_internal_size2)
1168 {
1169  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1170  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1171 
1172  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1173  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1174  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1175 }
1176 
1177 
1178 // sinh
1179 template<typename NumericT>
1181  NumericT * A,
1182  unsigned int A_start1, unsigned int A_start2,
1183  unsigned int A_inc1, unsigned int A_inc2,
1184  unsigned int A_size1, unsigned int A_size2,
1185  unsigned int A_internal_size1, unsigned int A_internal_size2,
1186 
1187  const NumericT * B,
1188  unsigned int B_start1, unsigned int B_start2,
1189  unsigned int B_inc1, unsigned int B_inc2,
1190  unsigned int B_internal_size1, unsigned int B_internal_size2)
1191 {
1192  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1193  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1194 
1195  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1196  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1197  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1198 }
1199 
1200 
1201 // sqrt
1202 template<typename NumericT>
1204  NumericT * A,
1205  unsigned int A_start1, unsigned int A_start2,
1206  unsigned int A_inc1, unsigned int A_inc2,
1207  unsigned int A_size1, unsigned int A_size2,
1208  unsigned int A_internal_size1, unsigned int A_internal_size2,
1209 
1210  const NumericT * B,
1211  unsigned int B_start1, unsigned int B_start2,
1212  unsigned int B_inc1, unsigned int B_inc2,
1213  unsigned int B_internal_size1, unsigned int B_internal_size2)
1214 {
1215  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1216  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1217 
1218  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1219  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1220  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1221 }
1222 
1223 
1224 // tan
1225 template<typename NumericT>
1227  NumericT * A,
1228  unsigned int A_start1, unsigned int A_start2,
1229  unsigned int A_inc1, unsigned int A_inc2,
1230  unsigned int A_size1, unsigned int A_size2,
1231  unsigned int A_internal_size1, unsigned int A_internal_size2,
1232 
1233  const NumericT * B,
1234  unsigned int B_start1, unsigned int B_start2,
1235  unsigned int B_inc1, unsigned int B_inc2,
1236  unsigned int B_internal_size1, unsigned int B_internal_size2)
1237 {
1238  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1239  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1240 
1241  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1242  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1243  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1244 }
1245 
1246 
1247 // tanh
1248 template<typename NumericT>
1250  NumericT * A,
1251  unsigned int A_start1, unsigned int A_start2,
1252  unsigned int A_inc1, unsigned int A_inc2,
1253  unsigned int A_size1, unsigned int A_size2,
1254  unsigned int A_internal_size1, unsigned int A_internal_size2,
1255 
1256  const NumericT * B,
1257  unsigned int B_start1, unsigned int B_start2,
1258  unsigned int B_inc1, unsigned int B_inc2,
1259  unsigned int B_internal_size1, unsigned int B_internal_size2)
1260 {
1261  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1262  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1263 
1264  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1265  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1266  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1267 }
1268 
1269 
1270 
1271 //
1272 // matrix-vector product
1273 //
1274 
1275 template<typename NumericT>
1276 __global__ void vec_mul_row_kernel(
1277  const NumericT * A,
1278  unsigned int A_row_start,
1279  unsigned int A_col_start,
1280  unsigned int A_row_inc,
1281  unsigned int A_col_inc,
1282  unsigned int A_row_size,
1283  unsigned int A_col_size,
1284  unsigned int A_internal_rows,
1285  unsigned int A_internal_cols,
1286  const NumericT * v,
1287  unsigned int v_start,
1288  unsigned int v_inc,
1289  unsigned int v_size,
1290  NumericT * result,
1291  unsigned int result_start,
1292  unsigned int result_inc,
1293  unsigned int result_size)
1294 {
1295  __shared__ NumericT work[128];
1296 
1297  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1298  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1299  unsigned int lid = threadIdx.x;
1300 
1301  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
1302  {
1303  NumericT dot_prod = 0;
1304  for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
1305  dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
1306  work[lid] = dot_prod;
1307 
1308  for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
1309  __syncthreads();
1310  if (lid < stride)
1311  work[lid] += work[lid+stride];
1312  }
1313 
1314  if (lid == 0)
1315  result[row * result_inc + result_start] = work[0];
1316  }
1317 }
1318 
1319 
1320 template<typename NumericT>
1322  const NumericT * A,
1323  unsigned int A_row_start,
1324  unsigned int A_col_start,
1325  unsigned int A_row_inc,
1326  unsigned int A_col_inc,
1327  unsigned int A_row_size,
1328  unsigned int A_col_size,
1329  unsigned int A_internal_rows,
1330  unsigned int A_internal_cols,
1331  const NumericT * v,
1332  unsigned int v_start,
1333  unsigned int v_inc,
1334  unsigned int v_size,
1335  NumericT * result,
1336  unsigned int result_start,
1337  unsigned int result_inc,
1338  unsigned int result_size)
1339 {
1340  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
1341  {
1342  NumericT dot_prod = 0;
1343  for (unsigned int col = 0; col < A_row_size; ++col)
1344  dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
1345  result[row * result_inc + result_start] = dot_prod;
1346  }
1347 }
1348 
1349 
1350 //
1351 // matrix-matrix products
1352 //
1353 
1354 
1355 
1356 
1357 //
1358 // scaled rank-1-update
1359 //
1360 
1361 // alpha on CPU
1362 template<typename NumericT>
1364  NumericT * A,
1365  unsigned int A_start1, unsigned int A_start2,
1366  unsigned int A_inc1, unsigned int A_inc2,
1367  unsigned int A_size1, unsigned int A_size2,
1368  unsigned int A_internal_size1, unsigned int A_internal_size2,
1369 
1370  NumericT val,
1371  unsigned int options2,
1372 
1373  const NumericT * vec1,
1374  unsigned int start1,
1375  unsigned int inc1,
1376  unsigned int size1,
1377 
1378  const NumericT * vec2,
1379  unsigned int start2,
1380  unsigned int inc2,
1381  unsigned int size2)
1382 {
1383  NumericT alpha = val;
1384  if (options2 & (1 << 0))
1385  alpha = -alpha;
1386  if (options2 & (1 << 1))
1387  alpha = NumericT(1) / alpha;
1388 
1389  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1390  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1391 
1392  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1393  {
1394  NumericT tmp = alpha * vec1[row * inc1 + start1];
1395  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1396  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
1397  }
1398 }
1399 
1400 
1401 // alpha on GPU
1402 template<typename NumericT>
1404  NumericT * A,
1405  unsigned int A_start1, unsigned int A_start2,
1406  unsigned int A_inc1, unsigned int A_inc2,
1407  unsigned int A_size1, unsigned int A_size2,
1408  unsigned int A_internal_size1, unsigned int A_internal_size2,
1409 
1410  const NumericT * val,
1411  unsigned int options2,
1412 
1413  const NumericT * vec1,
1414  unsigned int start1,
1415  unsigned int inc1,
1416  unsigned int size1,
1417 
1418  const NumericT * vec2,
1419  unsigned int start2,
1420  unsigned int inc2,
1421  unsigned int size2)
1422 {
1423  NumericT alpha = *val;
1424  if (options2 & (1 << 0))
1425  alpha = -alpha;
1426  if (options2 & (1 << 1))
1427  alpha = NumericT(1) / alpha;
1428 
1429  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1430  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1431 
1432  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1433  {
1434  NumericT tmp = alpha * vec1[row * inc1 + start1];
1435  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1436  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
1437  }
1438 }
1439 
1440 
1441 
1442 } // namespace cuda
1443 } //namespace linalg
1444 } //namespace viennacl
1445 
1446 
1447 #endif
__global__ void element_op_int_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
__global__ void matrix_row_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
__global__ void ambm_m_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
__global__ void matrix_row_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
__global__ void trans_vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
__global__ void element_op_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:853
__global__ void scaled_rank1_update_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void trans_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_internal_size1, unsigned int A_internal_size2, unsigned int A_size1, unsigned int A_size2, unsigned int A_stride1, unsigned int A_stride2, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_internal_size1, unsigned int B_internal_size2, unsigned int B_stride1, unsigned int B_stride2, bool data_major)
__global__ void matrix_row_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void matrix_row_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)