1 #ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
/**
 * @brief Adds two float2 values component-wise.
 *
 * @param a  Left operand.
 * @param b  Right operand.
 * @return   float2 holding (a.x + b.x, a.y + b.y).
 */
inline __host__ __device__ float2
operator+(float2 a, float2 b)
{
  return make_float2(a.x + b.x, a.y + b.y);
}
/**
 * @brief Subtracts two float2 values component-wise.
 *
 * @param a  Left operand (minuend).
 * @param b  Right operand (subtrahend).
 * @return   float2 holding (a.x - b.x, a.y - b.y).
 */
inline __host__ __device__ float2
operator-(float2 a, float2 b)
{
  return make_float2(a.x - b.x, a.y - b.y);
}
/**
 * @brief Divides both components of a float2 by a scalar.
 *
 * Callable from host and device code, matching the double2 overload of this operator.
 *
 * @tparam SCALARTYPE  Type of the divisor; must be usable as the right operand of float division.
 * @param a            Value whose components are divided.
 * @param b            Divisor applied to both components.
 * @return             float2 holding (a.x / b, a.y / b).
 */
template<typename SCALARTYPE>
inline __host__ __device__ float2
operator/(float2 a, SCALARTYPE b)
{
  return make_float2(a.x / b, a.y / b);
}
/**
 * @brief Complex product of two float2 values, with x as the real and y as the imaginary part.
 *
 * Callable from host and device code, matching the double2 overload of this operator.
 *
 * @param in1  Left factor.
 * @param in2  Right factor.
 * @return     float2 holding (in1.x*in2.x - in1.y*in2.y, in1.x*in2.y + in1.y*in2.x).
 */
inline __host__ __device__ float2
operator*(float2 in1, float2 in2)
{
  return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}
/**
 * @brief Adds two double2 values component-wise.
 *
 * @param a  Left operand.
 * @param b  Right operand.
 * @return   double2 holding (a.x + b.x, a.y + b.y).
 */
inline __host__ __device__ double2
operator+(double2 a, double2 b)
{
  return make_double2(a.x + b.x, a.y + b.y);
}
/**
 * @brief Subtracts two double2 values component-wise.
 *
 * @param a  Left operand (minuend).
 * @param b  Right operand (subtrahend).
 * @return   double2 holding (a.x - b.x, a.y - b.y).
 */
inline __host__ __device__ double2
operator-(double2 a, double2 b)
{
  return make_double2(a.x - b.x, a.y - b.y);
}
/**
 * @brief Divides both components of a double2 by a scalar.
 *
 * @tparam SCALARTYPE  Type of the divisor; must be usable as the right operand of double division.
 * @param a            Value whose components are divided.
 * @param b            Divisor applied to both components.
 * @return             double2 holding (a.x / b, a.y / b).
 */
template<typename SCALARTYPE>
inline __host__ __device__ double2
operator/(double2 a, SCALARTYPE b)
{
  return make_double2(a.x / b, a.y / b);
}
/**
 * @brief Complex product of two double2 values, with x as the real and y as the imaginary part.
 *
 * @param in1  Left factor.
 * @param in2  Right factor.
 * @return     double2 holding (in1.x*in2.x - in1.y*in2.y, in1.x*in2.y + in1.y*in2.x).
 */
inline __host__ __device__ double2
operator*(double2 in1, double2 in2)
{
  return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}
// Reverse the bit order of v by successive swaps: adjacent bits, then 2-bit pairs,
// then nibbles, then bytes, and finally the two 16-bit halves (masks assume a 32-bit v).
129 v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
130 v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
131 v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
132 v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
133 v = (v >> 16) | (v << 16);
// Shift the reversed word down so that only bit_size bits remain in the low positions.
134 v = v >> (32 - bit_size);
138 template<
typename Numeric2T,
typename NumericT>
140 const Numeric2T * input,
144 unsigned int batch_num,
149 const NumericT NUM_PI(3.14159265358979323846);
151 for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
153 for (
unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
159 for (
unsigned int n = 0; n <
size; n++)
163 in = input[batch_id * stride + n];
165 in = input[n * stride + batch_id];
168 NumericT arg = sign * 2 * NUM_PI * k / size * n;
176 tmp.x = in.x * ex.x - in.y * ex.y;
177 tmp.y = in.x * ex.y + in.y * ex.x;
182 output[batch_id * stride + k] = f;
184 output[k * stride + batch_id] = f;
195 template<
typename NumericT,
unsigned int AlignmentV>
199 NumericT
sign = NumericT(-1),
204 fft_direct<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
205 reinterpret_cast< numeric2_type *
>(detail::cuda_arg<NumericT>(out)),
206 static_cast<unsigned int>(
size),
207 static_cast<unsigned int>(stride),
208 static_cast<unsigned int>(batch_num),
210 static_cast<bool>(data_order));
220 template<
typename NumericT,
unsigned int AlignmentV>
224 NumericT
sign = NumericT(-1),
229 fft_direct<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
230 reinterpret_cast< numeric2_type *
>(detail::cuda_arg<NumericT>(out)),
231 static_cast<unsigned int>(
size),
232 static_cast<unsigned int>(stride),
233 static_cast<unsigned int>(batch_num),
235 static_cast<bool>(data_order));
239 template<
typename NumericT>
241 unsigned int bit_size,
244 unsigned int batch_num,
248 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
249 unsigned int glb_sz = gridDim.x * blockDim.x;
251 for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
253 for (
unsigned int i = glb_id; i <
size; i += glb_sz)
261 NumericT tmp = input[batch_id * stride + i];
262 input[batch_id * stride + i] = input[batch_id * stride + v];
263 input[batch_id * stride + v] = tmp;
267 NumericT tmp = input[i * stride + batch_id];
268 input[i * stride + batch_id] = input[v * stride + batch_id];
269 input[v * stride + batch_id] = tmp;
280 template<
typename NumericT,
unsigned int AlignmentV>
287 fft_reorder<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
288 static_cast<unsigned int>(bits_datasize),
289 static_cast<unsigned int>(size),
290 static_cast<unsigned int>(
stride),
291 static_cast<unsigned int>(batch_num),
292 static_cast<bool>(data_order));
296 template<
typename Numeric2T,
typename NumericT>
298 unsigned int bit_size,
301 unsigned int batch_num,
305 __shared__ Numeric2T lcl_input[1024];
306 unsigned int grp_id = blockIdx.x;
307 unsigned int grp_num = gridDim.x;
309 unsigned int lcl_sz = blockDim.x;
310 unsigned int lcl_id = threadIdx.x;
311 const NumericT NUM_PI(3.14159265358979323846);
313 for (
unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
315 for (
unsigned int p = lcl_id; p <
size; p += lcl_sz)
319 lcl_input[v] = input[batch_id * stride + p];
321 lcl_input[v] = input[p * stride + batch_id];
327 for (
unsigned int s = 0; s < bit_size; s++)
329 unsigned int ss = 1 << s;
331 for (
unsigned int tid = lcl_id; tid <
size; tid += lcl_sz)
333 unsigned int group = (tid & (ss - 1));
334 unsigned int pos = ((tid >> s) << (s + 1)) + group;
336 Numeric2T in1 = lcl_input[pos];
337 Numeric2T in2 = lcl_input[pos + ss];
339 NumericT arg = group * sign * NUM_PI / ss;
348 tmp.x = in2.x * ex.x - in2.y * ex.y;
349 tmp.y = in2.x * ex.y + in2.y * ex.x;
351 lcl_input[pos + ss] = in1 - tmp;
352 lcl_input[pos] = in1 + tmp;
358 for (
unsigned int p = lcl_id; p <
size; p += lcl_sz)
361 input[batch_id * stride + p] = lcl_input[p];
363 input[p * stride + batch_id] = lcl_input[p];
369 template<
typename Numeric2T,
typename NumericT>
372 unsigned int bit_size,
375 unsigned int batch_num,
380 unsigned int ss = 1 << s;
381 unsigned int half_size = size >> 1;
384 const NumericT NUM_PI(3.14159265358979323846);
386 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
387 unsigned int glb_sz = gridDim.x * blockDim.x;
389 for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
391 for (
unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
393 unsigned int group = (tid & (ss - 1));
394 unsigned int pos = ((tid >> s) << (s + 1)) + group;
400 offset = batch_id * stride + pos;
402 in2 = input[offset + ss];
406 offset = pos * stride + batch_id;
408 in2 = input[offset + ss *
stride];
411 NumericT arg = group * sign * NUM_PI / ss;
421 tmp.x = in2.x * ex.x - in2.y * ex.y;
422 tmp.y = in2.x * ex.y + in2.y * ex.x;
425 input[offset + ss] = in1 - tmp;
427 input[offset + ss *
stride] = in1 - tmp;
428 input[offset] = in1 + tmp;
440 template<
typename NumericT,
unsigned int AlignmentV>
451 fft_radix2_local<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
452 static_cast<unsigned int>(bit_size),
453 static_cast<unsigned int>(size),
454 static_cast<unsigned int>(
stride),
455 static_cast<unsigned int>(batch_num),
456 static_cast<NumericT
>(
sign),
457 static_cast<bool>(data_order));
462 fft_reorder<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
463 static_cast<unsigned int>(bit_size),
464 static_cast<unsigned int>(size),
465 static_cast<unsigned int>(
stride),
466 static_cast<unsigned int>(batch_num),
467 static_cast<bool>(data_order));
472 fft_radix2<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
473 static_cast<unsigned int>(
step),
474 static_cast<unsigned int>(bit_size),
475 static_cast<unsigned int>(
size),
476 static_cast<unsigned int>(stride),
477 static_cast<unsigned int>(batch_num),
479 static_cast<bool>(data_order));
492 template<
typename NumericT,
unsigned int AlignmentV>
503 fft_radix2_local<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
504 static_cast<unsigned int>(bit_size),
505 static_cast<unsigned int>(size),
506 static_cast<unsigned int>(
stride),
507 static_cast<unsigned int>(batch_num),
509 static_cast<bool>(data_order));
514 fft_reorder<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
515 static_cast<unsigned int>(bit_size),
516 static_cast<unsigned int>(size),
517 static_cast<unsigned int>(
stride),
518 static_cast<unsigned int>(batch_num),
519 static_cast<bool>(data_order));
523 fft_radix2<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
524 static_cast<unsigned int>(
step),
525 static_cast<unsigned int>(bit_size),
526 static_cast<unsigned int>(
size),
527 static_cast<unsigned int>(stride),
528 static_cast<unsigned int>(batch_num),
530 static_cast<bool>(data_order));
536 template<
typename Numeric2T,
typename NumericT>
539 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
540 unsigned int glb_sz =gridDim.x * blockDim.x;
542 unsigned int double_size = size << 1;
544 const NumericT NUM_PI(3.14159265358979323846);
546 for (
unsigned int i = glb_id; i <
size; i += glb_sz)
548 unsigned int rm = i * i % (double_size);
549 NumericT angle = (NumericT)rm / size * (-NUM_PI);
557 out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
558 out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
562 template<
typename Numeric2T,
typename NumericT>
563 __global__
void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
564 unsigned int size,
unsigned int ext_size, NumericT
sign)
566 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
567 unsigned int glb_sz = gridDim.x * blockDim.x;
569 unsigned int double_size = size << 1;
572 const NumericT NUM_PI(3.14159265358979323846);
574 for (
unsigned int i = glb_id; i <
size; i += glb_sz)
576 unsigned int rm = i * i % (double_size);
577 NumericT angle = (NumericT)rm / size * NUM_PI;
590 A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
591 A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
596 B[ext_size - i] = b_i;
600 template<
typename NumericT>
601 __global__
void zero2(NumericT * input1, NumericT * input2,
unsigned int size)
603 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
620 template<
typename NumericT,
unsigned int AlignmentV>
633 zero2<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(A)),
634 reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(B)),
635 static_cast<unsigned int>(ext_size));
638 bluestein_pre<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
639 reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(A)),
640 reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(B)),
641 static_cast<unsigned int>(
size),
642 static_cast<unsigned int>(ext_size),
648 bluestein_post<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(Z)),
649 reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(out)),
650 static_cast<unsigned int>(
size),
// Kernel fragment: writes the element-wise product of input1 and input2 into output.
655 template<
typename NumericT>
657 const NumericT * input2,
// Each thread starts at its global thread index and advances by the total number of
// launched threads (gridDim.x * blockDim.x) until size is reached.
661 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
663 NumericT in1 = input1[i];
664 NumericT in2 = input2[i];
// The product is whatever operator* means for NumericT.
665 output[i] = in1 * in2;
672 template<
typename NumericT,
unsigned int AlignmentV>
681 fft_mult_vec<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(detail::cuda_arg<NumericT>(input1)),
682 reinterpret_cast<const numeric2_type *
>(detail::cuda_arg<NumericT>(input2)),
683 reinterpret_cast< numeric2_type *
>(detail::cuda_arg<NumericT>(output)),
684 static_cast<unsigned int>(
size));
688 template<
typename Numeric2T,
typename NumericT>
691 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
692 input1[i] = input1[i]/factor;
698 template<
typename NumericT,
unsigned int AlignmentV>
704 NumericT norm_factor =
static_cast<NumericT
>(
size);
705 fft_div_vec_scalar<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(input)),
706 static_cast<unsigned int>(
size),
// Kernel fragment: copies every element of input to its transposed position in output.
711 template<
typename NumericT>
714 unsigned int row_num,
715 unsigned int col_num)
// Total number of elements to move.
717 unsigned int size = row_num * col_num;
// Each thread starts at its global thread index and advances by the total number of
// launched threads (gridDim.x * blockDim.x).
718 for (
unsigned int i =blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
// Split the linear index as i = row * col_num + col.
720 unsigned int row = i / col_num;
721 unsigned int col = i - row*col_num;
// Destination index with the roles of row and col exchanged: col * row_num + row.
722 unsigned int new_pos = col * row_num +
row;
723 output[new_pos] = input[i];
730 template<
typename NumericT,
unsigned int AlignmentV>
736 transpose<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(detail::cuda_arg<NumericT>(input)),
737 reinterpret_cast< numeric2_type *
>(detail::cuda_arg<NumericT>(output)),
744 template<
typename NumericT>
747 unsigned int row_num,
748 unsigned int col_num)
750 unsigned int size = row_num * col_num;
751 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
753 unsigned int row = i / col_num;
754 unsigned int col = i - row*col_num;
755 unsigned int new_pos = col * row_num +
row;
758 NumericT val = input[i];
759 input[i] = input[new_pos];
760 input[new_pos] = val;
768 template<
typename NumericT,
unsigned int AlignmentV>
773 transpose_inplace<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(detail::cuda_arg<NumericT>(input)),
780 template<
typename RealT,
typename ComplexT>
783 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
795 template<
typename NumericT>
801 real_to_complex<<<128,128>>>(detail::cuda_arg<NumericT>(in),
802 reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(out)),
803 static_cast<unsigned int>(size));
807 template<
typename ComplexT,
typename RealT>
810 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
817 template<
typename NumericT>
823 complex_to_real<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(detail::cuda_arg<NumericT>(in)),
824 detail::cuda_arg<NumericT>(out),
825 static_cast<unsigned int>(size));
830 template<
typename NumericT>
833 for (uint i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i+=gridDim.x * blockDim.x)
835 NumericT val1 = vec[i];
836 NumericT val2 = vec[size - i - 1];
838 vec[size - i - 1] = val1;
845 template<
typename NumericT>
849 reverse_inplace<<<128,128>>>(detail::cuda_arg<NumericT>(in), static_cast<unsigned int>(size));
Helper class for checking whether a matrix has a row-major layout.
Implementation of the dense matrix class.
__global__ void bluestein_post(Numeric2T *Z, Numeric2T *out, unsigned int size, NumericT sign)
__global__ void real_to_complex(const RealT *in, ComplexT *out, unsigned int size)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
__global__ void fft_direct(const Numeric2T *input, Numeric2T *output, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
This file provides the forward declarations for the main types used within ViennaCL.
__global__ void fft_mult_vec(const NumericT *input1, const NumericT *input2, NumericT *output, unsigned int size)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
__host__ __device__ float2 operator+(float2 a, float2 b)
const vcl_size_t MAX_LOCAL_POINTS_NUM
vcl_size_t num_bits(vcl_size_t size)
__global__ void transpose(const NumericT *input, NumericT *output, unsigned int row_num, unsigned int col_num)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
void radix2(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 1D algorithm for computing Fourier transformation.
void bluestein(viennacl::vector< NumericT, AlignmentV > &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t)
Bluestein's algorithm for computing Fourier transformation.
__global__ void fft_div_vec_scalar(Numeric2T *input1, unsigned int size, NumericT factor)
__global__ void fft_reorder(NumericT *input, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, bool is_row_major)
__device__ float2 operator*(float2 in1, float2 in2)
__device__ float2 operator/(float2 a, SCALARTYPE b)
void normalize(viennacl::vector< NumericT, AlignmentV > &input)
Normalize the vector by its own size.
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
void convolve_i(viennacl::vector< SCALARTYPE, ALIGNMENT > &input1, viennacl::vector< SCALARTYPE, ALIGNMENT > &input2, viennacl::vector< SCALARTYPE, ALIGNMENT > &output)
__global__ void zero2(NumericT *input1, NumericT *input2, unsigned int size)
__global__ void fft_radix2(Numeric2T *input, unsigned int s, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
Common routines for CUDA execution.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
__device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
size_type size() const
Returns the length of the vector (cf. std::vector)
__global__ void bluestein_pre(Numeric2T *input, Numeric2T *A, Numeric2T *B, unsigned int size, unsigned int ext_size, NumericT sign)
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
__global__ void transpose_inplace(NumericT *input, unsigned int row_num, unsigned int col_num)
void reverse(viennacl::vector_base< NumericT > &in)
Reverse the vector into the opposite order and store the result in the input vector.
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
void multiply_complex(viennacl::vector< NumericT, AlignmentV > const &input1, viennacl::vector< NumericT, AlignmentV > const &input2, viennacl::vector< NumericT, AlignmentV > &output)
Multiply two complex vectors and store the result in output.
__global__ void complex_to_real(const ComplexT *in, RealT *out, unsigned int size)
__host__ __device__ float2 operator-(float2 a, float2 b)
void reorder(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
void direct(viennacl::vector< NumericT, AlignmentV > const &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Direct 1D algorithm for computing Fourier transformation.
vcl_size_t next_power_2(vcl_size_t n)
__global__ void fft_radix2_local(Numeric2T *input, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
__global__ void reverse_inplace(NumericT *vec, uint size)
Implementation of the ViennaCL scalar class.
ScalarType fft(std::vector< ScalarType > &in, std::vector< ScalarType > &out, unsigned int, unsigned int, unsigned int batch_size)
SCALARTYPE sign(SCALARTYPE val)
Implementations of NMF operations using a plain single-threaded or OpenMP-enabled execution on CPU...