1 #ifndef VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_
2 #define VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_
28 #if defined VIENNACL_WITH_COMPLEX
33 #if defined VIENNACL_WITH_SSE3
34 #include <pmmintrin.h>
35 #elif defined VIENNACL_WITH_SSE2
36 #include <emmintrin.h>
124 return std::abs(x[0]);
126 T scaledSquareSum(1);
129 T absXi=std::abs(x[i]);
130 if (std::abs(x[i])>std::abs(scale)){
132 scaledSquareSum=T(1)+scaledSquareSum*temp*temp;
137 scaledSquareSum+=temp*temp;
141 return scale*sqrt(scaledSquareSum);
144 #if defined VIENNACL_WITH_COMPLEX
// Specialisation for std::complex<double>: hands back the complex conjugate
// of the argument (the unqualified conj call is resolved for the std::complex operand).
148 template<>
inline std::complex<double> conjIfComplex(std::complex<double> x)
{
  return conj(x);
}
// Specialisation for std::complex<float>: hands back the complex conjugate
// of the argument (the unqualified conj call is resolved for the std::complex operand).
149 template<>
inline std::complex<float> conjIfComplex(std::complex<float> x)
{
  return conj(x);
}
// _nrm2 overload for an array of std::complex<double>. Every visible return
// yields a real value, converted to a complex with zero imaginary part.
// The result is built as scale*sqrt(scaledSquareSum), i.e. the sum of squares
// is kept relative to a running scale instead of being accumulated directly.
// NOTE(review): the listing numbers skip lines here (braces, the loop header,
// the declaration/update of `scale`, the conditions choosing between the two
// update formulas), so the control flow around these statements is not shown.
153 inline std::complex<double>
_nrm2(
const std::complex<double>* x,
vcl_size_t n)
// Early exit returning complex zero (guard condition not shown; presumably n==0).
158 return std::complex<double>(0);
// Early exit returning |x[0]| (guard condition not shown; presumably n==1).
160 return std::abs(x[0]);
// Sum of squares expressed in units of scale^2.
162 double scaledSquareSum=1.0;
// Real part of x[i]; exact zeros contribute nothing and are skipped.
164 if (x[i].real()!=0.0){
165 double absXi=std::abs(x[i].real());
// Update form 1: rescale the accumulated sum by (scale/absXi)^2 and add 1 for this component.
167 double temp=scale/absXi;
168 scaledSquareSum=1.0+scaledSquareSum*temp*temp;
// Update form 2: add (absXi/scale)^2 to the accumulated sum.
172 double temp=absXi/scale;
173 scaledSquareSum+=temp*temp;
// Imaginary part of x[i], treated exactly like the real part above.
176 if (x[i].imag()!=0.0){
177 double absXi=std::abs(x[i].imag());
// Update form 1 (rescale, then add 1).
179 double temp=scale/absXi;
180 scaledSquareSum=1.0+scaledSquareSum*temp*temp;
// Update form 2 (add the squared ratio).
184 double temp=absXi/scale;
185 scaledSquareSum+=temp*temp;
// Undo the scaling and wrap the real result in a complex value.
189 return std::complex<double>(scale*sqrt(scaledSquareSum));
// _nrm2 overload for an array of std::complex<float>. Every visible return
// yields a real value, converted to a complex with zero imaginary part.
// The result is built as scale*sqrt(scaledSquareSum), i.e. the sum of squares
// is kept relative to a running scale instead of being accumulated directly.
// NOTE(review): the listing numbers skip lines here (braces, the loop header,
// the declaration/update of `scale`, the conditions choosing between the two
// update formulas), so the control flow around these statements is not shown.
193 inline std::complex<float>
_nrm2(
const std::complex<float>* x,
vcl_size_t n)
// Early exit returning complex zero (guard condition not shown; presumably n==0).
198 return std::complex<float>(0);
// Early exit returning |x[0]| (guard condition not shown; presumably n==1).
200 return std::abs(x[0]);
// Sum of squares expressed in units of scale^2.
202 float scaledSquareSum=1.0;
// Real part of x[i]; exact zeros contribute nothing and are skipped.
204 if (x[i].real()!=0.0){
205 float absXi=std::abs(x[i].real());
// Update form 1: rescale the accumulated sum by (scale/absXi)^2 and add 1 for this component.
207 float temp=scale/absXi;
208 scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
// Update form 2: add (absXi/scale)^2 to the accumulated sum.
212 float temp=absXi/scale;
213 scaledSquareSum+=temp*temp;
// Imaginary part of x[i], treated exactly like the real part above.
216 if (x[i].imag()!=0.0){
217 float absXi=std::abs(x[i].imag());
// Update form 1 (rescale, then add 1).
219 float temp=scale/absXi;
220 scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
// Update form 2 (add the squared ratio).
224 float temp=absXi/scale;
225 scaledSquareSum+=temp*temp;
// Undo the scaling and wrap the real result in a complex value.
229 return std::complex<float>(scale*sqrt(scaledSquareSum));
232 #endif //defined VIENNACL_WITH_COMPLEX
234 #if defined VIENNACL_WITH_SSE2
// _axpy specialisation for float built on SSE packed single-precision
// intrinsics. The visible statements compute y[k] = a*x[k] + y[k] for one
// group of eight consecutive floats (k = 0..7).
// NOTE(review): the loop, pointer advance, remainder handling and the
// declarations of prod/sum0/sum1 are on listing lines not shown here.
238 inline void _axpy<float>(
const float* x,
float* y,
vcl_size_t n,
float a)
260 __m128 reg0,reg1,reg2,reg3;
// Broadcast the scalar a into all four lanes.
261 __m128 areg=_mm_set1_ps(a);
// Load eight floats each from x and y. _mm_load_ps needs 16-byte aligned addresses.
268 reg0=_mm_load_ps(x+0);
269 reg1=_mm_load_ps(x+4);
270 reg2=_mm_load_ps(y+0);
271 reg3=_mm_load_ps(y+4);
// a*x + y, four lanes at a time.
274 prod=_mm_mul_ps(reg0,areg);
275 sum0=_mm_add_ps(prod,reg2);
276 prod=_mm_mul_ps(reg1,areg);
277 sum1=_mm_add_ps(prod,reg3);
// Write the eight results back into y (aligned stores).
280 _mm_store_ps(y+0,sum0);
281 _mm_store_ps(y+4,sum1);
// _axpy specialisation for double built on SSE2 packed double-precision
// intrinsics. The visible statements compute y[k] = a*x[k] + y[k] for one
// group of four consecutive doubles (k = 0..3).
// NOTE(review): the loop, pointer advance, remainder handling and the
// declarations of prod/sum0/sum1 are on listing lines not shown here.
296 inline void _axpy<double>(
const double* x,
double* y,
vcl_size_t n,
double a)
318 __m128d reg0,reg1,reg2,reg3;
// Broadcast the scalar a into both lanes.
319 __m128d areg=_mm_set1_pd(a);
// Load four doubles each from x and y. _mm_load_pd needs 16-byte aligned addresses.
326 reg0=_mm_load_pd(x+0);
327 reg1=_mm_load_pd(x+2);
328 reg2=_mm_load_pd(y+0);
329 reg3=_mm_load_pd(y+2);
// a*x + y, two lanes at a time.
332 prod=_mm_mul_pd(reg0,areg);
333 sum0=_mm_add_pd(prod,reg2);
334 prod=_mm_mul_pd(reg1,areg);
335 sum1=_mm_add_pd(prod,reg3);
// Write the four results back into y (aligned stores).
338 _mm_store_pd(y+0,sum0);
339 _mm_store_pd(y+2,sum1);
// _dot specialisation for float built on SSE packed single-precision
// intrinsics. Products of eight x/y pairs at a time are accumulated lane-wise
// in sumReg; the four lanes are added up at the end together with `sum`.
// NOTE(review): the loop, the scalar handling that fills `sum`, and the
// declarations of `sum` and `sums` are on listing lines not shown here.
354 inline float _dot<float>(
vcl_size_t n,
const float* x,
const float* y)
// Four running partial sums, one per lane, starting at zero.
380 __m128 sumReg=_mm_setzero_ps();
381 __m128 reg0,reg1,reg2,reg3;
// Load eight floats each from x and y. _mm_load_ps needs 16-byte aligned addresses.
387 reg0=_mm_load_ps(x+0);
388 reg1=_mm_load_ps(x+4);
389 reg2=_mm_load_ps(y+0);
390 reg3=_mm_load_ps(y+4);
// Element-wise products x[k]*y[k].
393 reg0=_mm_mul_ps(reg0,reg2);
394 reg1=_mm_mul_ps(reg1,reg3);
// Fold both product vectors into the lane-wise accumulator.
397 sumReg=_mm_add_ps(sumReg,reg0);
398 sumReg=_mm_add_ps(sumReg,reg1);
// Clear the low four address bits of `sums` and add 16: this is the next
// 16-byte boundary above `sums`, as required by the aligned store below.
// Presumably `sums` is a local buffer with enough spare room; its declaration is not shown.
411 float* pSums=(
float*)((((
vcl_size_t)sums)&(~15))+16);
412 _mm_store_ps(pSums,sumReg);
// Horizontal add of the four lanes plus the separately accumulated `sum`.
414 return sum+pSums[0]+pSums[1]+pSums[2]+pSums[3];
// _dot for double built on SSE2 packed double-precision intrinsics. Products
// of four x/y pairs at a time go into two independent lane-wise accumulators
// (sum0, sum1) which are merged and reduced at the end together with `sum`.
// NOTE(review): the loop, the scalar handling that fills `sum`, and the
// declarations of `sum` and `sums` are on listing lines not shown here.
420 inline double _dot(
vcl_size_t n,
const double* x,
const double* y)
// Two accumulators of two lanes each, starting at zero.
444 __m128d sum0=_mm_setzero_pd();
445 __m128d sum1=_mm_setzero_pd();
446 __m128d reg0,reg1,reg2,reg3;
// Load four doubles each from x and y. _mm_load_pd needs 16-byte aligned addresses.
452 reg0=_mm_load_pd(x+0);
453 reg1=_mm_load_pd(x+2);
454 reg2=_mm_load_pd(y+0);
455 reg3=_mm_load_pd(y+2);
// Element-wise products x[k]*y[k].
458 reg0=_mm_mul_pd(reg0,reg2);
459 reg1=_mm_mul_pd(reg1,reg3);
// Each product vector feeds its own accumulator.
462 sum0=_mm_add_pd(sum0,reg0);
463 sum1=_mm_add_pd(sum1,reg1);
// Clear the low four address bits of `sums` and add 16: this is the next
// 16-byte boundary above `sums`, as required by the aligned store below.
// Presumably `sums` is a local buffer with enough spare room; its declaration is not shown.
476 double* pSums=(
double*)((((
vcl_size_t)sums)&(~15))+16);
// Merge the two accumulators, then spill the two lanes to memory.
477 sum0=_mm_add_pd(sum0,sum1);
478 _mm_store_pd(pSums,sum0);
// Horizontal add of the two lanes plus the separately accumulated `sum`.
480 return sum+pSums[0]+pSums[1];
// Conjugated dot product, float specialisation: simply forwards to _dot with
// the same arguments and returns its result.
485 template<>
inline float _dotc<float>(vcl_size_t n, const float *x, const float *y)
{
  return _dot(n, x, y);
}
// Conjugated dot product, double specialisation: simply forwards to _dot with
// the same arguments and returns its result.
486 template<>
inline double _dotc<double>(vcl_size_t n, const double *x, const double *y)
{
  return _dot(n, x, y);
}
488 #if defined VIENNACL_WITH_COMPLEX
// _axpy specialisation for std::complex<float>. The visible statements compute
// y[k] = a*x[k] + y[k] for four complex numbers (two per __m128, stored as
// re,im,re,im), using (xr+i*xi)*(ar+i*ai) = (xr*ar - xi*ai) + i*(xr*ai + xi*ar).
// Two variants of the arithmetic are visible: one using only shuffles and a
// sign mask, and one using the SSE3 moveldup/movehdup/addsub instructions.
// NOTE(review): the #else/#endif lines separating the two variants, the loop
// and the remainder handling are on listing lines not shown here.
492 inline void _axpy<std::complex<float> >(
const std::complex<float>* x, std::complex<float>* y,
vcl_size_t n, std::complex<float> a)
512 __m128 reg0,reg1,reg2,reg3,reg4;
// _mm_set_ps lists lanes from highest to lowest, so in memory order:
// areg0 = (ar, ai, ar, ai) and areg1 = (ai, ar, ai, ar).
513 __m128 areg0=_mm_set_ps(a.imag(),a.real(),a.imag(),a.real());
514 __m128 areg1=_mm_set_ps(a.real(),a.imag(),a.real(),a.imag());
// Without SSE3 there is no addsub, so a sign mask (-1,+1,-1,+1 in memory order) is used instead.
515 #ifndef VIENNACL_WITH_SSE3
516 __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
// Load two complex values per register from x and y (aligned loads).
523 reg0=_mm_load_ps((
float*)(x+0));
524 reg1=_mm_load_ps((
float*)(x+2));
525 reg2=_mm_load_ps((
float*)(y+0));
526 reg3=_mm_load_ps((
float*)(y+2));
// Variant without SSE3.
529 #ifndef VIENNACL_WITH_SSE3
// 0xA0 selects lanes 0,0,2,2 (real parts duplicated); 0xF5 selects 1,1,3,3 (imaginary parts duplicated).
530 reg4=_mm_shuffle_ps(reg0,reg0,0xA0);
531 reg0=_mm_shuffle_ps(reg0,reg0,0xF5);
// reg4 = (xr*ar, xr*ai); reg0 = (xi*ai, xi*ar), then sign-flipped to (-xi*ai, +xi*ar).
532 reg4=_mm_mul_ps(reg4,areg0);
533 reg0=_mm_mul_ps(reg0,areg1);
534 reg0=_mm_mul_ps(reg0,nreg);
// a*x, then + y.
535 reg0=_mm_add_ps(reg4,reg0);
536 reg0=_mm_add_ps(reg0,reg2);
// Same sequence for the second pair of complex numbers.
537 reg4=_mm_shuffle_ps(reg1,reg1,0xA0);
538 reg1=_mm_shuffle_ps(reg1,reg1,0xF5);
539 reg4=_mm_mul_ps(reg4,areg0);
540 reg1=_mm_mul_ps(reg1,areg1);
541 reg1=_mm_mul_ps(reg1,nreg);
542 reg1=_mm_add_ps(reg4,reg1);
543 reg1=_mm_add_ps(reg1,reg3);
// Variant using SSE3 instructions (presumably the #else branch; that line is not shown).
// moveldup duplicates the even lanes (real parts), movehdup the odd lanes (imaginary parts).
545 reg4=_mm_moveldup_ps(reg0);
546 reg0=_mm_movehdup_ps(reg0);
547 reg4=_mm_mul_ps(reg4,areg0);
548 reg0=_mm_mul_ps(reg0,areg1);
// addsub subtracts in even lanes and adds in odd lanes: (xr*ar - xi*ai, xr*ai + xi*ar).
549 reg0=_mm_addsub_ps(reg4,reg0);
550 reg0=_mm_add_ps(reg0,reg2);
// Same sequence for the second pair of complex numbers.
551 reg4=_mm_moveldup_ps(reg1);
552 reg1=_mm_movehdup_ps(reg1);
553 reg4=_mm_mul_ps(reg4,areg0);
554 reg1=_mm_mul_ps(reg1,areg1);
555 reg1=_mm_addsub_ps(reg4,reg1);
556 reg1=_mm_add_ps(reg1,reg3);
// Write the four updated complex values back into y (aligned stores).
559 _mm_store_ps((
float*)(y+0),reg0);
560 _mm_store_ps((
float*)(y+2),reg1);
// _axpy specialisation for std::complex<double>. The visible statements compute
// y[k] = a*x[k] + y[k] for two complex numbers (one per __m128d, stored as
// re,im), using (xr+i*xi)*(ar+i*ai) = (xr*ar - xi*ai) + i*(xr*ai + xi*ar).
// Two variants of the arithmetic are visible: one using a sign mask and one
// using the SSE3 addsub instruction.
// NOTE(review): the #else/#endif lines separating the two variants, the loop
// and the remainder handling are on listing lines not shown here.
575 inline void _axpy<std::complex<double> >(
const std::complex<double>* x, std::complex<double>* y,
vcl_size_t n, std::complex<double> a)
584 __m128d reg0,reg1,reg2,reg3,reg4;
// _mm_set_pd takes (high, low), so in memory order: areg0 = (ar, ai), areg1 = (ai, ar).
585 __m128d areg0=_mm_set_pd(a.imag(),a.real());
586 __m128d areg1=_mm_set_pd(a.real(),a.imag());
// Without SSE3 there is no addsub, so a sign mask (-1,+1 in memory order) is used instead.
587 #ifndef VIENNACL_WITH_SSE3
588 __m128d nreg=_mm_set_pd(1.0,-1.0);
// Load one complex value per register from x and y (aligned loads).
595 reg0=_mm_load_pd((
double*)(x+0));
596 reg1=_mm_load_pd((
double*)(x+1));
597 reg2=_mm_load_pd((
double*)(y+0));
598 reg3=_mm_load_pd((
double*)(y+1));
// Variant without SSE3.
601 #ifndef VIENNACL_WITH_SSE3
// 0x0 duplicates lane 0 (real part); 0x3 duplicates lane 1 (imaginary part).
602 reg4=_mm_shuffle_pd(reg0,reg0,0x0);
603 reg0=_mm_shuffle_pd(reg0,reg0,0x3);
// reg4 = (xr*ar, xr*ai); reg0 = (xi*ai, xi*ar), then sign-flipped to (-xi*ai, +xi*ar).
604 reg4=_mm_mul_pd(reg4,areg0);
605 reg0=_mm_mul_pd(reg0,areg1);
606 reg0=_mm_mul_pd(reg0,nreg);
// a*x, then + y.
607 reg0=_mm_add_pd(reg4,reg0);
608 reg0=_mm_add_pd(reg0,reg2);
// Same sequence for the second complex number.
609 reg4=_mm_shuffle_pd(reg1,reg1,0x0);
610 reg1=_mm_shuffle_pd(reg1,reg1,0x3);
611 reg4=_mm_mul_pd(reg4,areg0);
612 reg1=_mm_mul_pd(reg1,areg1);
613 reg1=_mm_mul_pd(reg1,nreg);
614 reg1=_mm_add_pd(reg4,reg1);
615 reg1=_mm_add_pd(reg1,reg3);
// Variant using SSE3 addsub (presumably the #else branch; that line is not shown).
617 reg4=_mm_shuffle_pd(reg0,reg0,0x0);
618 reg0=_mm_shuffle_pd(reg0,reg0,0x3);
619 reg4=_mm_mul_pd(reg4,areg0);
620 reg0=_mm_mul_pd(reg0,areg1);
// addsub subtracts in lane 0 and adds in lane 1: (xr*ar - xi*ai, xr*ai + xi*ar).
621 reg0=_mm_addsub_pd(reg4,reg0);
622 reg0=_mm_add_pd(reg0,reg2);
// Same sequence for the second complex number.
623 reg4=_mm_shuffle_pd(reg1,reg1,0x0);
624 reg1=_mm_shuffle_pd(reg1,reg1,0x3);
625 reg4=_mm_mul_pd(reg4,areg0);
626 reg1=_mm_mul_pd(reg1,areg1);
627 reg1=_mm_addsub_pd(reg4,reg1);
628 reg1=_mm_add_pd(reg1,reg3);
// Write the two updated complex values back into y (aligned stores).
631 _mm_store_pd((
double*)(y+0),reg0);
632 _mm_store_pd((
double*)(y+1),reg1);
// _dot specialisation for std::complex<float> (unconjugated: sums x[k]*y[k]).
// Four complex products per step are accumulated in sumReg, which holds two
// partial complex sums (lanes 0-1 and lanes 2-3, each as re,im).
// Product formula: (xr*yr - xi*yi) + i*(xi*yr + xr*yi).
// Two variants of the arithmetic are visible: one using a sign mask and one
// using the SSE3 moveldup/movehdup/addsub instructions.
// NOTE(review): the #else/#endif lines, the loop and the scalar paths that
// use the two `sum` declarations are on listing lines not shown here.
647 inline std::complex<float> _dot<std::complex<float> >(
vcl_size_t n,
const std::complex<float>* x,
const std::complex<float>* y)
// Scalar accumulator; declared twice in this excerpt, presumably in separate scopes.
652 std::complex<float>
sum(0);
660 std::complex<float>
sum(0);
671 __m128 sumReg=_mm_setzero_ps();
672 __m128 reg0,reg1,reg2,reg3,reg4;
// Sign mask (-1,+1,-1,+1 in memory order) replacing addsub when SSE3 is unavailable.
673 #ifndef VIENNACL_WITH_SSE3
674 __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
// Load two complex values per register from x and y (aligned loads).
681 reg0=_mm_load_ps((
float*)(x+0));
682 reg1=_mm_load_ps((
float*)(x+2));
683 reg2=_mm_load_ps((
float*)(y+0));
684 reg3=_mm_load_ps((
float*)(y+2));
// Variant without SSE3.
687 #ifndef VIENNACL_WITH_SSE3
// reg4 = y real parts duplicated (0xA0), reg2 = y imaginary parts duplicated (0xF5).
688 reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
689 reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
// reg4 = (yr*xr, yr*xi); reg2 = (yi*xr, yi*xi).
690 reg4=_mm_mul_ps(reg4,reg0);
691 reg2=_mm_mul_ps(reg2,reg0);
// 0xB1 swaps neighbours: reg2 = (yi*xi, yi*xr), then sign-flipped to (-yi*xi, +yi*xr).
692 reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
693 reg2=_mm_mul_ps(reg2,nreg);
// reg0 = x*y for the first pair of complex numbers.
694 reg0=_mm_add_ps(reg4,reg2);
// Same sequence for the second pair.
695 reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
696 reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
697 reg4=_mm_mul_ps(reg4,reg1);
698 reg3=_mm_mul_ps(reg3,reg1);
699 reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
700 reg3=_mm_mul_ps(reg3,nreg);
701 reg1=_mm_add_ps(reg4,reg3);
// Variant using SSE3 instructions (presumably the #else branch; that line is not shown).
703 reg4=_mm_moveldup_ps(reg2);
704 reg2=_mm_movehdup_ps(reg2);
705 reg4=_mm_mul_ps(reg4,reg0);
706 reg2=_mm_mul_ps(reg2,reg0);
707 reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
// addsub: even lanes reg4-reg2, odd lanes reg4+reg2, giving (xr*yr - xi*yi, xi*yr + xr*yi).
708 reg0=_mm_addsub_ps(reg4,reg2);
// Same sequence for the second pair.
709 reg4=_mm_moveldup_ps(reg3);
710 reg3=_mm_movehdup_ps(reg3);
711 reg4=_mm_mul_ps(reg4,reg1);
712 reg3=_mm_mul_ps(reg3,reg1);
713 reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
714 reg1=_mm_addsub_ps(reg4,reg3);
// Accumulate both product registers.
718 sumReg=_mm_add_ps(sumReg,reg0);
719 sumReg=_mm_add_ps(sumReg,reg1);
// 32-byte scratch buffer; the pointer below is moved to the next 16-byte
// boundary above `sums`, so the 16-byte aligned store always fits inside it.
731 std::complex<float> sums[4];
732 std::complex<float>* pSums=(std::complex<float>*)((((
vcl_size_t)sums)&(~15))+16);
// Both slots are zeroed and then overwritten by the store of sumReg.
733 pSums[0]=std::complex<float>(0);
734 pSums[1]=std::complex<float>(0);
735 _mm_store_ps((
float*)pSums,sumReg);
// Combine the two partial complex sums with the scalar accumulator.
737 return sum+pSums[0]+pSums[1];
// _dot specialisation for std::complex<double> (unconjugated: sums x[k]*y[k]).
// Two complex products per step are accumulated in sumReg, which holds one
// complex partial sum as (re, im).
// Product formula: (xr*yr - xi*yi) + i*(xi*yr + xr*yi).
// Two variants of the arithmetic are visible: one using a sign mask and one
// using the SSE3 addsub instruction.
// NOTE(review): the #else/#endif lines, the loop, the scalar paths that use
// the two `sum` declarations and the final return are on listing lines not shown here.
743 inline std::complex<double> _dot<std::complex<double> >(
vcl_size_t n,
const std::complex<double>* x,
const std::complex<double>* y)
// Scalar accumulator (declared again further down, presumably in a separate scope).
748 std::complex<double>
sum(0);
755 __m128d sumReg=_mm_setzero_pd();
756 __m128d reg0,reg1,reg2,reg3,reg4;
// Sign mask (-1,+1 in memory order) replacing addsub when SSE3 is unavailable.
757 #ifndef VIENNACL_WITH_SSE3
758 __m128d nreg=_mm_set_pd(1.0,-1.0);
// Load one complex value per register from x and y (aligned loads).
765 reg0=_mm_load_pd((
double*)(x+0));
766 reg1=_mm_load_pd((
double*)(x+1));
767 reg2=_mm_load_pd((
double*)(y+0));
768 reg3=_mm_load_pd((
double*)(y+1));
// Variant without SSE3.
771 #ifndef VIENNACL_WITH_SSE3
// reg4 = (yr, yr), reg2 = (yi, yi).
772 reg4=_mm_shuffle_pd(reg2,reg2,0x0);
773 reg2=_mm_shuffle_pd(reg2,reg2,0x3);
// reg4 = (yr*xr, yr*xi); reg2 = (yi*xr, yi*xi).
774 reg4=_mm_mul_pd(reg4,reg0);
775 reg2=_mm_mul_pd(reg2,reg0);
// 0x1 swaps the lanes: reg2 = (yi*xi, yi*xr), then sign-flipped to (-yi*xi, +yi*xr).
776 reg2=_mm_shuffle_pd(reg2,reg2,0x1);
777 reg2=_mm_mul_pd(reg2,nreg);
// reg0 = x*y for the first complex number.
778 reg0=_mm_add_pd(reg4,reg2);
// Same sequence for the second complex number.
779 reg4=_mm_shuffle_pd(reg3,reg3,0x0);
780 reg3=_mm_shuffle_pd(reg3,reg3,0x3);
781 reg4=_mm_mul_pd(reg4,reg1);
782 reg3=_mm_mul_pd(reg3,reg1);
783 reg3=_mm_shuffle_pd(reg3,reg3,0x1);
784 reg3=_mm_mul_pd(reg3,nreg);
785 reg1=_mm_add_pd(reg4,reg3);
// Variant using SSE3 addsub (presumably the #else branch; that line is not shown).
787 reg4=_mm_shuffle_pd(reg2,reg2,0x0);
788 reg2=_mm_shuffle_pd(reg2,reg2,0x3);
789 reg4=_mm_mul_pd(reg4,reg0);
790 reg2=_mm_mul_pd(reg2,reg0);
791 reg2=_mm_shuffle_pd(reg2,reg2,0x1);
// addsub: lane 0 reg4-reg2, lane 1 reg4+reg2, giving (xr*yr - xi*yi, xi*yr + xr*yi).
792 reg0=_mm_addsub_pd(reg4,reg2);
// Same sequence for the second complex number.
793 reg4=_mm_shuffle_pd(reg3,reg3,0x0);
794 reg3=_mm_shuffle_pd(reg3,reg3,0x3);
795 reg4=_mm_mul_pd(reg4,reg1);
796 reg3=_mm_mul_pd(reg3,reg1);
797 reg3=_mm_shuffle_pd(reg3,reg3,0x1);
798 reg1=_mm_addsub_pd(reg4,reg3);
// Accumulate both products.
802 sumReg=_mm_add_pd(sumReg,reg0);
803 sumReg=_mm_add_pd(sumReg,reg1);
811 std::complex<double>
sum(0);
// 32-byte scratch buffer; the pointer below is moved to the next 16-byte
// boundary above `sums`, so the 16-byte aligned store always fits inside it.
816 std::complex<double> sums[2];
817 std::complex<double>* pSums=(std::complex<double>*)((((
vcl_size_t)sums)&(~15))+16);
// The slot is zeroed and then overwritten by the store of sumReg.
818 pSums[0]=std::complex<double>(0);
819 _mm_store_pd((
double*)pSums,sumReg);
// _dotc specialisation for std::complex<float>: sums conj(x[k])*y[k], as the
// visible scalar statements spell out.
// conj(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr).
// The vector code keeps each partial result with its parts swapped
// (imaginary part in the even lane, real part in the odd lane) and swaps them
// back once, just before the final store.
// Two variants of the arithmetic are visible: one using a sign mask and one
// using the SSE3 moveldup/movehdup/addsub instructions.
// NOTE(review): the #else/#endif lines, the loop headers and the conditions
// selecting the scalar paths are on listing lines not shown here.
827 inline std::complex<float> _dotc<std::complex<float> >(
vcl_size_t n,
const std::complex<float>* x,
const std::complex<float>* y)
// Scalar accumulation over index i (loop header not shown).
832 std::complex<float>
sum(0);
834 sum+=conj(x[i])*y[i];
// Second declaration of the scalar accumulator, presumably in a separate scope;
// a single leading element is handled in scalar form (condition not shown).
840 std::complex<float>
sum(0);
845 sum+=conj(x[0])*y[0];
851 __m128 sumReg=_mm_setzero_ps();
852 __m128 reg0,reg1,reg2,reg3,reg4;
// Sign mask (-1,+1,-1,+1 in memory order) replacing addsub when SSE3 is unavailable.
853 #ifndef VIENNACL_WITH_SSE3
854 __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
// Load two complex values per register from x and y (aligned loads).
861 reg0=_mm_load_ps((
float*)(x+0));
862 reg1=_mm_load_ps((
float*)(x+2));
863 reg2=_mm_load_ps((
float*)(y+0));
864 reg3=_mm_load_ps((
float*)(y+2));
// Variant without SSE3.
867 #ifndef VIENNACL_WITH_SSE3
// reg4 = y real parts duplicated (0xA0), reg2 = y imaginary parts duplicated (0xF5).
868 reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
869 reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
// reg4 = (yr*xr, yr*xi); reg2 = (yi*xr, yi*xi).
870 reg4=_mm_mul_ps(reg4,reg0);
871 reg2=_mm_mul_ps(reg2,reg0);
// Swap neighbours in reg4 and flip the even-lane sign: (-yr*xi, +yr*xr).
872 reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
873 reg4=_mm_mul_ps(reg4,nreg);
// reg0 = (yi*xr - yr*xi, yr*xr + yi*xi) = (imag, real) of conj(x)*y.
874 reg0=_mm_add_ps(reg4,reg2);
// Same sequence for the second pair of complex numbers.
875 reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
876 reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
877 reg4=_mm_mul_ps(reg4,reg1);
878 reg3=_mm_mul_ps(reg3,reg1);
879 reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
880 reg4=_mm_mul_ps(reg4,nreg);
881 reg1=_mm_add_ps(reg4,reg3);
// Variant using SSE3 instructions (presumably the #else branch; that line is not shown).
883 reg4=_mm_moveldup_ps(reg2);
884 reg2=_mm_movehdup_ps(reg2);
885 reg4=_mm_mul_ps(reg4,reg0);
886 reg2=_mm_mul_ps(reg2,reg0);
887 reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
// addsub: even lanes reg2-reg4, odd lanes reg2+reg4, again (imag, real) of conj(x)*y.
888 reg0=_mm_addsub_ps(reg2,reg4);
// Same sequence for the second pair of complex numbers.
889 reg4=_mm_moveldup_ps(reg3);
890 reg3=_mm_movehdup_ps(reg3);
891 reg4=_mm_mul_ps(reg4,reg1);
892 reg3=_mm_mul_ps(reg3,reg1);
893 reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
894 reg1=_mm_addsub_ps(reg3,reg4);
// Accumulate both product registers (still in swapped imag/real order).
898 sumReg=_mm_add_ps(sumReg,reg0);
899 sumReg=_mm_add_ps(sumReg,reg1);
// Scalar accumulation for elements handled outside the vector code (loop header not shown).
908 sum+=conj(x[i])*y[i];
// 32-byte scratch buffer; the pointer below is moved to the next 16-byte
// boundary above `sums`, so the 16-byte aligned store always fits inside it.
911 std::complex<float> sums[4];
912 std::complex<float>* pSums=(std::complex<float>*)((((
vcl_size_t)sums)&(~15))+16);
// Swap neighbours once so every pair is back in (real, imag) order before storing.
913 sumReg=_mm_shuffle_ps(sumReg,sumReg,0xB1);
914 _mm_store_ps((
float*)pSums,sumReg);
// Combine the two partial complex sums with the scalar accumulator.
916 return sum+pSums[0]+pSums[1];
// _dotc specialisation for std::complex<double>: sums conj(x[k])*y[k], as the
// visible scalar statement spells out.
// conj(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr).
// The vector code keeps the partial result with its parts swapped (imaginary
// part in lane 0, real part in lane 1) and swaps them back once, just before
// the final store.
// Two variants of the arithmetic are visible: one using a sign mask and one
// using the SSE3 addsub instruction.
// NOTE(review): the #else/#endif lines, the loop headers and the conditions
// selecting the scalar path are on listing lines not shown here.
922 inline std::complex<double> _dotc<std::complex<double> >(
vcl_size_t n,
const std::complex<double>* x,
const std::complex<double>* y)
// Scalar accumulation over index i (loop header not shown).
927 std::complex<double>
sum(0);
929 sum+=conj(x[i])*y[i];
934 __m128d sumReg=_mm_setzero_pd();
935 __m128d reg0,reg1,reg2,reg3,reg4;
// Sign mask (-1,+1 in memory order) replacing addsub when SSE3 is unavailable.
936 #ifndef VIENNACL_WITH_SSE3
937 __m128d nreg=_mm_set_pd(1.0,-1.0);
// Load one complex value per register from x and y (aligned loads).
944 reg0=_mm_load_pd((
double*)(x+0));
945 reg1=_mm_load_pd((
double*)(x+1));
946 reg2=_mm_load_pd((
double*)(y+0));
947 reg3=_mm_load_pd((
double*)(y+1));
// Variant without SSE3.
950 #ifndef VIENNACL_WITH_SSE3
// reg4 = (yr, yr), reg2 = (yi, yi).
951 reg4=_mm_shuffle_pd(reg2,reg2,0x0);
952 reg2=_mm_shuffle_pd(reg2,reg2,0x3);
// reg4 = (yr*xr, yr*xi); reg2 = (yi*xr, yi*xi).
953 reg4=_mm_mul_pd(reg4,reg0);
954 reg2=_mm_mul_pd(reg2,reg0);
// Swap the lanes of reg4 and flip the lane-0 sign: (-yr*xi, +yr*xr).
955 reg4=_mm_shuffle_pd(reg4,reg4,0x1);
956 reg4=_mm_mul_pd(reg4,nreg);
// reg0 = (yi*xr - yr*xi, yr*xr + yi*xi) = (imag, real) of conj(x)*y.
957 reg0=_mm_add_pd(reg4,reg2);
// Same sequence for the second complex number.
958 reg4=_mm_shuffle_pd(reg3,reg3,0x0);
959 reg3=_mm_shuffle_pd(reg3,reg3,0x3);
960 reg4=_mm_mul_pd(reg4,reg1);
961 reg3=_mm_mul_pd(reg3,reg1);
962 reg4=_mm_shuffle_pd(reg4,reg4,0x1);
963 reg4=_mm_mul_pd(reg4,nreg);
964 reg1=_mm_add_pd(reg4,reg3);
// Variant using SSE3 addsub (presumably the #else branch; that line is not shown).
966 reg4=_mm_shuffle_pd(reg2,reg2,0x0);
967 reg2=_mm_shuffle_pd(reg2,reg2,0x3);
968 reg4=_mm_mul_pd(reg4,reg0);
969 reg2=_mm_mul_pd(reg2,reg0);
970 reg4=_mm_shuffle_pd(reg4,reg4,0x1);
// addsub: lane 0 reg2-reg4, lane 1 reg2+reg4, again (imag, real) of conj(x)*y.
971 reg0=_mm_addsub_pd(reg2,reg4);
// Same sequence for the second complex number.
972 reg4=_mm_shuffle_pd(reg3,reg3,0x0);
973 reg3=_mm_shuffle_pd(reg3,reg3,0x3);
974 reg4=_mm_mul_pd(reg4,reg1);
975 reg3=_mm_mul_pd(reg3,reg1);
976 reg4=_mm_shuffle_pd(reg4,reg4,0x1);
977 reg1=_mm_addsub_pd(reg3,reg4);
// Accumulate both products (still in swapped imag/real order).
982 sumReg=_mm_add_pd(sumReg,reg0);
983 sumReg=_mm_add_pd(sumReg,reg1);
// Second declaration of the scalar accumulator, presumably in a separate scope.
991 std::complex<double>
sum(0);
// 32-byte scratch buffer; the pointer below is moved to the next 16-byte
// boundary above `sums`, so the 16-byte aligned store always fits inside it.
996 std::complex<double> sums[2];
997 std::complex<double>* pSums=(std::complex<double>*)((((
vcl_size_t)sums)&(~15))+16);
// Swap the lanes once so the value is back in (real, imag) order before storing.
998 sumReg=_mm_shuffle_pd(sumReg,sumReg,0x1);
999 _mm_store_pd((
double*)pSums,sumReg);
// Combine the vector partial sum with the scalar accumulator.
1001 return sum+pSums[0];
1005 #endif //defined VIENNACL_WITH_COMPLEX
1007 #endif //defined VIENNACL_WITH_SSE2
void _axpy(const T *, T *, vcl_size_t, T)
T _dot(vcl_size_t, const T *, const T *)
statement sum(scalar< NumericT > const *s, vector_base< NumericT > const *x)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
void _swap(vcl_size_t, T *, T *)
T _nrm2(const T *, vcl_size_t)
T _dotc(vcl_size_t, const T *, const T *)
void _copy(vcl_size_t, T *, T *)