GNU Radio 3.6.0 C++ API
volk_32f_x2_dot_prod_32f_a.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
00003 
00004 #include <volk/volk_common.h>
00005 #include<stdio.h>
00006 
00007 
00008 #ifdef LV_HAVE_GENERIC
00009 
00010 
/*
 * Scalar dot product of two float vectors.
 *
 * Accumulates input[i] * taps[i] for i = 0 .. num_points-1, in index order,
 * into a single-precision running sum and writes that sum to *result.
 *
 * result     - receives the accumulated sum (0 when num_points is 0)
 * input      - first operand vector; num_points elements are read
 * taps       - second operand vector; num_points elements are read
 * num_points - number of element pairs to multiply and accumulate
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float sum = 0;
  unsigned int i;

  for(i = 0; i < num_points; i++){
    sum += (input[i] * taps[i]);
  }

  *result = sum;
}
00024 
00025 #endif /*LV_HAVE_GENERIC*/
00026 
00027 
00028 #ifdef LV_HAVE_SSE
00029 
00030 
00031 static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
00032 
00033   unsigned int number = 0;
00034   const unsigned int quarterPoints = num_points / 4;
00035 
00036   float dotProduct = 0;
00037   const float* aPtr = input;
00038   const float* bPtr = taps;
00039 
00040   __m128 aVal, bVal, cVal;
00041 
00042   __m128 dotProdVal = _mm_setzero_ps();
00043 
00044   for(;number < quarterPoints; number++){
00045 
00046     aVal = _mm_load_ps(aPtr);
00047     bVal = _mm_load_ps(bPtr);
00048 
00049     cVal = _mm_mul_ps(aVal, bVal);
00050 
00051     dotProdVal = _mm_add_ps(cVal, dotProdVal);
00052 
00053     aPtr += 4;
00054     bPtr += 4;
00055   }
00056 
00057   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00058 
00059   _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
00060 
00061   dotProduct = dotProductVector[0];
00062   dotProduct += dotProductVector[1];
00063   dotProduct += dotProductVector[2];
00064   dotProduct += dotProductVector[3];
00065 
00066   number = quarterPoints * 4;
00067   for(;number < num_points; number++){
00068     dotProduct += ((*aPtr++) * (*bPtr++));
00069   }
00070 
00071   *result = dotProduct;
00072 
00073 }
00074 
00075 #endif /*LV_HAVE_SSE*/
00076 
00077 #ifdef LV_HAVE_SSE3
00078 
00079 #include <pmmintrin.h>
00080 
00081 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
00082   unsigned int number = 0;
00083   const unsigned int quarterPoints = num_points / 4;
00084 
00085   float dotProduct = 0;
00086   const float* aPtr = input;
00087   const float* bPtr = taps;
00088 
00089   __m128 aVal, bVal, cVal;
00090 
00091   __m128 dotProdVal = _mm_setzero_ps();
00092 
00093   for(;number < quarterPoints; number++){
00094 
00095     aVal = _mm_load_ps(aPtr);
00096     bVal = _mm_load_ps(bPtr);
00097 
00098     cVal = _mm_mul_ps(aVal, bVal);
00099 
00100     dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
00101 
00102     aPtr += 4;
00103     bPtr += 4;
00104   }
00105 
00106   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00107   dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
00108 
00109   _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
00110 
00111   dotProduct = dotProductVector[0];
00112   dotProduct += dotProductVector[1];
00113 
00114   number = quarterPoints * 4;
00115   for(;number < num_points; number++){
00116     dotProduct += ((*aPtr++) * (*bPtr++));
00117   }
00118 
00119   *result = dotProduct;
00120 }
00121 
00122 #endif /*LV_HAVE_SSE3*/
00123 
00124 #ifdef LV_HAVE_SSE4_1
00125 
00126 #include <smmintrin.h>
00127 
00128 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
00129   unsigned int number = 0;
00130   const unsigned int sixteenthPoints = num_points / 16;
00131 
00132   float dotProduct = 0;
00133   const float* aPtr = input;
00134   const float* bPtr = taps;
00135 
00136   __m128 aVal1, bVal1, cVal1;
00137   __m128 aVal2, bVal2, cVal2;
00138   __m128 aVal3, bVal3, cVal3;
00139   __m128 aVal4, bVal4, cVal4;
00140 
00141   __m128 dotProdVal = _mm_setzero_ps();
00142 
00143   for(;number < sixteenthPoints; number++){
00144 
00145     aVal1 = _mm_load_ps(aPtr); aPtr += 4;
00146     aVal2 = _mm_load_ps(aPtr); aPtr += 4;
00147     aVal3 = _mm_load_ps(aPtr); aPtr += 4;
00148     aVal4 = _mm_load_ps(aPtr); aPtr += 4;
00149 
00150     bVal1 = _mm_load_ps(bPtr); bPtr += 4;
00151     bVal2 = _mm_load_ps(bPtr); bPtr += 4;
00152     bVal3 = _mm_load_ps(bPtr); bPtr += 4;
00153     bVal4 = _mm_load_ps(bPtr); bPtr += 4;
00154 
00155     cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
00156     cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
00157     cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
00158     cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
00159 
00160     cVal1 = _mm_or_ps(cVal1, cVal2);
00161     cVal3 = _mm_or_ps(cVal3, cVal4);
00162     cVal1 = _mm_or_ps(cVal1, cVal3);
00163 
00164     dotProdVal = _mm_add_ps(dotProdVal, cVal1);
00165   }
00166 
00167   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
00168   _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
00169 
00170   dotProduct = dotProductVector[0];
00171   dotProduct += dotProductVector[1];
00172   dotProduct += dotProductVector[2];
00173   dotProduct += dotProductVector[3];
00174 
00175   number = sixteenthPoints * 16;
00176   for(;number < num_points; number++){
00177     dotProduct += ((*aPtr++) * (*bPtr++));
00178   }
00179 
00180   *result = dotProduct;
00181 }
00182 
00183 #endif /*LV_HAVE_SSE4_1*/
00184 
00185 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/