GNU Radio 3.6.2git-128-gdbc7a4c0 C++ API
volk_32f_x2_dot_prod_32f_u.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
00003 
00004 #include <volk/volk_common.h>
00005 #include<stdio.h>
00006 
00007 
00008 #ifdef LV_HAVE_GENERIC
00009 
00010 
/*
 * Portable scalar dot product.
 *
 * Multiplies input[i] by taps[i] for i in [0, num_points), accumulates the
 * products in a single float in index order, and writes the sum to *result.
 * When num_points is 0 the value written is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float acc = 0;
  unsigned int i;

  for (i = 0; i < num_points; ++i) {
    acc += input[i] * taps[i];
  }

  *result = acc;
}
00024 
00025 #endif /*LV_HAVE_GENERIC*/
00026 
00027 
00028 #ifdef LV_HAVE_SSE
00029 
00030 
/*
 * SSE dot product of two float vectors.
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input/taps - source vectors; read with unaligned loads, so neither pointer
 *              needs to be 16-byte aligned
 * num_points - number of elements in each vector (0 yields a result of 0)
 *
 * The bulk of the data is processed 16 floats per iteration using four
 * independent 4-lane accumulators; the remaining (num_points % 16) elements
 * are handled one at a time by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {

  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(;number < sixteenthPoints; number++){

    // Unaligned loads: _mm_load_ps would fault on buffers that are not
    // 16-byte aligned.
    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr+4);
    a2Val = _mm_loadu_ps(aPtr+8);
    a3Val = _mm_loadu_ps(aPtr+12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr+4);
    b2Val = _mm_loadu_ps(bPtr+8);
    b3Val = _mm_loadu_ps(bPtr+12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    aPtr += 16;
    bPtr += 16;
  }

  // Fold the four partial accumulators into one vector.
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  // Unaligned store, so the scratch buffer carries no alignment requirement.
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  // Scalar tail: exactly one product per remaining element. Consuming more
  // than one element per iteration here would read past the end of the inputs.
  number = sixteenthPoints*16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;

}
00098 
00099 #endif /*LV_HAVE_SSE*/
00100 
00101 #ifdef LV_HAVE_SSE3
00102 
00103 #include <pmmintrin.h>
00104 
/*
 * Dot product of two float vectors (SSE3 build variant).
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input/taps - source vectors; read with unaligned loads, so neither pointer
 *              needs to be 16-byte aligned
 * num_points - number of elements in each vector (0 yields a result of 0)
 *
 * Sixteen floats are processed per main-loop iteration using four independent
 * 4-lane accumulators; the remaining (num_points % 16) elements are handled
 * by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(;number < sixteenthPoints; number++){

    // Unaligned loads: _mm_load_ps would fault on buffers that are not
    // 16-byte aligned.
    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr+4);
    a2Val = _mm_loadu_ps(aPtr+8);
    a3Val = _mm_loadu_ps(aPtr+12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr+4);
    b2Val = _mm_loadu_ps(bPtr+8);
    b3Val = _mm_loadu_ps(bPtr+12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

    aPtr += 16;
    bPtr += 16;
  }

  // Fold the four partial accumulators into one vector.
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  // Unaligned store, so the scratch buffer carries no alignment requirement.
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  // Scalar tail for the elements that do not fill a 16-float block.
  number = sixteenthPoints*16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
00166 
00167 #endif /*LV_HAVE_SSE3*/
00168 
00169 #ifdef LV_HAVE_SSE4_1
00170 
00171 #include <smmintrin.h>
00172 
/*
 * SSE4.1 dot product of two float vectors, built on _mm_dp_ps.
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input/taps - source vectors; read with unaligned loads, so neither pointer
 *              needs to be 16-byte aligned
 * num_points - number of elements in each vector (0 yields a result of 0)
 *
 * Each main-loop iteration consumes 16 floats as four groups of four. The
 * remaining (num_points % 16) elements are handled by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal1, bVal1, cVal1;
  __m128 aVal2, bVal2, cVal2;
  __m128 aVal3, bVal3, cVal3;
  __m128 aVal4, bVal4, cVal4;

  __m128 dotProdVal = _mm_setzero_ps();

  for(;number < sixteenthPoints; number++){

    // Unaligned loads: _mm_load_ps would fault on buffers that are not
    // 16-byte aligned.
    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;

    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;

    // High nibble 0xF: all four lane products take part in the sum.
    // Low nibble: the single output lane that receives the sum; the other
    // lanes are written as zero. Each group targets a different lane.
    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

    // The four results occupy disjoint lanes (all others are zero bits), so a
    // bitwise OR packs them into one vector.
    cVal1 = _mm_or_ps(cVal1, cVal2);
    cVal3 = _mm_or_ps(cVal3, cVal4);
    cVal1 = _mm_or_ps(cVal1, cVal3);

    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
  }

  // Unaligned store, so the scratch buffer carries no alignment requirement.
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  // Scalar tail for the elements that do not fill a 16-float block.
  number = sixteenthPoints * 16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
00227 
00228 #endif /*LV_HAVE_SSE4_1*/
00229 
00230 #ifdef LV_HAVE_AVX
00231 
00232 #include <immintrin.h>
00233 
/*
 * AVX dot product of two float vectors.
 *
 * Writes the sum of input[i] * taps[i] for i in [0, num_points) to *result.
 * Both inputs are read with unaligned 256-bit loads. Sixteen floats are
 * consumed per main-loop iteration through two 8-lane accumulators; whatever
 * is left over (num_points % 16 elements) is accumulated by a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
  const unsigned int blockCount = num_points / 16;
  unsigned int block;
  unsigned int idx;
  unsigned int lane;

  __m256 accLo = _mm256_setzero_ps();
  __m256 accHi = _mm256_setzero_ps();

  for (block = 0; block < blockCount; ++block) {
    const float* x = input + 16u * block;
    const float* y = taps + 16u * block;

    const __m256 prodLo = _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));
    const __m256 prodHi = _mm256_mul_ps(_mm256_loadu_ps(x + 8), _mm256_loadu_ps(y + 8));

    accLo = _mm256_add_ps(prodLo, accLo);
    accHi = _mm256_add_ps(prodHi, accHi);
  }

  // Merge the two accumulators, then spill to memory for the lane reduction.
  accLo = _mm256_add_ps(accLo, accHi);

  __VOLK_ATTR_ALIGNED(32) float lanes[8];
  _mm256_storeu_ps(lanes, accLo);

  // Reduce lanes in ascending order.
  float sum = lanes[0];
  for (lane = 1; lane < 8; ++lane) {
    sum += lanes[lane];
  }

  // Scalar tail for the elements past the last full 16-float block.
  for (idx = blockCount * 16; idx < num_points; ++idx) {
    sum += input[idx] * taps[idx];
  }

  *result = sum;
}
00290 
00291 #endif /*LV_HAVE_AVX*/
00292 
00293 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/