GNU Radio 3.6.2git-128-gdbc7a4c0 C++ API
|
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H 00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H 00003 00004 #include <volk/volk_common.h> 00005 #include<stdio.h> 00006 00007 00008 #ifdef LV_HAVE_GENERIC 00009 00010 00011 static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { 00012 00013 float dotProduct = 0; 00014 const float* aPtr = input; 00015 const float* bPtr= taps; 00016 unsigned int number = 0; 00017 00018 for(number = 0; number < num_points; number++){ 00019 dotProduct += ((*aPtr++) * (*bPtr++)); 00020 } 00021 00022 *result = dotProduct; 00023 } 00024 00025 #endif /*LV_HAVE_GENERIC*/ 00026 00027 00028 #ifdef LV_HAVE_SSE 00029 00030 00031 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { 00032 00033 unsigned int number = 0; 00034 const unsigned int sixteenthPoints = num_points / 16; 00035 00036 float dotProduct = 0; 00037 const float* aPtr = input; 00038 const float* bPtr = taps; 00039 00040 __m128 a0Val, a1Val, a2Val, a3Val; 00041 __m128 b0Val, b1Val, b2Val, b3Val; 00042 __m128 c0Val, c1Val, c2Val, c3Val; 00043 00044 __m128 dotProdVal0 = _mm_setzero_ps(); 00045 __m128 dotProdVal1 = _mm_setzero_ps(); 00046 __m128 dotProdVal2 = _mm_setzero_ps(); 00047 __m128 dotProdVal3 = _mm_setzero_ps(); 00048 00049 for(;number < sixteenthPoints; number++){ 00050 00051 a0Val = _mm_load_ps(aPtr); 00052 a1Val = _mm_load_ps(aPtr+4); 00053 a2Val = _mm_load_ps(aPtr+8); 00054 a3Val = _mm_load_ps(aPtr+12); 00055 b0Val = _mm_load_ps(bPtr); 00056 b1Val = _mm_load_ps(bPtr+4); 00057 b2Val = _mm_load_ps(bPtr+8); 00058 b3Val = _mm_load_ps(bPtr+12); 00059 00060 c0Val = _mm_mul_ps(a0Val, b0Val); 00061 c1Val = _mm_mul_ps(a1Val, b1Val); 00062 c2Val = _mm_mul_ps(a2Val, b2Val); 00063 c3Val = _mm_mul_ps(a3Val, b3Val); 00064 00065 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); 00066 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); 00067 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); 00068 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); 00069 00070 aPtr += 16; 00071 bPtr += 16; 00072 } 00073 00074 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); 00075 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); 00076 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); 00077 00078 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00079 00080 _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00081 00082 dotProduct = dotProductVector[0]; 00083 dotProduct += dotProductVector[1]; 00084 dotProduct += dotProductVector[2]; 00085 dotProduct += dotProductVector[3]; 00086 00087 number = sixteenthPoints*16; 00088 for(;number < num_points; number++){ 00089 dotProduct += ((*aPtr++) * (*bPtr++)); 00090 dotProduct += ((*aPtr++) * (*bPtr++)); 00091 dotProduct += ((*aPtr++) * (*bPtr++)); 00092 dotProduct += ((*aPtr++) * (*bPtr++)); 00093 } 00094 00095 *result = dotProduct; 00096 00097 } 00098 00099 #endif /*LV_HAVE_SSE*/ 00100 00101 #ifdef LV_HAVE_SSE3 00102 00103 #include <pmmintrin.h> 00104 00105 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { 00106 unsigned int number = 0; 00107 const unsigned int sixteenthPoints = num_points / 16; 00108 00109 float dotProduct = 0; 00110 const float* aPtr = input; 00111 const float* bPtr = taps; 00112 00113 __m128 a0Val, a1Val, a2Val, a3Val; 00114 __m128 b0Val, b1Val, b2Val, b3Val; 00115 __m128 c0Val, c1Val, c2Val, c3Val; 00116 00117 __m128 dotProdVal0 = _mm_setzero_ps(); 00118 __m128 dotProdVal1 = _mm_setzero_ps(); 00119 __m128 dotProdVal2 = _mm_setzero_ps(); 00120 __m128 dotProdVal3 = _mm_setzero_ps(); 00121 00122 for(;number < sixteenthPoints; number++){ 00123 00124 a0Val = _mm_load_ps(aPtr); 00125 a1Val = _mm_load_ps(aPtr+4); 00126 a2Val = _mm_load_ps(aPtr+8); 00127 a3Val = _mm_load_ps(aPtr+12); 00128 b0Val = _mm_load_ps(bPtr); 00129 b1Val = _mm_load_ps(bPtr+4); 00130 b2Val = _mm_load_ps(bPtr+8); 00131 b3Val = _mm_load_ps(bPtr+12); 00132 00133 c0Val = _mm_mul_ps(a0Val, b0Val); 00134 c1Val = _mm_mul_ps(a1Val, b1Val); 00135 c2Val = _mm_mul_ps(a2Val, b2Val); 00136 c3Val = _mm_mul_ps(a3Val, b3Val); 00137 00138 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); 00139 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); 00140 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); 00141 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); 00142 00143 aPtr += 16; 00144 bPtr += 16; 00145 } 00146 00147 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); 00148 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); 00149 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); 00150 00151 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00152 _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00153 00154 dotProduct = dotProductVector[0]; 00155 dotProduct += dotProductVector[1]; 00156 dotProduct += dotProductVector[2]; 00157 dotProduct += dotProductVector[3]; 00158 00159 number = sixteenthPoints*16; 00160 for(;number < num_points; number++){ 00161 dotProduct += ((*aPtr++) * (*bPtr++)); 00162 } 00163 00164 *result = dotProduct; 00165 } 00166 00167 #endif /*LV_HAVE_SSE3*/ 00168 00169 #ifdef LV_HAVE_SSE4_1 00170 00171 #include <smmintrin.h> 00172 00173 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { 00174 unsigned int number = 0; 00175 const unsigned int sixteenthPoints = num_points / 16; 00176 00177 float dotProduct = 0; 00178 const float* aPtr = input; 00179 const float* bPtr = taps; 00180 00181 __m128 aVal1, bVal1, cVal1; 00182 __m128 aVal2, bVal2, cVal2; 00183 __m128 aVal3, bVal3, cVal3; 00184 __m128 aVal4, bVal4, cVal4; 00185 00186 __m128 dotProdVal = _mm_setzero_ps(); 00187 00188 for(;number < sixteenthPoints; number++){ 00189 00190 aVal1 = _mm_load_ps(aPtr); aPtr += 4; 00191 aVal2 = _mm_load_ps(aPtr); aPtr += 4; 00192 aVal3 = _mm_load_ps(aPtr); aPtr += 4; 00193 aVal4 = _mm_load_ps(aPtr); aPtr += 4; 00194 00195 bVal1 = _mm_load_ps(bPtr); bPtr += 4; 00196 bVal2 = _mm_load_ps(bPtr); bPtr += 4; 00197 bVal3 = _mm_load_ps(bPtr); bPtr += 4; 00198 bVal4 = _mm_load_ps(bPtr); bPtr += 4; 00199 00200 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); 00201 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); 00202 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); 00203 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); 00204 00205 cVal1 = _mm_or_ps(cVal1, cVal2); 00206 cVal3 = _mm_or_ps(cVal3, cVal4); 00207 cVal1 = _mm_or_ps(cVal1, cVal3); 00208 00209 dotProdVal = _mm_add_ps(dotProdVal, cVal1); 00210 } 00211 00212 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00213 _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector 00214 00215 dotProduct = dotProductVector[0]; 00216 dotProduct += dotProductVector[1]; 00217 dotProduct += dotProductVector[2]; 00218 dotProduct += dotProductVector[3]; 00219 00220 number = sixteenthPoints * 16; 00221 for(;number < num_points; number++){ 00222 dotProduct += ((*aPtr++) * (*bPtr++)); 00223 } 00224 00225 *result = dotProduct; 00226 } 00227 00228 #endif /*LV_HAVE_SSE4_1*/ 00229 00230 #ifdef LV_HAVE_AVX 00231 00232 #include <immintrin.h> 00233 00234 static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { 00235 00236 unsigned int number = 0; 00237 const unsigned int sixteenthPoints = num_points / 16; 00238 00239 float dotProduct = 0; 00240 const float* aPtr = input; 00241 const float* bPtr = taps; 00242 00243 __m256 a0Val, a1Val; 00244 __m256 b0Val, b1Val; 00245 __m256 c0Val, c1Val; 00246 00247 __m256 dotProdVal0 = _mm256_setzero_ps(); 00248 __m256 dotProdVal1 = _mm256_setzero_ps(); 00249 00250 for(;number < sixteenthPoints; number++){ 00251 00252 a0Val = _mm256_loadu_ps(aPtr); 00253 a1Val = _mm256_loadu_ps(aPtr+8); 00254 b0Val = _mm256_loadu_ps(bPtr); 00255 b1Val = _mm256_loadu_ps(bPtr+8); 00256 00257 c0Val = _mm256_mul_ps(a0Val, b0Val); 00258 c1Val = _mm256_mul_ps(a1Val, b1Val); 00259 00260 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); 00261 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); 00262 00263 aPtr += 16; 00264 bPtr += 16; 00265 } 00266 00267 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); 00268 00269 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; 00270 00271 _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector 00272 00273 dotProduct = dotProductVector[0]; 00274 dotProduct += dotProductVector[1]; 00275 dotProduct += dotProductVector[2]; 00276 dotProduct += dotProductVector[3]; 00277 dotProduct += dotProductVector[4]; 00278 dotProduct += dotProductVector[5]; 00279 dotProduct += dotProductVector[6]; 00280 dotProduct += dotProductVector[7]; 00281 00282 number = sixteenthPoints*16; 00283 for(;number < num_points; number++){ 00284 dotProduct += ((*aPtr++) * (*bPtr++)); 00285 } 00286 00287 *result = dotProduct; 00288 00289 } 00290 00291 #endif /*LV_HAVE_AVX*/ 00292 00293 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/