/* GNU Radio 3.6.0 C++ API */
00001 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H 00002 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H 00003 00004 #include <volk/volk_common.h> 00005 #include<stdio.h> 00006 00007 00008 #ifdef LV_HAVE_GENERIC 00009 00010 00011 static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { 00012 00013 float dotProduct = 0; 00014 const float* aPtr = input; 00015 const float* bPtr= taps; 00016 unsigned int number = 0; 00017 00018 for(number = 0; number < num_points; number++){ 00019 dotProduct += ((*aPtr++) * (*bPtr++)); 00020 } 00021 00022 *result = dotProduct; 00023 } 00024 00025 #endif /*LV_HAVE_GENERIC*/ 00026 00027 00028 #ifdef LV_HAVE_SSE 00029 00030 00031 static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { 00032 00033 unsigned int number = 0; 00034 const unsigned int quarterPoints = num_points / 4; 00035 00036 float dotProduct = 0; 00037 const float* aPtr = input; 00038 const float* bPtr = taps; 00039 00040 __m128 aVal, bVal, cVal; 00041 00042 __m128 dotProdVal = _mm_setzero_ps(); 00043 00044 for(;number < quarterPoints; number++){ 00045 00046 aVal = _mm_load_ps(aPtr); 00047 bVal = _mm_load_ps(bPtr); 00048 00049 cVal = _mm_mul_ps(aVal, bVal); 00050 00051 dotProdVal = _mm_add_ps(cVal, dotProdVal); 00052 00053 aPtr += 4; 00054 bPtr += 4; 00055 } 00056 00057 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00058 00059 _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector 00060 00061 dotProduct = dotProductVector[0]; 00062 dotProduct += dotProductVector[1]; 00063 dotProduct += dotProductVector[2]; 00064 dotProduct += dotProductVector[3]; 00065 00066 number = quarterPoints * 4; 00067 for(;number < num_points; number++){ 00068 dotProduct += ((*aPtr++) * (*bPtr++)); 00069 } 00070 00071 *result = dotProduct; 00072 00073 } 00074 00075 #endif /*LV_HAVE_SSE*/ 00076 00077 
#ifdef LV_HAVE_SSE3 00078 00079 #include <pmmintrin.h> 00080 00081 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { 00082 unsigned int number = 0; 00083 const unsigned int quarterPoints = num_points / 4; 00084 00085 float dotProduct = 0; 00086 const float* aPtr = input; 00087 const float* bPtr = taps; 00088 00089 __m128 aVal, bVal, cVal; 00090 00091 __m128 dotProdVal = _mm_setzero_ps(); 00092 00093 for(;number < quarterPoints; number++){ 00094 00095 aVal = _mm_load_ps(aPtr); 00096 bVal = _mm_load_ps(bPtr); 00097 00098 cVal = _mm_mul_ps(aVal, bVal); 00099 00100 dotProdVal = _mm_hadd_ps(dotProdVal, cVal); 00101 00102 aPtr += 4; 00103 bPtr += 4; 00104 } 00105 00106 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00107 dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); 00108 00109 _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector 00110 00111 dotProduct = dotProductVector[0]; 00112 dotProduct += dotProductVector[1]; 00113 00114 number = quarterPoints * 4; 00115 for(;number < num_points; number++){ 00116 dotProduct += ((*aPtr++) * (*bPtr++)); 00117 } 00118 00119 *result = dotProduct; 00120 } 00121 00122 #endif /*LV_HAVE_SSE3*/ 00123 00124 #ifdef LV_HAVE_SSE4_1 00125 00126 #include <smmintrin.h> 00127 00128 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { 00129 unsigned int number = 0; 00130 const unsigned int sixteenthPoints = num_points / 16; 00131 00132 float dotProduct = 0; 00133 const float* aPtr = input; 00134 const float* bPtr = taps; 00135 00136 __m128 aVal1, bVal1, cVal1; 00137 __m128 aVal2, bVal2, cVal2; 00138 __m128 aVal3, bVal3, cVal3; 00139 __m128 aVal4, bVal4, cVal4; 00140 00141 __m128 dotProdVal = _mm_setzero_ps(); 00142 00143 for(;number < sixteenthPoints; number++){ 00144 00145 aVal1 = _mm_load_ps(aPtr); aPtr += 4; 00146 aVal2 = 
_mm_load_ps(aPtr); aPtr += 4; 00147 aVal3 = _mm_load_ps(aPtr); aPtr += 4; 00148 aVal4 = _mm_load_ps(aPtr); aPtr += 4; 00149 00150 bVal1 = _mm_load_ps(bPtr); bPtr += 4; 00151 bVal2 = _mm_load_ps(bPtr); bPtr += 4; 00152 bVal3 = _mm_load_ps(bPtr); bPtr += 4; 00153 bVal4 = _mm_load_ps(bPtr); bPtr += 4; 00154 00155 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); 00156 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); 00157 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); 00158 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); 00159 00160 cVal1 = _mm_or_ps(cVal1, cVal2); 00161 cVal3 = _mm_or_ps(cVal3, cVal4); 00162 cVal1 = _mm_or_ps(cVal1, cVal3); 00163 00164 dotProdVal = _mm_add_ps(dotProdVal, cVal1); 00165 } 00166 00167 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; 00168 _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector 00169 00170 dotProduct = dotProductVector[0]; 00171 dotProduct += dotProductVector[1]; 00172 dotProduct += dotProductVector[2]; 00173 dotProduct += dotProductVector[3]; 00174 00175 number = sixteenthPoints * 16; 00176 for(;number < num_points; number++){ 00177 dotProduct += ((*aPtr++) * (*bPtr++)); 00178 } 00179 00180 *result = dotProduct; 00181 } 00182 00183 #endif /*LV_HAVE_SSE4_1*/ 00184 00185 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/