1 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
2 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
/*!
 * \brief Dot product of two float vectors (portable scalar implementation).
 *
 * Accumulates input[i] * taps[i] for i in [0, num_points) in a single float
 * accumulator, in index order, and stores the sum through \p result.
 *
 * \param result      Receives the accumulated sum (0 when num_points is 0).
 * \param input       First operand; num_points floats are read from it.
 * \param taps        Second operand; num_points floats are read from it.
 * \param num_points  Number of element pairs to multiply and accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_u_generic(float* result, const float* input, const float* taps, unsigned int num_points) {

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*!
 * \brief Dot product of two float vectors using SSE with unaligned loads.
 *
 * The bulk of the data is processed 16 floats per iteration into four
 * independent 4-lane accumulators; the accumulators are then folded together,
 * summed horizontally, and the remaining (num_points % 16) elements are added
 * with a scalar loop.
 *
 * \param result      Receives the accumulated sum (0 when num_points is 0).
 * \param input       First operand; num_points floats are read, no alignment required.
 * \param taps        Second operand; num_points floats are read, no alignment required.
 * \param num_points  Number of element pairs to multiply and accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result, const float* input, const float* taps, unsigned int num_points) {

  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  /* Scratch buffer for the horizontal sum; written with an unaligned store,
     so it needs no special alignment. */
  float dotProductVector[4];

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(;number < sixteenthPoints; number++){

    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr+4);
    a2Val = _mm_loadu_ps(aPtr+8);
    a3Val = _mm_loadu_ps(aPtr+12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr+4);
    b2Val = _mm_loadu_ps(bPtr+8);
    b3Val = _mm_loadu_ps(bPtr+12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Step past the 16 floats just consumed; the scalar tail loop below
       relies on the pointers ending up at element sixteenthPoints*16. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four partial accumulators into one vector. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  _mm_storeu_ps(dotProductVector, dotProdVal0);

  /* Horizontal sum of the four lanes. */
  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail: elements that did not fill a complete group of 16. */
  number = sixteenthPoints*16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
100 #include <pmmintrin.h>
/*!
 * \brief Dot product of two float vectors, SSE3 build variant, unaligned loads.
 *
 * Processes 16 floats per iteration into four independent 4-lane
 * accumulators, folds them into one vector, sums its lanes, then adds the
 * remaining (num_points % 16) elements with a scalar loop.
 *
 * \param result      Receives the accumulated sum (0 when num_points is 0).
 * \param input       First operand; num_points floats are read, no alignment required.
 * \param taps        Second operand; num_points floats are read, no alignment required.
 * \param num_points  Number of element pairs to multiply and accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result, const float* input, const float* taps, unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  /* Scratch buffer for the horizontal sum; written with an unaligned store,
     so it needs no special alignment. */
  float dotProductVector[4];

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(;number < sixteenthPoints; number++){

    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr+4);
    a2Val = _mm_loadu_ps(aPtr+8);
    a3Val = _mm_loadu_ps(aPtr+12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr+4);
    b2Val = _mm_loadu_ps(bPtr+8);
    b3Val = _mm_loadu_ps(bPtr+12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

    /* Step past the 16 floats just consumed; the scalar tail loop below
       relies on the pointers ending up at element sixteenthPoints*16. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four partial accumulators into one vector. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  _mm_storeu_ps(dotProductVector, dotProdVal0);

  /* Horizontal sum of the four lanes. */
  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail: elements that did not fill a complete group of 16. */
  number = sixteenthPoints*16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
166 #ifdef LV_HAVE_SSE4_1
168 #include <smmintrin.h>
170 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(
float * result,
const float * input,
const float* taps,
unsigned int num_points) {
171 unsigned int number = 0;
172 const unsigned int sixteenthPoints = num_points / 16;
174 float dotProduct = 0;
175 const float* aPtr = input;
176 const float* bPtr =
taps;
178 __m128 aVal1, bVal1, cVal1;
179 __m128 aVal2, bVal2, cVal2;
180 __m128 aVal3, bVal3, cVal3;
181 __m128 aVal4, bVal4, cVal4;
183 __m128 dotProdVal = _mm_setzero_ps();
185 for(;number < sixteenthPoints; number++){
187 aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
188 aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
189 aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
190 aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
192 bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
193 bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
194 bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
195 bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
197 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
198 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
199 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
200 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
202 cVal1 = _mm_or_ps(cVal1, cVal2);
203 cVal3 = _mm_or_ps(cVal3, cVal4);
204 cVal1 = _mm_or_ps(cVal1, cVal3);
206 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
210 _mm_store_ps(dotProductVector, dotProdVal);
212 dotProduct = dotProductVector[0];
213 dotProduct += dotProductVector[1];
214 dotProduct += dotProductVector[2];
215 dotProduct += dotProductVector[3];
217 number = sixteenthPoints * 16;
218 for(;number < num_points; number++){
219 dotProduct += ((*aPtr++) * (*bPtr++));
222 *result = dotProduct;
229 #include <immintrin.h>
231 static inline void volk_32f_x2_dot_prod_32f_u_avx(
float* result,
const float* input,
const float* taps,
unsigned int num_points) {
233 unsigned int number = 0;
234 const unsigned int sixteenthPoints = num_points / 16;
236 float dotProduct = 0;
237 const float* aPtr = input;
238 const float* bPtr =
taps;
244 __m256 dotProdVal0 = _mm256_setzero_ps();
245 __m256 dotProdVal1 = _mm256_setzero_ps();
247 for(;number < sixteenthPoints; number++){
249 a0Val = _mm256_loadu_ps(aPtr);
250 a1Val = _mm256_loadu_ps(aPtr+8);
251 b0Val = _mm256_loadu_ps(bPtr);
252 b1Val = _mm256_loadu_ps(bPtr+8);
254 c0Val = _mm256_mul_ps(a0Val, b0Val);
255 c1Val = _mm256_mul_ps(a1Val, b1Val);
257 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
258 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
264 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
268 _mm256_storeu_ps(dotProductVector,dotProdVal0);
270 dotProduct = dotProductVector[0];
271 dotProduct += dotProductVector[1];
272 dotProduct += dotProductVector[2];
273 dotProduct += dotProductVector[3];
274 dotProduct += dotProductVector[4];
275 dotProduct += dotProductVector[5];
276 dotProduct += dotProductVector[6];
277 dotProduct += dotProductVector[7];
279 number = sixteenthPoints*16;
280 for(;number < num_points; number++){
281 dotProduct += ((*aPtr++) * (*bPtr++));
284 *result = dotProduct;