#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H

#include <volk/volk_complex.h>
#include <math.h>

#define ROTATOR_RELOAD 512
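/*
 * The rotator multiplies every input sample by a running phase and then
 * advances that phase by phase_inc.  Because repeated complex multiplies
 * let |phase| drift away from 1.0, each kernel re-normalizes the phase
 * back onto the unit circle once every ROTATOR_RELOAD samples.
 */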
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points)
{
    unsigned int i, j;
    for (i = 0; i < num_points / ROTATOR_RELOAD; ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
        /* abs() is the integer absolute value and is wrong here; divide by
         * the complex magnitude to re-normalize the phase. */
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
}
#endif /* LV_HAVE_GENERIC */
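/*
 * A minimal usage sketch (hypothetical caller, not part of this header):
 * mix a buffer down by 0.1 cycles per sample.  All names below are
 * illustrative.
 *
 *     lv_32fc_t in[1024], out[1024];
 *     lv_32fc_t phase = lv_cmake(1.f, 0.f);              // start at 0 rad
 *     float w = -2.f * 3.14159265358979f * 0.1f;         // rad per sample
 *     lv_32fc_t phase_inc = lv_cmake(cosf(w), sinf(w));  // |phase_inc| == 1
 *     volk_32fc_s32fc_x2_rotator_32fc_generic(out, in, phase_inc, &phase, 1024);
 */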
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_sse4_1(lv_32fc_t* outVector,
                                                          const lv_32fc_t* inVector,
                                                          const lv_32fc_t phase_inc,
                                                          lv_32fc_t* phase,
                                                          unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    /* Seed the two complex lanes with consecutive phases, phase and
     * phase * phase_inc; incr ends up as phase_inc^2, the per-iteration
     * advance for each lane. */
    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
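    /* Complex data is interleaved (re, im) pairs, so one __m128 holds two
     * complex floats.  Each complex multiply below uses the SSE3 idiom:
     *   yl = duplicated real parts, yh = duplicated imaginary parts,
     *   tmp1 = a * yl, tmp2 = swapped(a) * yh, result = addsub(tmp1, tmp2),
     * which yields (ar*br - ai*bi, ar*bi + ai*br) in both lanes.  The same
     * recipe simultaneously advances the phases: phase_Val * inc_Val. */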
    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),
                         lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            aVal = _mm_load_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
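            /* z now holds two input samples rotated by the current phases,
             * and phase_Val has advanced by phase_inc^2 in both lanes. */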
            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        /* Re-normalize: mul + hadd give |z|^2 per complex lane, sqrt gives
         * |z|, and the divide pulls each phase back to unit magnitude. */
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    /* Handle the odd trailing sample, if any, in scalar code. */
    for (i = 0; i < num_points % 2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }
    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_avx(lv_32fc_t* outVector,
                                                       const lv_32fc_t* inVector,
                                                       const lv_32fc_t phase_inc,
                                                       lv_32fc_t* phase,
                                                       unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
    unsigned int i, j = 0;
    /* Seed four complex lanes with consecutive phases; incr ends up as
     * phase_inc^4, the per-iteration advance for each lane. */
    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
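    /* Same complex-multiply idiom as the SSE4.1 kernel, widened to four
     * complex floats per __m256.  Note that _mm256_hadd_ps and
     * _mm256_shuffle_ps operate within each 128-bit lane, so the
     * re-normalization below works per lane exactly as in the SSE version. */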
    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        /* Re-normalize the four phases back to unit magnitude. */
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);
        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    /* Finish the last num_points % 4 samples in scalar code. */
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }
    (*phase) = phase_Ptr[0];
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */