#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <volk/volk_complex.h>
#include <string.h>
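
/*
 * Computes the squared Euclidean distance |src0[0] - points[i]|^2, scaled by
 * scalar, for every point in the points vector.  num_bytes is the size of the
 * points buffer in bytes (8 bytes per complex float point).
 */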
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

  lv_32fc_t diff;
  memset(&diff, 0x0, 2*sizeof(float));

  float sq_dist = 0.0;
  int bound = num_bytes >> 5;             /* groups of four points (32 bytes) */
  int leftovers0 = (num_bytes >> 4) & 1;  /* one remaining pair of points */
  int leftovers1 = (num_bytes >> 3) & 1;  /* one remaining single point */
  int i = 0;
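
  /* Broadcast the reference point into both halves of xmm1 and the scalar
     into all four lanes of xmm8; preload the first four points. */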
  xmm1 = _mm_setzero_ps();
  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
  xmm2 = _mm_load_ps((float*)&points[0]);
  xmm8 = _mm_load1_ps(&scalar);
  xmm1 = _mm_movelh_ps(xmm1, xmm1);
  xmm3 = _mm_load_ps((float*)&points[2]);
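
  /* Main loop: each iteration produces four scaled squared distances.
     _mm_hadd_ps sums each point's re^2 and im^2 into a single float. */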
  for(; i < bound - 1; ++i) {
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);

    points += 4;

    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);

    xmm2 = _mm_load_ps((float*)&points[0]);
    xmm4 = _mm_hadd_ps(xmm6, xmm7);
    xmm3 = _mm_load_ps((float*)&points[2]);

    xmm4 = _mm_mul_ps(xmm4, xmm8);

    _mm_store_ps(target, xmm4);
    target += 4;
  }
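
  /* Final unrolled iteration, peeled out of the loop so the xmm2/xmm3 loads
     above never read past the end of points.  As structured, this path
     assumes at least four input points (num_bytes >= 32). */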
  xmm4 = _mm_sub_ps(xmm1, xmm2);
  xmm5 = _mm_sub_ps(xmm1, xmm3);

  points += 4;

  xmm6 = _mm_mul_ps(xmm4, xmm4);
  xmm7 = _mm_mul_ps(xmm5, xmm5);

  xmm4 = _mm_hadd_ps(xmm6, xmm7);
  xmm4 = _mm_mul_ps(xmm4, xmm8);

  _mm_store_ps(target, xmm4);
  target += 4;
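
  /* If one pair of points (16 bytes) remains, handle it with a half-width
     pass; the two results sit in the upper 64 bits of xmm4. */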
  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm_load_ps((float*)&points[0]);
    xmm4 = _mm_sub_ps(xmm1, xmm2);

    points += 2;

    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm4 = _mm_hadd_ps(xmm6, xmm6);
    xmm4 = _mm_mul_ps(xmm4, xmm8);

    _mm_storeh_pi((__m64*)target, xmm4);
    target += 2;
  }
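
  /* Any final single point (8 bytes) is handled with scalar code. */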
  for(i = 0; i < leftovers1; ++i) {
    diff = src0[0] - points[0];
    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
    target[0] = sq_dist;
  }
}

#endif /*LV_HAVE_SSE3*/
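
/* Generic (non-SIMD) implementation: one point per loop iteration. */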
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {

  lv_32fc_t diff;
  float sq_dist = 0.0;
  unsigned int i = 0;
  for(; i < num_bytes >> 3; ++i) {
    diff = src0[0] - points[i];
    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
    target[i] = sq_dist;
  }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/