#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <inttypes.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h> /* SSE3 intrinsics (_mm_hadd_ps) */
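/*
 * Squared Euclidean distance from the single complex point *src0 to each
 * complex point in the points buffer: target[i] = |src0[0] - points[i]|^2.
 * num_bytes is the size of the points buffer in bytes (8 bytes per lv_32fc_t),
 * so num_bytes >> 3 float results are written to target. The _a variant
 * expects 16-byte-aligned buffers.
 */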
static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
                                                        lv_32fc_t* src0,
                                                        lv_32fc_t* points,
                                                        unsigned int num_bytes) {
  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  lv_32fc_t diff;
  float sq_dist;
  int bound = num_bytes >> 5;            /* full 4-point (32-byte) blocks */
  int leftovers0 = (num_bytes >> 4) & 1; /* one remaining 2-point (16-byte) block? */
  int leftovers1 = (num_bytes >> 3) & 1; /* one remaining single 8-byte point? */
  int i = 0;
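  /* Worked example (illustration only): num_bytes = 88, i.e. 11 points, gives
     bound = 2, leftovers0 = 1, leftovers1 = 1 -- 8 points are handled by the
     unrolled SSE3 code, 2 by the pair loop, and 1 by the scalar tail. */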
  /* Broadcast the reference point *src0 into both 64-bit halves of xmm1. */
  xmm1 = _mm_setzero_ps();
  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
  xmm2 = _mm_load_ps((float*)&points[0]);
  xmm1 = _mm_movelh_ps(xmm1, xmm1);
  xmm3 = _mm_load_ps((float*)&points[2]);
  /* Main loop: four complex points per iteration; the loads for the next
     iteration are issued before the current results are stored. */
  for(; i < bound - 1; ++i) {
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);
    points += 4;
    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);

    xmm2 = _mm_load_ps((float*)&points[0]);
    xmm4 = _mm_hadd_ps(xmm6, xmm7);
    xmm3 = _mm_load_ps((float*)&points[2]);

    _mm_store_ps(target, xmm4);
    target += 4;
  }
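  /* Reduction note: xmm6 holds (d0re^2, d0im^2, d1re^2, d1im^2) and xmm7 holds
     (d2re^2, d2im^2, d3re^2, d3im^2), so _mm_hadd_ps(xmm6, xmm7) packs the four
     squared distances, in order, into a single register. */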
  /* Final 4-point block; its loads were already issued above. */
  xmm4 = _mm_sub_ps(xmm1, xmm2);
  xmm5 = _mm_sub_ps(xmm1, xmm3);
  points += 4;
  xmm6 = _mm_mul_ps(xmm4, xmm4);
  xmm7 = _mm_mul_ps(xmm5, xmm5);

  xmm4 = _mm_hadd_ps(xmm6, xmm7);

  _mm_store_ps(target, xmm4);
  target += 4;
  /* One remaining pair of points, if present. */
  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm_load_ps((float*)&points[0]);

    xmm4 = _mm_sub_ps(xmm1, xmm2);
    points += 2;
    xmm6 = _mm_mul_ps(xmm4, xmm4);

    xmm4 = _mm_hadd_ps(xmm6, xmm6);

    /* The high half of xmm4 holds the two squared distances. */
    _mm_storeh_pi((__m64*)target, xmm4);
    target += 2;
  }
  /* One remaining single point, if present: scalar fallback. */
  for(i = 0; i < leftovers1; ++i) {
    diff = src0[0] - points[0];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[0] = sq_dist;
  }
}

#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target,
                                                           lv_32fc_t* src0,
                                                           lv_32fc_t* points,
                                                           unsigned int num_bytes) {
  lv_32fc_t diff;
  float sq_dist;
  unsigned int i = 0;

  /* num_bytes >> 3 is the number of 8-byte complex points to process. */
  for(; i < num_bytes >> 3; ++i) {
    diff = src0[0] - points[i];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[i] = sq_dist;
  }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
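/*
 * Minimal usage sketch (illustrative, not part of the original kernel):
 *
 *   enum { NUM_POINTS = 16 };
 *   lv_32fc_t src0 = lv_cmake(0.5f, -0.25f);  // reference point
 *   lv_32fc_t points[NUM_POINTS];             // points to compare against
 *   float target[NUM_POINTS];                 // receives |src0 - points[i]|^2
 *   // ... fill points ...
 *   volk_32fc_x2_square_dist_32f_a_generic(target, &src0, points,
 *                                          NUM_POINTS * sizeof(lv_32fc_t));
 *
 * The SSE3 variant additionally needs 16-byte-aligned points/target buffers
 * (e.g. from volk_malloc), since it uses _mm_load_ps/_mm_store_ps.
 */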