GNU Radio 3.6.3 C++ API
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
2 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
3 
4 #include<inttypes.h>
5 #include<stdio.h>
6 #include<volk/volk_complex.h>
7 #include <string.h>
8 
9 #ifdef LV_HAVE_SSE3
10 #include<xmmintrin.h>
11 #include<pmmintrin.h>
12 
13 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
14 
15 
16  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
17 
18  lv_32fc_t diff;
19  memset(&diff, 0x0, 2*sizeof(float));
20 
21  float sq_dist = 0.0;
22  int bound = num_bytes >> 5;
23  int leftovers0 = (num_bytes >> 4) & 1;
24  int leftovers1 = (num_bytes >> 3) & 1;
25  int i = 0;
26 
27 
28 
29  xmm1 = _mm_setzero_ps();
30  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
31  xmm2 = _mm_load_ps((float*)&points[0]);
32  xmm8 = _mm_load1_ps(&scalar);
33  xmm1 = _mm_movelh_ps(xmm1, xmm1);
34  xmm3 = _mm_load_ps((float*)&points[2]);
35 
36 
37  for(; i < bound - 1; ++i) {
38 
39  xmm4 = _mm_sub_ps(xmm1, xmm2);
40  xmm5 = _mm_sub_ps(xmm1, xmm3);
41  points += 4;
42  xmm6 = _mm_mul_ps(xmm4, xmm4);
43  xmm7 = _mm_mul_ps(xmm5, xmm5);
44 
45  xmm2 = _mm_load_ps((float*)&points[0]);
46 
47  xmm4 = _mm_hadd_ps(xmm6, xmm7);
48 
49  xmm3 = _mm_load_ps((float*)&points[2]);
50 
51  xmm4 = _mm_mul_ps(xmm4, xmm8);
52 
53  _mm_store_ps(target, xmm4);
54 
55  target += 4;
56 
57  }
58 
59  xmm4 = _mm_sub_ps(xmm1, xmm2);
60  xmm5 = _mm_sub_ps(xmm1, xmm3);
61 
62 
63 
64  points += 4;
65  xmm6 = _mm_mul_ps(xmm4, xmm4);
66  xmm7 = _mm_mul_ps(xmm5, xmm5);
67 
68  xmm4 = _mm_hadd_ps(xmm6, xmm7);
69 
70  xmm4 = _mm_mul_ps(xmm4, xmm8);
71 
72  _mm_store_ps(target, xmm4);
73 
74  target += 4;
75 
76 
77  for(i = 0; i < leftovers0; ++i) {
78 
79  xmm2 = _mm_load_ps((float*)&points[0]);
80 
81  xmm4 = _mm_sub_ps(xmm1, xmm2);
82 
83  points += 2;
84 
85  xmm6 = _mm_mul_ps(xmm4, xmm4);
86 
87  xmm4 = _mm_hadd_ps(xmm6, xmm6);
88 
89  xmm4 = _mm_mul_ps(xmm4, xmm8);
90 
91  _mm_storeh_pi((__m64*)target, xmm4);
92 
93  target += 2;
94  }
95 
96  for(i = 0; i < leftovers1; ++i) {
97 
98  diff = src0[0] - points[0];
99 
100  sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
101 
102  target[0] = sq_dist;
103  }
104 }
105 
106 #endif /*LV_HAVE_SSE3*/
107 
108 #ifdef LV_HAVE_GENERIC
109 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
110  lv_32fc_t diff;
111  float sq_dist;
112  unsigned int i = 0;
113 
114  for(; i < num_bytes >> 3; ++i) {
115  diff = src0[0] - points[i];
116 
117  sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
118 
119  target[i] = sq_dist;
120  }
121 }
122 
123 #endif /*LV_HAVE_GENERIC*/
124 
125 
126 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/