gr-baz Package
volk_32f_x3_sum_of_poly_32f_a.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
2 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
3 
4 #include<inttypes.h>
5 #include<stdio.h>
6 #include<volk/volk_complex.h>
7 
/*
 * MAX(X, Y): yields X when X > Y, otherwise Y.
 *
 * This is a function-like macro, so the selected argument is evaluated twice;
 * do not pass expressions with side effects. The #ifndef guard keeps any MAX
 * that was already defined before this header was included.
 */
#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
11 
12 #ifdef LV_HAVE_SSE3
13 #include<xmmintrin.h>
14 #include<pmmintrin.h>
15 
/**
 * SSE3 kernel: sum of a clamped 4th-order polynomial over a float buffer.
 *
 * Every sample x is first replaced by the larger of x and *cutoff. The
 * function then accumulates
 *     c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4
 * over all (num_bytes >> 2) samples and finally adds c[4] once per sample,
 * where c is center_point_array. Four samples are processed per vector
 * iteration; the remaining 0-3 samples go through a scalar tail loop.
 *
 * @param target              Receives the single float result in target[0].
 * @param src0                Input samples. Read with _mm_load_ps (an aligned
 *                            load), so it must be 16-byte aligned.
 * @param center_point_array  Five coefficients: [0]..[3] scale x..x^4,
 *                            [4] is the per-sample constant term.
 * @param cutoff              Pointer to the lower clamp applied to each sample.
 * @param num_bytes           Size of src0 in bytes; only whole groups of four
 *                            bytes (num_bytes >> 2 samples) are consumed.
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
    const unsigned int num_points = num_bytes >> 2;  /* samples in src0 */
    const unsigned int quad_count = num_points >> 2; /* full groups of four samples */
    const unsigned int leftovers = num_points & 3;   /* 0-3 samples left for the scalar tail */
    unsigned int i;

    float result = 0.0f;

    /* Broadcast each scalar coefficient and the cutoff to all four lanes. */
    const __m128 coef1 = _mm_load1_ps(&center_point_array[0]);
    const __m128 coef2 = _mm_load1_ps(&center_point_array[1]);
    const __m128 coef3 = _mm_load1_ps(&center_point_array[2]);
    const __m128 coef4 = _mm_load1_ps(&center_point_array[3]);
    const __m128 floor4 = _mm_load1_ps(cutoff);

    /* Two independent accumulators: low-order (x, x^2) and high-order (x^3, x^4) terms. */
    __m128 acc_low = _mm_setzero_ps();
    __m128 acc_high = _mm_setzero_ps();
    __m128 sums;

    for (i = 0; i < quad_count; ++i) {
        /*
         * Operand order matters: _mm_max_ps(a, b) computes a > b ? a : b per
         * lane, so the second operand is returned when the comparison is
         * unordered (NaN). Passing (sample, cutoff) clamps NaN samples to the
         * cutoff exactly like MAX(fst, *cutoff) in the scalar tail below,
         * instead of letting them propagate only in the vector part.
         */
        const __m128 x = _mm_max_ps(_mm_load_ps(src0), floor4);
        const __m128 x2 = _mm_mul_ps(x, x);
        const __m128 x3 = _mm_mul_ps(x, x2);
        const __m128 x4 = _mm_mul_ps(x2, x2);

        acc_low = _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, coef1), _mm_mul_ps(x2, coef2)), acc_low);
        acc_high = _mm_add_ps(_mm_add_ps(_mm_mul_ps(x3, coef3), _mm_mul_ps(x4, coef4)), acc_high);

        src0 += 4;
    }

    /* Collapse the eight partial sums (4 lanes x 2 accumulators) into lane 0. */
    sums = _mm_hadd_ps(acc_low, acc_high);
    sums = _mm_hadd_ps(sums, sums);
    sums = _mm_hadd_ps(sums, sums);
    _mm_store_ss(&result, sums);

    /* Scalar tail: src0 now points just past the last full group of four. */
    for (i = 0; i < leftovers; ++i) {
        const float fst = MAX(src0[i], *cutoff);
        const float sq = fst * fst;
        const float thrd = fst * sq;
        const float frth = sq * sq;

        result += (center_point_array[0] * fst +
                   center_point_array[1] * sq +
                   center_point_array[2] * thrd +
                   center_point_array[3] * frth);
    }

    /* Constant term, added once for every processed sample. */
    result += ((float)num_points) * center_point_array[4];

    target[0] = result;
}
97 
98 
99 #endif /*LV_HAVE_SSE3*/
100 
101 #ifdef LV_HAVE_GENERIC
102 
/**
 * Portable C kernel: sum of a clamped 4th-order polynomial over a float buffer.
 *
 * Every sample x is first replaced by the larger of x and *cutoff. The
 * function then accumulates
 *     c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4
 * over all (num_bytes >> 2) samples and finally adds c[4] once per sample,
 * where c is center_point_array.
 *
 * @param target              Receives the single float result in *target.
 * @param src0                Input samples; (num_bytes >> 2) floats are read.
 * @param center_point_array  Five coefficients: [0]..[3] scale x..x^4,
 *                            [4] is the per-sample constant term.
 * @param cutoff              Pointer to the lower clamp applied to each sample.
 * @param num_bytes           Size of src0 in bytes; only whole groups of four
 *                            bytes are consumed.
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
    const unsigned int num_points = num_bytes >> 2; /* samples in src0 */
    const float floor_val = *cutoff;                /* loop-invariant clamp level */
    float result = 0.0f;
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        /* Clamp from below; a sample that does not compare greater (including NaN) becomes the cutoff. */
        const float fst = (src0[i] > floor_val) ? src0[i] : floor_val;
        const float sq = fst * fst;
        const float thrd = fst * sq;
        const float frth = sq * sq;

        result += (center_point_array[0] * fst +
                   center_point_array[1] * sq +
                   center_point_array[2] * thrd +
                   center_point_array[3] * frth);
    }

    /* Constant term, added once for every processed sample. */
    result += ((float)num_points) * center_point_array[4];

    *target = result;
}
147 
148 #endif /*LV_HAVE_GENERIC*/
149 
150 
151 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/