GNU Radio 3.6.4 C++ API
volk_32fc_x2_conjugate_dot_prod_32fc_a.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
2 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
3 
4 #include <volk/volk_common.h>
5 #include<volk/volk_complex.h>
6 #include<stdio.h>
7 
8 
9 #ifdef LV_HAVE_GENERIC
10 
11 
12 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
13 
14  float * res = (float*) result;
15  float * in = (float*) input;
16  float * tp = (float*) taps;
17  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
18  unsigned int isodd = (num_bytes >> 3) &1;
19 
20 
21 
22  float sum0[2] = {0,0};
23  float sum1[2] = {0,0};
24  unsigned int i = 0;
25 
26 
27  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
28 
29 
30  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
31  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
32  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
33  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
34 
35 
36  in += 4;
37  tp += 4;
38 
39  }
40 
41 
42  res[0] = sum0[0] + sum1[0];
43  res[1] = sum0[1] + sum1[1];
44 
45 
46 
47  for(i = 0; i < isodd; ++i) {
48 
49 
50  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
51 
52  }
53  /*
54  for(i = 0; i < num_bytes >> 3; ++i) {
55  *result += input[i] * conjf(taps[i]);
56  }
57  */
58 }
59 
60 #endif /*LV_HAVE_GENERIC*/
61 
62 
63 #if LV_HAVE_SSE && LV_HAVE_64
64 
65 
66 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
67 
68  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
69 
70 
71 
72 
73  asm volatile
74  (
75  "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
76  "# const float *taps, unsigned num_bytes)\n\t"
77  "# float sum0 = 0;\n\t"
78  "# float sum1 = 0;\n\t"
79  "# float sum2 = 0;\n\t"
80  "# float sum3 = 0;\n\t"
81  "# do {\n\t"
82  "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
83  "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
84  "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
85  "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
86  "# input += 4;\n\t"
87  "# taps += 4; \n\t"
88  "# } while (--n_2_ccomplex_blocks != 0);\n\t"
89  "# result[0] = sum0 + sum2;\n\t"
90  "# result[1] = sum1 + sum3;\n\t"
91  "# TODO: prefetch and better scheduling\n\t"
92  " xor %%r9, %%r9\n\t"
93  " xor %%r10, %%r10\n\t"
94  " movq %[conjugator], %%r9\n\t"
95  " movq %%rcx, %%rax\n\t"
96  " movaps 0(%%r9), %%xmm8\n\t"
97  " movq %%rcx, %%r8\n\t"
98  " movq %[rsi], %%r9\n\t"
99  " movq %[rdx], %%r10\n\t"
100  " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
101  " movaps 0(%%r9), %%xmm0\n\t"
102  " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
103  " movups 0(%%r10), %%xmm2\n\t"
104  " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
105  " shr $4, %%r8\n\t"
106  " xorps %%xmm8, %%xmm2\n\t"
107  " jmp .%=L1_test\n\t"
108  " # 4 taps / loop\n\t"
109  " # something like ?? cycles / loop\n\t"
110  ".%=Loop1: \n\t"
111  "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
112  "# movaps (%%r9), %%xmmA\n\t"
113  "# movaps (%%r10), %%xmmB\n\t"
114  "# movaps %%xmmA, %%xmmZ\n\t"
115  "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
116  "# mulps %%xmmB, %%xmmA\n\t"
117  "# mulps %%xmmZ, %%xmmB\n\t"
118  "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
119  "# xorps %%xmmPN, %%xmmA\n\t"
120  "# movaps %%xmmA, %%xmmZ\n\t"
121  "# unpcklps %%xmmB, %%xmmA\n\t"
122  "# unpckhps %%xmmB, %%xmmZ\n\t"
123  "# movaps %%xmmZ, %%xmmY\n\t"
124  "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
125  "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
126  "# addps %%xmmZ, %%xmmA\n\t"
127  "# addps %%xmmA, %%xmmC\n\t"
128  "# A=xmm0, B=xmm2, Z=xmm4\n\t"
129  "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
130  " movaps 16(%%r9), %%xmm1\n\t"
131  " movaps %%xmm0, %%xmm4\n\t"
132  " mulps %%xmm2, %%xmm0\n\t"
133  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
134  " movaps 16(%%r10), %%xmm3\n\t"
135  " movaps %%xmm1, %%xmm5\n\t"
136  " xorps %%xmm8, %%xmm3\n\t"
137  " addps %%xmm0, %%xmm6\n\t"
138  " mulps %%xmm3, %%xmm1\n\t"
139  " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
140  " addps %%xmm1, %%xmm6\n\t"
141  " mulps %%xmm4, %%xmm2\n\t"
142  " movaps 32(%%r9), %%xmm0\n\t"
143  " addps %%xmm2, %%xmm7\n\t"
144  " mulps %%xmm5, %%xmm3\n\t"
145  " add $32, %%r9\n\t"
146  " movaps 32(%%r10), %%xmm2\n\t"
147  " addps %%xmm3, %%xmm7\n\t"
148  " add $32, %%r10\n\t"
149  " xorps %%xmm8, %%xmm2\n\t"
150  ".%=L1_test:\n\t"
151  " dec %%rax\n\t"
152  " jge .%=Loop1\n\t"
153  " # We've handled the bulk of multiplies up to here.\n\t"
154  " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
155  " # If so, we've got 2 more taps to do.\n\t"
156  " and $1, %%r8\n\t"
157  " je .%=Leven\n\t"
158  " # The count was odd, do 2 more taps.\n\t"
159  " # Note that we've already got mm0/mm2 preloaded\n\t"
160  " # from the main loop.\n\t"
161  " movaps %%xmm0, %%xmm4\n\t"
162  " mulps %%xmm2, %%xmm0\n\t"
163  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
164  " addps %%xmm0, %%xmm6\n\t"
165  " mulps %%xmm4, %%xmm2\n\t"
166  " addps %%xmm2, %%xmm7\n\t"
167  ".%=Leven:\n\t"
168  " # neg inversor\n\t"
169  " xorps %%xmm1, %%xmm1\n\t"
170  " mov $0x80000000, %%r9\n\t"
171  " movd %%r9, %%xmm1\n\t"
172  " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
173  " # pfpnacc\n\t"
174  " xorps %%xmm1, %%xmm6\n\t"
175  " movaps %%xmm6, %%xmm2\n\t"
176  " unpcklps %%xmm7, %%xmm6\n\t"
177  " unpckhps %%xmm7, %%xmm2\n\t"
178  " movaps %%xmm2, %%xmm3\n\t"
179  " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
180  " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
181  " addps %%xmm2, %%xmm6\n\t"
182  " # xmm6 = r1 i2 r3 i4\n\t"
183  " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
184  " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
185  " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
186  :
187  :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
188  :"rax", "r8", "r9", "r10"
189  );
190 
191 
192  int getem = num_bytes % 16;
193 
194 
195  for(; getem > 0; getem -= 8) {
196 
197 
198  *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
199 
200  }
201 
202  return;
203 }
204 #endif
205 
206 #if LV_HAVE_SSE && LV_HAVE_32
207 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
208 
209  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
210 
211  int bound = num_bytes >> 4;
212  int leftovers = num_bytes % 16;
213 
214 
215  asm volatile
216  (
217  " #pushl %%ebp\n\t"
218  " #movl %%esp, %%ebp\n\t"
219  " #movl 12(%%ebp), %%eax # input\n\t"
220  " #movl 16(%%ebp), %%edx # taps\n\t"
221  " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
222  " movaps 0(%[conjugator]), %%xmm1\n\t"
223  " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
224  " movaps 0(%[eax]), %%xmm0\n\t"
225  " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
226  " movaps 0(%[edx]), %%xmm2\n\t"
227  " movl %[ecx], (%[out])\n\t"
228  " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
229 
230  " xorps %%xmm1, %%xmm2\n\t"
231  " jmp .%=L1_test\n\t"
232  " # 4 taps / loop\n\t"
233  " # something like ?? cycles / loop\n\t"
234  ".%=Loop1: \n\t"
235  "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
236  "# movaps (%[eax]), %%xmmA\n\t"
237  "# movaps (%[edx]), %%xmmB\n\t"
238  "# movaps %%xmmA, %%xmmZ\n\t"
239  "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
240  "# mulps %%xmmB, %%xmmA\n\t"
241  "# mulps %%xmmZ, %%xmmB\n\t"
242  "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
243  "# xorps %%xmmPN, %%xmmA\n\t"
244  "# movaps %%xmmA, %%xmmZ\n\t"
245  "# unpcklps %%xmmB, %%xmmA\n\t"
246  "# unpckhps %%xmmB, %%xmmZ\n\t"
247  "# movaps %%xmmZ, %%xmmY\n\t"
248  "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
249  "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
250  "# addps %%xmmZ, %%xmmA\n\t"
251  "# addps %%xmmA, %%xmmC\n\t"
252  "# A=xmm0, B=xmm2, Z=xmm4\n\t"
253  "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
254  " movaps 16(%[edx]), %%xmm3\n\t"
255  " movaps %%xmm0, %%xmm4\n\t"
256  " xorps %%xmm1, %%xmm3\n\t"
257  " mulps %%xmm2, %%xmm0\n\t"
258  " movaps 16(%[eax]), %%xmm1\n\t"
259  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
260  " movaps %%xmm1, %%xmm5\n\t"
261  " addps %%xmm0, %%xmm6\n\t"
262  " mulps %%xmm3, %%xmm1\n\t"
263  " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
264  " addps %%xmm1, %%xmm6\n\t"
265  " movaps 0(%[conjugator]), %%xmm1\n\t"
266  " mulps %%xmm4, %%xmm2\n\t"
267  " movaps 32(%[eax]), %%xmm0\n\t"
268  " addps %%xmm2, %%xmm7\n\t"
269  " mulps %%xmm5, %%xmm3\n\t"
270  " addl $32, %[eax]\n\t"
271  " movaps 32(%[edx]), %%xmm2\n\t"
272  " addps %%xmm3, %%xmm7\n\t"
273  " xorps %%xmm1, %%xmm2\n\t"
274  " addl $32, %[edx]\n\t"
275  ".%=L1_test:\n\t"
276  " decl %[ecx]\n\t"
277  " jge .%=Loop1\n\t"
278  " # We've handled the bulk of multiplies up to here.\n\t"
279  " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
280  " # If so, we've got 2 more taps to do.\n\t"
281  " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
282  " shrl $4, %[ecx]\n\t"
283  " andl $1, %[ecx]\n\t"
284  " je .%=Leven\n\t"
285  " # The count was odd, do 2 more taps.\n\t"
286  " # Note that we've already got mm0/mm2 preloaded\n\t"
287  " # from the main loop.\n\t"
288  " movaps %%xmm0, %%xmm4\n\t"
289  " mulps %%xmm2, %%xmm0\n\t"
290  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
291  " addps %%xmm0, %%xmm6\n\t"
292  " mulps %%xmm4, %%xmm2\n\t"
293  " addps %%xmm2, %%xmm7\n\t"
294  ".%=Leven:\n\t"
295  " # neg inversor\n\t"
296  " #movl 8(%%ebp), %[eax] \n\t"
297  " xorps %%xmm1, %%xmm1\n\t"
298  " movl $0x80000000, (%[out])\n\t"
299  " movss (%[out]), %%xmm1\n\t"
300  " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
301  " # pfpnacc\n\t"
302  " xorps %%xmm1, %%xmm6\n\t"
303  " movaps %%xmm6, %%xmm2\n\t"
304  " unpcklps %%xmm7, %%xmm6\n\t"
305  " unpckhps %%xmm7, %%xmm2\n\t"
306  " movaps %%xmm2, %%xmm3\n\t"
307  " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
308  " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
309  " addps %%xmm2, %%xmm6\n\t"
310  " # xmm6 = r1 i2 r3 i4\n\t"
311  " #movl 8(%%ebp), %[eax] # @result\n\t"
312  " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
313  " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
314  " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
315  " #popl %%ebp\n\t"
316  :
317  : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
318  );
319 
320 
321 
322 
323  printf("%d, %d\n", leftovers, bound);
324 
325  for(; leftovers > 0; leftovers -= 8) {
326 
327 
328  *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
329 
330  }
331 
332  return;
333 
334 
335 
336 
337 
338 
339 }
340 
341 #endif /*LV_HAVE_SSE*/
342 
343 
344 
345 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/