GNU Radio 3.6.3 C++ API
volk_32fc_x2_conjugate_dot_prod_32fc_u.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
2 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
3 
4 
5 #include<volk/volk_complex.h>
6 
7 
8 #ifdef LV_HAVE_GENERIC
9 
10 
11 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
12 
13  float * res = (float*) result;
14  float * in = (float*) input;
15  float * tp = (float*) taps;
16  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
17  unsigned int isodd = (num_bytes >> 3) &1;
18 
19 
20 
21  float sum0[2] = {0,0};
22  float sum1[2] = {0,0};
23  unsigned int i = 0;
24 
25 
26  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
27 
28  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
29  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
30  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
31  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
32 
33 
34  in += 4;
35  tp += 4;
36 
37  }
38 
39 
40  res[0] = sum0[0] + sum1[0];
41  res[1] = sum0[1] + sum1[1];
42 
43 
44 
45  for(i = 0; i < isodd; ++i) {
46 
47 
48  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
49 
50  }
51  /*
52  for(i = 0; i < num_bytes >> 3; ++i) {
53  *result += input[i] * conjf(taps[i]);
54  }
55  */
56 }
57 
58 #endif /*LV_HAVE_GENERIC*/
59 
60 #ifdef LV_HAVE_SSE3
61 
62 #include <xmmintrin.h>
63 #include <pmmintrin.h>
64 #include <mmintrin.h>
65 
66 
67 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
68 
69  // Variable never used?
70  //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
71 
72  union HalfMask {
73  uint32_t intRep[4];
74  __m128 vec;
75  } halfMask;
76 
77  union NegMask {
78  int intRep[4];
79  __m128 vec;
80  } negMask;
81 
82  unsigned int offset = 0;
83  float Rsum=0, Isum=0;
84  float Im,Re;
85 
86  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
87  __m128 zv = {0,0,0,0};
88 
89  halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
90  halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
91 
92  negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
93  negMask.intRep[1] = negMask.intRep[3] = 0;
94 
95  // main loop
96  while(num_bytes >= 4*sizeof(float)){
97 
98  in1 = _mm_loadu_ps( (float*) (input+offset) );
99  in2 = _mm_loadu_ps( (float*) (taps+offset) );
100  Rv = _mm_mul_ps(in1, in2);
101  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
102  Iv = _mm_mul_ps(in1, fehg);
103  Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
104  Ivm = _mm_xor_ps( negMask.vec, Iv );
105  Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
106  _mm_store_ss( &Im, Is );
107  _mm_store_ss( &Re, Rs );
108  num_bytes -= 4*sizeof(float);
109  offset += 2;
110  Rsum += Re;
111  Isum += Im;
112  }
113 
114  // handle the last complex case ...
115  if(num_bytes > 0){
116 
117  if(num_bytes != 4){
118  // bad things are happening
119  }
120 
121  in1 = _mm_loadu_ps( (float*) (input+offset) );
122  in2 = _mm_loadu_ps( (float*) (taps+offset) );
123  Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
124  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
125  Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
126  Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
127  Ivm = _mm_xor_ps( negMask.vec, Iv );
128  Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
129  _mm_store_ss( &Im, Is );
130  _mm_store_ss( &Re, Rs );
131  Rsum += Re;
132  Isum += Im;
133  }
134 
135  result[0] = lv_cmake(Rsum,Isum);
136  return;
137 }
138 
139 #endif /*LV_HAVE_SSE3*/
140 
141 
142 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
143 
144 
145