GNU Radio 3.6.3.1 C++ API
volk_16i_x4_quad_max_star_16i_a.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
2 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
3 
4 
5 #include<inttypes.h>
6 #include<stdio.h>
7 
8 
9 
10 
11 
12 #ifdef LV_HAVE_SSE2
13 
14 #include<emmintrin.h>
15 
/*!
 * \brief Element-wise "max star" selection over four 16-bit vectors (SSE2, aligned).
 *
 * For every index i the function computes
 *
 *   a         = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]
 *   b         = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]
 *   target[i] = ((short)(a - b) > 0) ? a : b
 *
 * Each comparison is made on the difference truncated to 16 bits, so when the
 * true difference does not fit in a short the choice follows the wrapped value
 * (assuming the usual two's-complement conversion to short).
 *
 * The vector loop and the scalar tail apply exactly the same selection rule,
 * so the result for an element does not depend on whether it falls into a
 * full 8-element group or into the remainder.
 *
 * All five buffers are accessed with aligned 128-bit loads/stores and must
 * therefore be 16-byte aligned.
 *
 * \param target    output buffer, num_bytes / 2 elements are written
 * \param src0      first input of the first pair
 * \param src1      second input of the first pair
 * \param src2      first input of the second pair
 * \param src3      second input of the second pair
 * \param num_bytes size of each buffer in bytes; a trailing odd byte is ignored
 */
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {

  const unsigned int num_points = num_bytes >> 1; /* 16-bit elements in total   */
  const unsigned int num_blocks = num_bytes >> 4; /* full groups of 8 elements  */
  unsigned int block;
  unsigned int i;

  __m128i* p_target = (__m128i*)target;
  const __m128i* p_src0 = (const __m128i*)src0;
  const __m128i* p_src1 = (const __m128i*)src1;
  const __m128i* p_src2 = (const __m128i*)src2;
  const __m128i* p_src3 = (const __m128i*)src3;

  const __m128i zero = _mm_setzero_si128();

  for(block = 0; block < num_blocks; ++block) {
    const __m128i v0 = _mm_load_si128(p_src0 + block);
    const __m128i v1 = _mm_load_si128(p_src1 + block);
    const __m128i v2 = _mm_load_si128(p_src2 + block);
    const __m128i v3 = _mm_load_si128(p_src3 + block);

    /* All-ones in the lanes where the wrapped 16-bit difference is > 0,
     * i.e. where the first operand of the pair has to be kept. */
    const __m128i keep0 = _mm_cmpgt_epi16(_mm_sub_epi16(v0, v1), zero);
    const __m128i keep2 = _mm_cmpgt_epi16(_mm_sub_epi16(v2, v3), zero);

    /* Branch-free select: (mask & first) | (~mask & second). */
    const __m128i max01 = _mm_or_si128(_mm_and_si128(keep0, v0),
                                       _mm_andnot_si128(keep0, v1));
    const __m128i max23 = _mm_or_si128(_mm_and_si128(keep2, v2),
                                       _mm_andnot_si128(keep2, v3));

    /* Same rule once more on the two intermediate winners. */
    const __m128i keep01 = _mm_cmpgt_epi16(_mm_sub_epi16(max01, max23), zero);
    const __m128i result = _mm_or_si128(_mm_and_si128(keep01, max01),
                                        _mm_andnot_si128(keep01, max23));

    _mm_store_si128(p_target + block, result);
  }

  /* Scalar tail for the elements that do not fill a whole 128-bit group. */
  for(i = num_blocks * 8; i < num_points; ++i) {
    const short max01 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
    const short max23 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
    target[i] = ((short)(max01 - max23) > 0) ? max01 : max23;
  }
}
166 
167 #endif /*LV_HAVE_SSE2*/
168 
169 
170 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Element-wise "max star" selection over four 16-bit vectors (portable C).
 *
 * For every index i the function computes
 *
 *   a         = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]
 *   b         = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]
 *   target[i] = ((short)(a - b) > 0) ? a : b
 *
 * Each comparison is made on the difference truncated to 16 bits, so when the
 * true difference does not fit in a short the choice follows the wrapped value
 * (assuming the usual two's-complement conversion to short).
 *
 * \param target    output buffer, num_bytes / 2 elements are written
 * \param src0      first input of the first pair
 * \param src1      second input of the first pair
 * \param src2      first input of the second pair
 * \param src3      second input of the second pair
 * \param num_bytes size of each buffer in bytes; a trailing odd byte is ignored
 */
static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {

  /* Two bytes per element; kept unsigned to match the type of num_bytes. */
  const unsigned int num_points = num_bytes >> 1;
  unsigned int i;

  for(i = 0; i < num_points; ++i) {
    const short max01 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
    const short max23 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
    target[i] = ((short)(max01 - max23) > 0) ? max01 : max23;
  }
}
185 
186 
187 
188 
189 #endif /*LV_HAVE_GENERIC*/
190 
191 #endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/