GNU Radio 3.6.4 C++ API
volk_16i_max_star_horizontal_16i_a.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
2 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
3 
4 #include <volk/volk_common.h>
5 
6 #include<inttypes.h>
7 #include<stdio.h>
8 
9 
10 #ifdef LV_HAVE_SSSE3
11 
12 #include<xmmintrin.h>
13 #include<emmintrin.h>
14 #include<tmmintrin.h>
15 
/*!
 * \brief Pairwise ("horizontal") max-star of adjacent 16-bit values, SSSE3 version.
 *
 * For every complete pair (src0[2k], src0[2k+1]) the 16-bit wrapped difference
 * src0[2k] - src0[2k+1] is formed; target[k] receives src0[2k+1] when that
 * difference is negative and src0[2k] otherwise.  Because the difference
 * wraps at 16 bits, operands more than 32767 apart compare with wraparound.
 *
 * \param target    output buffer; the main loop uses 16-byte aligned stores
 * \param src0      input buffer; read with 16-byte aligned loads
 * \param num_bytes size of the input in bytes (two bytes per element); a
 *                  trailing element without a partner is ignored
 */
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {

  /* Shuffle controls that gather the first element of each pair (bytes 0,1,
   * 4,5, 8,9, 12,13) into the low or the high half of the result; a control
   * byte with the top bit set (-1) makes _mm_shuffle_epi8 write zero.
   * Built in registers so no aligned load of a byte table is needed. */
  const __m128i shuf_lo = _mm_setr_epi8(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
                                        -1, -1, -1, -1, -1, -1, -1, -1);
  const __m128i shuf_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                        0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d);
  /* Adding 2 to a control byte moves the selection from the first to the
   * second element of the pair. */
  const __m128i step_lo = _mm_setr_epi8(0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
                                        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const __m128i step_hi = _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02);
  const __m128i zero = _mm_setzero_si128();

  const __m128i* p_src0 = (const __m128i*)src0;
  __m128i* p_target = (__m128i*)target;

  const unsigned int bound = num_bytes >> 5;               /* 32-byte input steps      */
  const unsigned int intermediate = (num_bytes >> 4) & 1;  /* one extra 16-byte step?  */
  const unsigned int leftovers = (num_bytes >> 1) & 7;     /* remaining 16-bit inputs  */

  unsigned int i;

  /* 16 inputs -> 8 outputs per iteration. */
  for(i = 0; i < bound; ++i) {
    __m128i in0 = _mm_load_si128(p_src0);
    __m128i in1 = _mm_load_si128(p_src0 + 1);

    /* Lanes 0-3: differences of the pairs in in0; lanes 4-7: those of in1. */
    const __m128i diff = _mm_hsub_epi16(in0, in1);
    /* All-ones in every lane whose difference is negative. */
    const __m128i take_second = _mm_cmpgt_epi16(zero, diff);

    const __m128i sel0 = _mm_add_epi8(_mm_and_si128(take_second, step_lo), shuf_lo);
    const __m128i sel1 = _mm_add_epi8(_mm_and_si128(take_second, step_hi), shuf_hi);

    in0 = _mm_shuffle_epi8(in0, sel0);  /* winners of in0 in the low half  */
    in1 = _mm_shuffle_epi8(in1, sel1);  /* winners of in1 in the high half */

    /* The halves are disjoint (the other half is zero), so add merges them. */
    _mm_store_si128(p_target, _mm_add_epi16(in0, in1));

    p_src0 += 2;
    p_target += 1;
  }

  /* 8 inputs -> 4 outputs (at most once). */
  if(intermediate) {
    __m128i in0 = _mm_load_si128(p_src0);

    /* Only lanes 0-3 are used; the second operand is a defined dummy. */
    const __m128i diff = _mm_hsub_epi16(in0, zero);
    const __m128i take_second = _mm_cmpgt_epi16(zero, diff);
    const __m128i sel0 = _mm_add_epi8(_mm_and_si128(take_second, step_lo), shuf_lo);

    in0 = _mm_shuffle_epi8(in0, sel0);

    /* Store just the low 8 bytes (four results). */
    _mm_storel_epi64(p_target, in0);
  }

  /* Scalar tail over the remaining complete pairs. */
  {
    const unsigned int tail_begin = (bound << 4) + (intermediate << 3);
    const unsigned int tail_end = tail_begin + leftovers;

    for(i = tail_begin; i + 1 < tail_end; i += 2) {
      target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
  }
}
108 
109 #endif /*LV_HAVE_SSSE3*/
110 
111 
112 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Pairwise ("horizontal") max-star of adjacent 16-bit values, portable C version.
 *
 * For every complete pair (src0[2k], src0[2k+1]) target[k] receives src0[2k]
 * when the difference src0[2k] - src0[2k+1], truncated to 16 bits, is
 * positive, and src0[2k+1] otherwise.  Because of the truncation, operands
 * more than 32767 apart compare with wraparound.
 *
 * \param target    output buffer, one element per input pair
 * \param src0      input buffer
 * \param num_bytes size of the input in bytes (two bytes per element); a
 *                  trailing element without a partner is ignored
 */
static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {

  const unsigned int num_points = num_bytes >> 1;
  unsigned int i;

  /* "i + 1 < num_points" keeps src0[i + 1] inside the input even when the
   * element count is odd. */
  for(i = 0; i + 1 < num_points; i += 2) {
    target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
  }

}
125 
126 
127 
128 #endif /*LV_HAVE_GENERIC*/
129 
130 #endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/