gr-baz Package
volk_16i_permute_and_scalar_add_a.h
#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
#define INCLUDED_volk_16i_permute_and_scalar_add_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2

#include <xmmintrin.h>
#include <emmintrin.h>

static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {

  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;

  short* p_permute_indexes = permute_indexes;

  p_target = (__m128i*)target;
  p_cntl0 = (__m128i*)cntl0;
  p_cntl1 = (__m128i*)cntl1;
  p_cntl2 = (__m128i*)cntl2;
  p_cntl3 = (__m128i*)cntl3;
  p_scalars = (__m128i*)scalars;

  int i = 0;

  /* bound: number of full 8-short (128-bit) blocks; leftovers: remaining shorts */
  int bound = (num_bytes >> 4);
  int leftovers = (num_bytes >> 1) & 7;

  xmm0 = _mm_load_si128(p_scalars);

  /* broadcast scalars[0..3] across all eight 16-bit lanes of xmm1..xmm4 */
  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);

  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);

  for(; i < bound; ++i) {
    xmm0 = _mm_setzero_si128();
    xmm5 = _mm_setzero_si128();
    xmm6 = _mm_setzero_si128();
    xmm7 = _mm_setzero_si128();

    /* gather the eight permuted samples into four zeroed registers, one lane
       per insert, then sum the registers to reassemble the full vector */
    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);

    xmm0 = _mm_add_epi16(xmm0, xmm5);
    xmm6 = _mm_add_epi16(xmm6, xmm7);

    p_permute_indexes += 8;

    xmm0 = _mm_add_epi16(xmm0, xmm6);

    /* mask each broadcast scalar with its control vector and accumulate */
    xmm5 = _mm_load_si128(p_cntl0);
    xmm6 = _mm_load_si128(p_cntl1);
    xmm7 = _mm_load_si128(p_cntl2);

    xmm5 = _mm_and_si128(xmm5, xmm1);
    xmm6 = _mm_and_si128(xmm6, xmm2);
    xmm7 = _mm_and_si128(xmm7, xmm3);

    xmm0 = _mm_add_epi16(xmm0, xmm5);

    xmm5 = _mm_load_si128(p_cntl3);

    xmm6 = _mm_add_epi16(xmm6, xmm7);

    p_cntl0 += 1;

    xmm5 = _mm_and_si128(xmm5, xmm4);

    xmm0 = _mm_add_epi16(xmm0, xmm6);

    p_cntl1 += 1;
    p_cntl2 += 1;

    xmm0 = _mm_add_epi16(xmm0, xmm5);

    p_cntl3 += 1;

    _mm_store_si128(p_target, xmm0);

    p_target += 1;
  }

  /* handle any remaining (non-multiple-of-8) shorts scalar-wise */
  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
    target[i] = src0[permute_indexes[i]]
      + (cntl0[i] & scalars[0])
      + (cntl1[i] & scalars[1])
      + (cntl2[i] & scalars[2])
      + (cntl3[i] & scalars[3]);
  }
}
#endif /*LV_HAVE_SSE2*/

#ifdef LV_HAVE_GENERIC
static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {

  int i = 0;

  int bound = num_bytes >> 1;

  /* each output is the permuted sample plus every scalar whose control mask is all ones */
  for(i = 0; i < bound; ++i) {
    target[i] = src0[permute_indexes[i]]
      + (cntl0[i] & scalars[0])
      + (cntl1[i] & scalars[1])
      + (cntl2[i] & scalars[2])
      + (cntl3[i] & scalars[3]);
  }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/
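
A minimal usage sketch for the generic kernel follows. It is not part of the header: the buffer sizes, test values, the use of C11 aligned_alloc, and the assumption that the header is available as "volk_16i_permute_and_scalar_add_a.h" on the include path are all illustrative. Each output element is the permuted input sample plus whichever of the four scalars its all-ones control mask enables. The _a kernels expect 16-byte-aligned buffers on the SSE2 path, and the SSE2 variant loads a full 128-bit vector from scalars, so that array is padded and aligned here as well.

/* Hypothetical standalone test driver (not part of the header).
   LV_HAVE_GENERIC is defined by hand here; in a real VOLK build the
   available-architecture macros come from the build system. */
#define LV_HAVE_GENERIC
#include "volk_16i_permute_and_scalar_add_a.h"

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    enum { NUM_POINTS = 16 };                     /* 16 shorts = 32 bytes */
    const unsigned int num_bytes = NUM_POINTS * sizeof(short);

    /* 16-byte alignment satisfies the SSE2 path's _mm_load_si128/_mm_store_si128;
       C11 aligned_alloc is one way to provide it. */
    short* target  = aligned_alloc(16, num_bytes);
    short* src0    = aligned_alloc(16, num_bytes);
    short* permute = aligned_alloc(16, num_bytes);
    short* cntl0   = aligned_alloc(16, num_bytes);
    short* cntl1   = aligned_alloc(16, num_bytes);
    short* cntl2   = aligned_alloc(16, num_bytes);
    short* cntl3   = aligned_alloc(16, num_bytes);

    /* keep scalars 8 shorts long and aligned, since the SSE2 variant reads a
       full 128-bit vector from it even though only the first four are used */
    _Alignas(16) short scalars[8] = { 10, 20, 30, 40, 0, 0, 0, 0 };

    for (int i = 0; i < NUM_POINTS; ++i) {
        src0[i]    = (short)i;
        permute[i] = (short)(NUM_POINTS - 1 - i); /* reverse the input */
        cntl0[i]   = (i % 2) ? (short)-1 : 0;     /* add scalars[0] on odd lanes */
        cntl1[i]   = 0;
        cntl2[i]   = 0;
        cntl3[i]   = 0;
    }

    volk_16i_permute_and_scalar_add_a_generic(target, src0, permute,
                                              cntl0, cntl1, cntl2, cntl3,
                                              scalars, num_bytes);

    for (int i = 0; i < NUM_POINTS; ++i)
        printf("%d ", target[i]);
    printf("\n");

    free(target); free(src0); free(permute);
    free(cntl0);  free(cntl1); free(cntl2); free(cntl3);
    return 0;
}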