Vector Optimized Library of Kernels 3.1.1
Architecture-tuned implementations of math kernels

volk_8u_x4_conv_k7_r2_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

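/* decision_t holds the 64 survivor decision bits produced at one trellis step
 * (one bit per state); the union lets the kernels below address the same 8
 * bytes as bytes, shorts, or ints to match their store width. */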
typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif

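/* Subtract the minimum path metric from every state so the unsigned 8-bit
 * metrics stay small and never saturate over a long frame. */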
static inline void renormalize(unsigned char* X)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
}

// BFLY: one add-compare-select butterfly for the GENERIC version. It updates
// the successor metrics Y[2*i] and Y[2*i+1] from predecessor states i and
// i + NUMSTATES/2, and packs the two decision bits into d.
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j;
    unsigned int decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    unsigned short metricsum;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1;
    int PRECISIONSHIFT = 2;

    metricsum = 1;
    for (j = 0; j < RATE; j++)
        metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
    metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) >= 0;
    decision1 = (signed int)(m2 - m3) >= 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}


#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>

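/* AVX2 kernel: each loop pass consumes four input symbols (two rate-1/2 bits)
 * and keeps all 64 path metrics in two 256-bit registers. */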
static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        int s20, s21;
        unsigned char *a80, *b6;
        int *a110, *a91, *a93;
        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
        __m256i a86, a87;
        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26,
            s18, s19, s22, s23, s24, s25, t13, t14, t15;
        a71 = ((__m256i*)X);
        s18 = *(a71);
        a72 = (a71 + 1);
        s19 = *(a72);
        a73 = (4 * i9);
        b6 = (syms + a73);
        a75 = *(b6);
        a76 = _mm256_set1_epi8(a75);
        a77 = ((__m256i*)Branchtab);
        a78 = *(a77);
        a79 = _mm256_xor_si256(a76, a78);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm256_set1_epi8(a81);
        a83 = (a77 + 1);
        a84 = *(a83);
        a85 = _mm256_xor_si256(a82, a84);
        t13 = _mm256_avg_epu8(a79, a85);
        a86 = ((__m256i)t13);
        a87 = _mm256_srli_epi16(a86, 2);
        a88 = ((__m256i)a87);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        a91 = ((int*)dec);
        a92 = (4 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        a110 = (a93 + 1);
        *(a110) = s21;
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        a95 = ((__m256i*)Y);
        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
        *(a95) = s24;
        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
        a112 = (a95 + 1);
        *(a112) = s23;

        __m256i m5, m6;
        m5 = ((__m256i*)Y)[0];
        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
        m5 = ((__m256i)_mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5));
        __m256i m7;
        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
                                       ((__m256i)m7)));
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
                                       ((__m256i)m7)));
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
                                       ((__m256i)m7)));
        m7 = _mm256_unpacklo_epi8(m7, m7);
        m7 = _mm256_shufflelo_epi16(m7, 0);
        m6 = _mm256_unpacklo_epi64(m7, m7);
        m6 = _mm256_permute2x128_si256(
            m6, m6, 0); // copy lower half of m6 to upper half, since above ops
                        // operate on 128 bit lanes
        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);

        unsigned char a188, a194;
        int a205;
        int s48, s54;
        unsigned char *a187, *a193;
        int *a204, *a206, *a223, *b16;
        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
        __m256i a199, a200;
        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40,
            m41, m42, s46, s47, s50, s51, t25, t26, t27;
        a184 = ((__m256i*)Y);
        s46 = *(a184);
        a185 = (a184 + 1);
        s47 = *(a185);
        a187 = (b6 + 2);
        a188 = *(a187);
        a189 = _mm256_set1_epi8(a188);
        a190 = ((__m256i*)Branchtab);
        a191 = *(a190);
        a192 = _mm256_xor_si256(a189, a191);
        a193 = (b6 + 3);
        a194 = *(a193);
        a195 = _mm256_set1_epi8(a194);
        a196 = (a190 + 1);
        a197 = *(a196);
        a198 = _mm256_xor_si256(a195, a197);
        t25 = _mm256_avg_epu8(a192, a198);
        a199 = ((__m256i)t25);
        a200 = _mm256_srli_epi16(a199, 2);
        a201 = ((__m256i)a200);
        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
        m39 = _mm256_adds_epu8(s46, t26);
        m40 = _mm256_adds_epu8(s47, t27);
        m41 = _mm256_adds_epu8(s46, t27);
        m42 = _mm256_adds_epu8(s47, t26);
        a202 = _mm256_min_epu8(m40, m39);
        d17 = _mm256_cmpeq_epi8(a202, m40);
        a203 = _mm256_min_epu8(m42, m41);
        d18 = _mm256_cmpeq_epi8(a203, m42);
        s24 = _mm256_unpacklo_epi8(d17, d18);
        s25 = _mm256_unpackhi_epi8(d17, d18);
        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
        a204 = ((int*)dec);
        a205 = (4 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 2);
        *(a206) = s48;
        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
        a223 = (b16 + 3);
        *(a223) = s54;
        s50 = _mm256_unpacklo_epi8(a202, a203);
        s51 = _mm256_unpackhi_epi8(a202, a203);
        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
        a208 = ((__m256i*)X);
        *(a208) = s25;
        a225 = (a208 + 1);
        *(a225) = s51;

        __m256i m12, m13;
        m12 = ((__m256i*)X)[0];
        m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
        m12 = ((__m256i)_mm256_min_epu8(_mm256_permute2x128_si256(m12, m12, 0x21), m12));
        __m256i m14;
        m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
                                        ((__m256i)m14)));
        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
                                        ((__m256i)m14)));
        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
                                        ((__m256i)m14)));
        m14 = _mm256_unpacklo_epi8(m14, m14);
        m14 = _mm256_shufflelo_epi16(m14, 0);
        m13 = _mm256_unpacklo_epi64(m14, m14);
        m13 = _mm256_permute2x128_si256(m13, m13, 0);
        ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
        ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
    }

    renormalize(X);

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y);
    }
    /*skip*/
}

#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

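/* SSE3 kernel (Spiral-generated): same two-bits-per-pass structure as the
 * AVX2 version, with the 64 path metrics spread across four 128-bit
 * registers. */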
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;

        __m128i m5, m6;
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        __m128i m7;
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;

        __m128i m12, m13;
        m12 = ((__m128i*)X)[0];
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
        __m128i m14;
        m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14)));
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14)));
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14)));
        m14 = _mm_unpacklo_epi8(m14, m14);
        m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
        m13 = _mm_unpacklo_epi64(m14, m14);
        ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
        ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
        ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
        ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
    }

    renormalize(X);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_NEON

#include <volk/sse2neon.h>

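/* NEON kernel: byte-for-byte the Spiral-generated SSE code above; sse2neon.h
 * maps each _mm_* intrinsic onto its NEON equivalent. */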
static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                       unsigned char* X,
                                                       unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits,
                                                       unsigned int excess,
                                                       unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;

        __m128i m5, m6;
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        __m128i m7;
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;

        __m128i m12, m13;
        m12 = ((__m128i*)X)[0];
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
        __m128i m14;
        m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14)));
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14)));
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14)));
        m14 = _mm_unpacklo_epi8(m14, m14);
        m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
        m13 = _mm_unpacklo_epi64(m14, m14);
        ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
        ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
        ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
        ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
    }

    renormalize(X);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_NEON*/

#if LV_HAVE_GENERIC

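/* Portable reference kernel: 32 BFLY butterflies per input bit, then a
 * renormalize pass; X and Y swap roles after every trellis step. */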
static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y);

        // swap pointers to the old and new metric buffers
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
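
/*
 * Usage sketch (illustrative addition, not part of the upstream header):
 * driving this kernel for a K=7, rate-1/2 Viterbi decode through VOLK's
 * generated dispatcher volk_8u_x4_conv_k7_r2_8u(). The tail-bit count and
 * the Branchtab fill are assumptions; a real decoder derives Branchtab from
 * its code polynomials and runs a chainback over dec afterwards.
 */
#if 0 /* example only */
#include <string.h>
#include <volk/volk.h>

static void example_decode(const unsigned char* syms, unsigned int framebits)
{
    const unsigned int excess = 6; /* assumed K - 1 flush bits */
    size_t align = volk_get_alignment();
    unsigned char* X = (unsigned char*)volk_malloc(64, align); /* old metrics */
    unsigned char* Y = (unsigned char*)volk_malloc(64, align); /* new metrics */
    /* 64 decision bits = 8 bytes per decoded bit; BFLY ORs into dec, so zero it */
    unsigned char* dec =
        (unsigned char*)volk_malloc(8 * (framebits + excess), align);
    unsigned char Branchtab[64]; /* expected symbols per state, filled elsewhere */

    memset(X, 0, 64);
    memset(dec, 0, 8 * (framebits + excess));

    volk_8u_x4_conv_k7_r2_8u(
        Y, X, (unsigned char*)syms, dec, framebits, excess, Branchtab);

    /* ...chainback over dec to recover the decoded bits... */
    volk_free(X);
    volk_free(Y);
    volk_free(dec);
}
#endif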