Vector Optimized Library of Kernels (VOLK) 2.4
Architecture-tuned implementations of math kernels
volk_8u_x2_encodeframepolar_8u.h
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
25  */
26 
27 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
28 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
29 #include <string.h>
30 
31 static inline unsigned int log2_of_power_of_2(unsigned int val)
32 {
33  // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
34  static const unsigned int b[] = {
35  0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
36  };
37 
38  unsigned int res = (val & b[0]) != 0;
39  res |= ((val & b[4]) != 0) << 4;
40  res |= ((val & b[3]) != 0) << 3;
41  res |= ((val & b[2]) != 0) << 2;
42  res |= ((val & b[1]) != 0) << 1;
43  return res;
44 }
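/*
 * Illustration only: the table-driven bit hack above needs to be exact only
 * for powers of two, which is all this kernel ever passes in. An equivalent,
 * more readable loop form (the name log2_of_power_of_2_loop is hypothetical):
 */
static inline unsigned int log2_of_power_of_2_loop(unsigned int val)
{
    /* count how often val can be halved before it reaches zero;
     * for val = 2^k this yields exactly k, e.g. 256 -> 8. */
    unsigned int res = 0;
    while (val >>= 1) {
        ++res;
    }
    return res;
}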
45 
46 static inline void encodepolar_single_stage(unsigned char* frame_ptr,
47  const unsigned char* temp_ptr,
48  const unsigned int num_branches,
49  const unsigned int frame_half)
50 {
51  unsigned int branch, bit;
52  for (branch = 0; branch < num_branches; ++branch) {
53  for (bit = 0; bit < frame_half; ++bit) {
54  *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
55  *(frame_ptr + frame_half) = *(temp_ptr + 1);
56  ++frame_ptr;
57  temp_ptr += 2;
58  }
59  frame_ptr += frame_half;
60  }
61 }
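/*
 * Worked example (hypothetical helper, illustration only): with frame_size = 4
 * there is a single branch and frame_half = 2, so the XORed pairs land in the
 * lower half of the frame and the pass-through bits in the upper half.
 */
static inline void encodepolar_single_stage_example(void)
{
    const unsigned char temp[4] = { 1, 0, 1, 1 };
    unsigned char frame[4];
    encodepolar_single_stage(frame, temp, 1, 2); /* one branch, frame_half = 2 */
    /* frame now holds { 1 ^ 0, 1 ^ 1, 0, 1 } == { 1, 0, 0, 1 } */
}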
62 
63 #ifdef LV_HAVE_GENERIC
64 
65 static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
66  unsigned char* temp,
67  unsigned int frame_size)
68 {
69  unsigned int stage = log2_of_power_of_2(frame_size);
70  unsigned int frame_half = frame_size >> 1;
71  unsigned int num_branches = 1;
72 
73  while (stage) {
74  // encode stage
75  encodepolar_single_stage(frame, temp, num_branches, frame_half);
76  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
77 
78  // update all the parameters.
79  num_branches = num_branches << 1;
80  frame_half = frame_half >> 1;
81  --stage;
82  }
83 }
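/*
 * Usage sketch for the generic kernel (encode_with_generic is a hypothetical
 * helper, not part of VOLK): the input bits go into temp, the kernel uses temp
 * as scratch and overwrites it, and the encoded code word ends up in frame.
 */
#include <stdlib.h>

static void encode_with_generic(const unsigned char* bits,
                                unsigned char* frame,
                                unsigned int frame_size /* power of two */)
{
    unsigned char* temp = (unsigned char*)malloc(frame_size);
    memcpy(temp, bits, frame_size); /* scratch copy; the kernel destroys it */
    volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
    free(temp);
}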
84 #endif /* LV_HAVE_GENERIC */
85 
86 #ifdef LV_HAVE_SSSE3
87 #include <tmmintrin.h>
88 
89 static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
90  unsigned char* temp,
91  unsigned int frame_size)
92 {
93  const unsigned int po2 = log2_of_power_of_2(frame_size);
94 
95  unsigned int stage = po2;
96  unsigned char* frame_ptr = frame;
97  unsigned char* temp_ptr = temp;
98 
99  unsigned int frame_half = frame_size >> 1;
100  unsigned int num_branches = 1;
101  unsigned int branch;
102  unsigned int bit;
103 
104  // prepare constants
105  const __m128i mask_stage1 = _mm_set_epi8(0x0,
106  0xFF,
107  0x0,
108  0xFF,
109  0x0,
110  0xFF,
111  0x0,
112  0xFF,
113  0x0,
114  0xFF,
115  0x0,
116  0xFF,
117  0x0,
118  0xFF,
119  0x0,
120  0xFF);
121 
122  // get some SIMD registers to play with.
123  __m128i r_frame0, r_temp0, shifted;
124 
125  {
126  __m128i r_frame1, r_temp1;
127  const __m128i shuffle_separate =
128  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
129 
130  while (stage > 4) {
131  frame_ptr = frame;
132  temp_ptr = temp;
133 
134  // At stage 5 a branch has 32 elements; higher stages are even larger.
135  for (branch = 0; branch < num_branches; ++branch) {
136  for (bit = 0; bit < frame_half; bit += 16) {
137  r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
138  temp_ptr += 16;
139  r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
140  temp_ptr += 16;
141 
142  shifted = _mm_srli_si128(r_temp0, 1);
143  shifted = _mm_and_si128(shifted, mask_stage1);
144  r_temp0 = _mm_xor_si128(shifted, r_temp0);
145  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
146 
147  shifted = _mm_srli_si128(r_temp1, 1);
148  shifted = _mm_and_si128(shifted, mask_stage1);
149  r_temp1 = _mm_xor_si128(shifted, r_temp1);
150  r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
151 
152  r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
153  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
154 
155  r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
156  _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
157  frame_ptr += 16;
158  }
159 
160  frame_ptr += frame_half;
161  }
162  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
163 
164  num_branches = num_branches << 1;
165  frame_half = frame_half >> 1;
166  stage--;
167  }
168  }
169 
170  // This last part requires frames of at least 16 elements.
171  // Smaller frames gain nothing from SIMD anyway; just use the GENERIC kernel.
172 
173  // reset pointers to correct positions.
174  frame_ptr = frame;
175  temp_ptr = temp;
176 
177  // prefetch first chunk
178  __VOLK_PREFETCH(temp_ptr);
179 
180  const __m128i shuffle_stage4 =
181  _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
182  const __m128i mask_stage4 = _mm_set_epi8(0x0,
183  0x0,
184  0x0,
185  0x0,
186  0x0,
187  0x0,
188  0x0,
189  0x0,
190  0xFF,
191  0xFF,
192  0xFF,
193  0xFF,
194  0xFF,
195  0xFF,
196  0xFF,
197  0xFF);
198  const __m128i mask_stage3 = _mm_set_epi8(0x0,
199  0x0,
200  0x0,
201  0x0,
202  0xFF,
203  0xFF,
204  0xFF,
205  0xFF,
206  0x0,
207  0x0,
208  0x0,
209  0x0,
210  0xFF,
211  0xFF,
212  0xFF,
213  0xFF);
214  const __m128i mask_stage2 = _mm_set_epi8(0x0,
215  0x0,
216  0xFF,
217  0xFF,
218  0x0,
219  0x0,
220  0xFF,
221  0xFF,
222  0x0,
223  0x0,
224  0xFF,
225  0xFF,
226  0x0,
227  0x0,
228  0xFF,
229  0xFF);
230 
231  for (branch = 0; branch < num_branches; ++branch) {
232  r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
233 
234  // prefetch next chunk
235  temp_ptr += 16;
236  __VOLK_PREFETCH(temp_ptr);
237 
238  // shuffle once for bit-reversal.
239  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
240 
241  shifted = _mm_srli_si128(r_temp0, 8);
242  shifted = _mm_and_si128(shifted, mask_stage4);
243  r_frame0 = _mm_xor_si128(shifted, r_temp0);
244 
245  shifted = _mm_srli_si128(r_frame0, 4);
246  shifted = _mm_and_si128(shifted, mask_stage3);
247  r_frame0 = _mm_xor_si128(shifted, r_frame0);
248 
249  shifted = _mm_srli_si128(r_frame0, 2);
250  shifted = _mm_and_si128(shifted, mask_stage2);
251  r_frame0 = _mm_xor_si128(shifted, r_frame0);
252 
253  shifted = _mm_srli_si128(r_frame0, 1);
254  shifted = _mm_and_si128(shifted, mask_stage1);
255  r_frame0 = _mm_xor_si128(shifted, r_frame0);
256 
257  // store result of chunk.
258  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
259  frame_ptr += 16;
260  }
261 }
262 
263 #endif /* LV_HAVE_SSSE3 */
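/*
 * The tail loop above folds the last four stages into a single pass per
 * 16-byte branch: shuffle_stage4 applies the bit-reversal permutation once,
 * then four masked shift-and-XOR steps realize the butterflies for
 * frame_half = 8, 4, 2 and 1 inside the register. Scalar sketch of one such
 * masked step on a plain 16-byte buffer (hypothetical helper, illustration only):
 */
static inline void final_stage_step_scalar(unsigned char* chunk /* 16 bytes */,
                                           unsigned int half /* 8, 4, 2 or 1 */)
{
    /* XOR the upper half of every 2*half-byte block into its lower half,
     * which is what the _mm_srli_si128 / _mm_and_si128 / _mm_xor_si128
     * triple does for the whole register at once. */
    unsigned int off, i;
    for (off = 0; off < 16; off += 2 * half) {
        for (i = 0; i < half; ++i) {
            chunk[off + i] ^= chunk[off + half + i];
        }
    }
}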
264 
265 #ifdef LV_HAVE_AVX2
266 #include <immintrin.h>
267 
268 static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
269  unsigned char* temp,
270  unsigned int frame_size)
271 {
272  const unsigned int po2 = log2_of_power_of_2(frame_size);
273 
274  unsigned int stage = po2;
275  unsigned char* frame_ptr = frame;
276  unsigned char* temp_ptr = temp;
277 
278  unsigned int frame_half = frame_size >> 1;
279  unsigned int num_branches = 1;
280  unsigned int branch;
281  unsigned int bit;
282 
283  // prepare constants
284  const __m256i mask_stage1 = _mm256_set_epi8(0x0,
285  0xFF,
286  0x0,
287  0xFF,
288  0x0,
289  0xFF,
290  0x0,
291  0xFF,
292  0x0,
293  0xFF,
294  0x0,
295  0xFF,
296  0x0,
297  0xFF,
298  0x0,
299  0xFF,
300  0x0,
301  0xFF,
302  0x0,
303  0xFF,
304  0x0,
305  0xFF,
306  0x0,
307  0xFF,
308  0x0,
309  0xFF,
310  0x0,
311  0xFF,
312  0x0,
313  0xFF,
314  0x0,
315  0xFF);
316 
317  const __m128i mask_stage0 = _mm_set_epi8(0x0,
318  0xFF,
319  0x0,
320  0xFF,
321  0x0,
322  0xFF,
323  0x0,
324  0xFF,
325  0x0,
326  0xFF,
327  0x0,
328  0xFF,
329  0x0,
330  0xFF,
331  0x0,
332  0xFF);
333  // get some SIMD registers to play with.
334  __m256i r_frame0, r_temp0, shifted;
335  __m128i r_temp2, r_frame2, shifted2;
336  {
337  __m256i r_frame1, r_temp1;
338  __m128i r_frame3, r_temp3;
339  const __m256i shuffle_separate = _mm256_setr_epi8(0,
340  2,
341  4,
342  6,
343  8,
344  10,
345  12,
346  14,
347  1,
348  3,
349  5,
350  7,
351  9,
352  11,
353  13,
354  15,
355  0,
356  2,
357  4,
358  6,
359  8,
360  10,
361  12,
362  14,
363  1,
364  3,
365  5,
366  7,
367  9,
368  11,
369  13,
370  15);
371  const __m128i shuffle_separate128 =
372  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
373 
374  while (stage > 4) {
375  frame_ptr = frame;
376  temp_ptr = temp;
377 
378  // At stage 5 a branch has 32 elements; higher stages are even larger.
379  for (branch = 0; branch < num_branches; ++branch) {
380  for (bit = 0; bit < frame_half; bit += 32) {
381  if ((frame_half - bit) <
382  32) // only 16 elements remain in this frame half, not 32
383  {
384  r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
385  temp_ptr += 16;
386  r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
387  temp_ptr += 16;
388 
389  shifted2 = _mm_srli_si128(r_temp2, 1);
390  shifted2 = _mm_and_si128(shifted2, mask_stage0);
391  r_temp2 = _mm_xor_si128(shifted2, r_temp2);
392  r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
393 
394  shifted2 = _mm_srli_si128(r_temp3, 1);
395  shifted2 = _mm_and_si128(shifted2, mask_stage0);
396  r_temp3 = _mm_xor_si128(shifted2, r_temp3);
397  r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
398 
399  r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
400  _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
401 
402  r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
403  _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
404  frame_ptr += 16;
405  break;
406  }
407  r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
408  temp_ptr += 32;
409  r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
410  temp_ptr += 32;
411 
412  shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
413  shifted = _mm256_and_si256(shifted, mask_stage1);
414  r_temp0 = _mm256_xor_si256(shifted, r_temp0);
415  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
416 
417  shifted = _mm256_srli_si256(r_temp1, 1);
418  shifted = _mm256_and_si256(shifted, mask_stage1);
419  r_temp1 = _mm256_xor_si256(shifted, r_temp1);
420  r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
421 
422  r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
423  r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
424  r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
425  r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
426 
427  _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
428 
429  _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
430  frame_ptr += 32;
431  }
432 
433  frame_ptr += frame_half;
434  }
435  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
436 
437  num_branches = num_branches << 1;
438  frame_half = frame_half >> 1;
439  stage--;
440  }
441  }
442 
443  // This last part requires frames of at least 32 elements.
444  // Smaller frames gain nothing from SIMD anyway; just use the GENERIC kernel.
445 
446  // reset pointers to correct positions.
447  frame_ptr = frame;
448  temp_ptr = temp;
449 
450  // prefetch first chunk
451  __VOLK_PREFETCH(temp_ptr);
452 
453  const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
454  8,
455  4,
456  12,
457  2,
458  10,
459  6,
460  14,
461  1,
462  9,
463  5,
464  13,
465  3,
466  11,
467  7,
468  15,
469  0,
470  8,
471  4,
472  12,
473  2,
474  10,
475  6,
476  14,
477  1,
478  9,
479  5,
480  13,
481  3,
482  11,
483  7,
484  15);
485  const __m256i mask_stage4 = _mm256_set_epi8(0x0,
486  0x0,
487  0x0,
488  0x0,
489  0x0,
490  0x0,
491  0x0,
492  0x0,
493  0xFF,
494  0xFF,
495  0xFF,
496  0xFF,
497  0xFF,
498  0xFF,
499  0xFF,
500  0xFF,
501  0x0,
502  0x0,
503  0x0,
504  0x0,
505  0x0,
506  0x0,
507  0x0,
508  0x0,
509  0xFF,
510  0xFF,
511  0xFF,
512  0xFF,
513  0xFF,
514  0xFF,
515  0xFF,
516  0xFF);
517  const __m256i mask_stage3 = _mm256_set_epi8(0x0,
518  0x0,
519  0x0,
520  0x0,
521  0xFF,
522  0xFF,
523  0xFF,
524  0xFF,
525  0x0,
526  0x0,
527  0x0,
528  0x0,
529  0xFF,
530  0xFF,
531  0xFF,
532  0xFF,
533  0x0,
534  0x0,
535  0x0,
536  0x0,
537  0xFF,
538  0xFF,
539  0xFF,
540  0xFF,
541  0x0,
542  0x0,
543  0x0,
544  0x0,
545  0xFF,
546  0xFF,
547  0xFF,
548  0xFF);
549  const __m256i mask_stage2 = _mm256_set_epi8(0x0,
550  0x0,
551  0xFF,
552  0xFF,
553  0x0,
554  0x0,
555  0xFF,
556  0xFF,
557  0x0,
558  0x0,
559  0xFF,
560  0xFF,
561  0x0,
562  0x0,
563  0xFF,
564  0xFF,
565  0x0,
566  0x0,
567  0xFF,
568  0xFF,
569  0x0,
570  0x0,
571  0xFF,
572  0xFF,
573  0x0,
574  0x0,
575  0xFF,
576  0xFF,
577  0x0,
578  0x0,
579  0xFF,
580  0xFF);
581 
582  for (branch = 0; branch < num_branches / 2; ++branch) {
583  r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
584 
585  // prefetch next chunk
586  temp_ptr += 32;
587  __VOLK_PREFETCH(temp_ptr);
588 
589  // shuffle once for bit-reversal.
590  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
591 
592  shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
593  shifted = _mm256_and_si256(shifted, mask_stage4);
594  r_frame0 = _mm256_xor_si256(shifted, r_temp0);
595 
596 
597  shifted = _mm256_srli_si256(r_frame0, 4);
598  shifted = _mm256_and_si256(shifted, mask_stage3);
599  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
600 
601  shifted = _mm256_srli_si256(r_frame0, 2);
602  shifted = _mm256_and_si256(shifted, mask_stage2);
603  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
604 
605  shifted = _mm256_srli_si256(r_frame0, 1);
606  shifted = _mm256_and_si256(shifted, mask_stage1);
607  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
608 
609  // store result of chunk.
610  _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
611  frame_ptr += 32;
612  }
613 }
614 #endif /* LV_HAVE_AVX2 */
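/*
 * Bookkeeping note for both AVX2 kernels: once the while (stage > 4) loop has
 * finished, every branch is 16 bytes wide, so the 32-byte tail covers the
 * whole frame in num_branches / 2 iterations. A small sketch of that invariant
 * (final_stage_params is a hypothetical helper mirroring the loop variables):
 */
static inline void final_stage_params(unsigned int frame_size, /* >= 16, power of two */
                                      unsigned int* num_branches,
                                      unsigned int* frame_half)
{
    /* after the while (stage > 4) loop: one branch per 16 bytes, half a branch is 8 bytes */
    *num_branches = frame_size / 16;
    *frame_half = 8;
    /* the AVX2 tail then runs num_branches / 2 == frame_size / 32 iterations,
     * two 16-byte branches per 256-bit register */
}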
615 
616 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
617 
618 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
619 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
620 
621 #ifdef LV_HAVE_SSSE3
622 #include <tmmintrin.h>
623 
624 static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
625  unsigned char* temp,
626  unsigned int frame_size)
627 {
628  const unsigned int po2 = log2_of_power_of_2(frame_size);
629 
630  unsigned int stage = po2;
631  unsigned char* frame_ptr = frame;
632  unsigned char* temp_ptr = temp;
633 
634  unsigned int frame_half = frame_size >> 1;
635  unsigned int num_branches = 1;
636  unsigned int branch;
637  unsigned int bit;
638 
639  // prepare constants
640  const __m128i mask_stage1 = _mm_set_epi8(0x0,
641  0xFF,
642  0x0,
643  0xFF,
644  0x0,
645  0xFF,
646  0x0,
647  0xFF,
648  0x0,
649  0xFF,
650  0x0,
651  0xFF,
652  0x0,
653  0xFF,
654  0x0,
655  0xFF);
656 
657  // get some SIMD registers to play with.
658  __m128i r_frame0, r_temp0, shifted;
659 
660  {
661  __m128i r_frame1, r_temp1;
662  const __m128i shuffle_separate =
663  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
664 
665  while (stage > 4) {
666  frame_ptr = frame;
667  temp_ptr = temp;
668 
669  // At stage 5 a branch has 32 elements; higher stages are even larger.
670  for (branch = 0; branch < num_branches; ++branch) {
671  for (bit = 0; bit < frame_half; bit += 16) {
672  r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
673  temp_ptr += 16;
674  r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
675  temp_ptr += 16;
676 
677  shifted = _mm_srli_si128(r_temp0, 1);
678  shifted = _mm_and_si128(shifted, mask_stage1);
679  r_temp0 = _mm_xor_si128(shifted, r_temp0);
680  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
681 
682  shifted = _mm_srli_si128(r_temp1, 1);
683  shifted = _mm_and_si128(shifted, mask_stage1);
684  r_temp1 = _mm_xor_si128(shifted, r_temp1);
685  r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
686 
687  r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
688  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
689 
690  r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
691  _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
692  frame_ptr += 16;
693  }
694 
695  frame_ptr += frame_half;
696  }
697  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
698 
699  num_branches = num_branches << 1;
700  frame_half = frame_half >> 1;
701  stage--;
702  }
703  }
704 
705  // This last part requires frames of at least 16 elements.
706  // Smaller frames gain nothing from SIMD anyway; just use the GENERIC kernel.
707 
708  // reset pointers to correct positions.
709  frame_ptr = frame;
710  temp_ptr = temp;
711 
712  // prefetch first chunk
713  __VOLK_PREFETCH(temp_ptr);
714 
715  const __m128i shuffle_stage4 =
716  _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
717  const __m128i mask_stage4 = _mm_set_epi8(0x0,
718  0x0,
719  0x0,
720  0x0,
721  0x0,
722  0x0,
723  0x0,
724  0x0,
725  0xFF,
726  0xFF,
727  0xFF,
728  0xFF,
729  0xFF,
730  0xFF,
731  0xFF,
732  0xFF);
733  const __m128i mask_stage3 = _mm_set_epi8(0x0,
734  0x0,
735  0x0,
736  0x0,
737  0xFF,
738  0xFF,
739  0xFF,
740  0xFF,
741  0x0,
742  0x0,
743  0x0,
744  0x0,
745  0xFF,
746  0xFF,
747  0xFF,
748  0xFF);
749  const __m128i mask_stage2 = _mm_set_epi8(0x0,
750  0x0,
751  0xFF,
752  0xFF,
753  0x0,
754  0x0,
755  0xFF,
756  0xFF,
757  0x0,
758  0x0,
759  0xFF,
760  0xFF,
761  0x0,
762  0x0,
763  0xFF,
764  0xFF);
765 
766  for (branch = 0; branch < num_branches; ++branch) {
767  r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
768 
769  // prefetch next chunk
770  temp_ptr += 16;
771  __VOLK_PREFETCH(temp_ptr);
772 
773  // shuffle once for bit-reversal.
774  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
775 
776  shifted = _mm_srli_si128(r_temp0, 8);
777  shifted = _mm_and_si128(shifted, mask_stage4);
778  r_frame0 = _mm_xor_si128(shifted, r_temp0);
779 
780  shifted = _mm_srli_si128(r_frame0, 4);
781  shifted = _mm_and_si128(shifted, mask_stage3);
782  r_frame0 = _mm_xor_si128(shifted, r_frame0);
783 
784  shifted = _mm_srli_si128(r_frame0, 2);
785  shifted = _mm_and_si128(shifted, mask_stage2);
786  r_frame0 = _mm_xor_si128(shifted, r_frame0);
787 
788  shifted = _mm_srli_si128(r_frame0, 1);
789  shifted = _mm_and_si128(shifted, mask_stage1);
790  r_frame0 = _mm_xor_si128(shifted, r_frame0);
791 
792  // store result of chunk.
793  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
794  frame_ptr += 16;
795  }
796 }
797 #endif /* LV_HAVE_SSSE3 */
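/*
 * The _a_ (aligned) kernels in this half of the file differ from the _u_
 * (unaligned) ones above only in using _mm_load_si128 / _mm_store_si128 and
 * their _mm256 counterparts, which require 16- respectively 32-byte aligned
 * pointers. Sketch of the check a caller could apply before preferring an
 * aligned variant (is_aligned_to is a hypothetical helper):
 */
#include <stdint.h>

static inline int is_aligned_to(const void* p, uintptr_t alignment /* power of two */)
{
    /* true if p is a multiple of alignment, e.g. is_aligned_to(frame, 32) for AVX2 */
    return ((uintptr_t)p & (alignment - 1)) == 0;
}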
798 
799 #ifdef LV_HAVE_AVX2
800 #include <immintrin.h>
801 
802 static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
803  unsigned char* temp,
804  unsigned int frame_size)
805 {
806  const unsigned int po2 = log2_of_power_of_2(frame_size);
807 
808  unsigned int stage = po2;
809  unsigned char* frame_ptr = frame;
810  unsigned char* temp_ptr = temp;
811 
812  unsigned int frame_half = frame_size >> 1;
813  unsigned int num_branches = 1;
814  unsigned int branch;
815  unsigned int bit;
816 
817  // prepare constants
818  const __m256i mask_stage1 = _mm256_set_epi8(0x0,
819  0xFF,
820  0x0,
821  0xFF,
822  0x0,
823  0xFF,
824  0x0,
825  0xFF,
826  0x0,
827  0xFF,
828  0x0,
829  0xFF,
830  0x0,
831  0xFF,
832  0x0,
833  0xFF,
834  0x0,
835  0xFF,
836  0x0,
837  0xFF,
838  0x0,
839  0xFF,
840  0x0,
841  0xFF,
842  0x0,
843  0xFF,
844  0x0,
845  0xFF,
846  0x0,
847  0xFF,
848  0x0,
849  0xFF);
850 
851  const __m128i mask_stage0 = _mm_set_epi8(0x0,
852  0xFF,
853  0x0,
854  0xFF,
855  0x0,
856  0xFF,
857  0x0,
858  0xFF,
859  0x0,
860  0xFF,
861  0x0,
862  0xFF,
863  0x0,
864  0xFF,
865  0x0,
866  0xFF);
867  // get some SIMD registers to play with.
868  __m256i r_frame0, r_temp0, shifted;
869  __m128i r_temp2, r_frame2, shifted2;
870  {
871  __m256i r_frame1, r_temp1;
872  __m128i r_frame3, r_temp3;
873  const __m256i shuffle_separate = _mm256_setr_epi8(0,
874  2,
875  4,
876  6,
877  8,
878  10,
879  12,
880  14,
881  1,
882  3,
883  5,
884  7,
885  9,
886  11,
887  13,
888  15,
889  0,
890  2,
891  4,
892  6,
893  8,
894  10,
895  12,
896  14,
897  1,
898  3,
899  5,
900  7,
901  9,
902  11,
903  13,
904  15);
905  const __m128i shuffle_separate128 =
906  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
907 
908  while (stage > 4) {
909  frame_ptr = frame;
910  temp_ptr = temp;
911 
912  // At stage 5 a branch has 32 elements; higher stages are even larger.
913  for (branch = 0; branch < num_branches; ++branch) {
914  for (bit = 0; bit < frame_half; bit += 32) {
915  if ((frame_half - bit) <
916  32) // only 16 elements remain in this frame half, not 32
917  {
918  r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
919  temp_ptr += 16;
920  r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
921  temp_ptr += 16;
922 
923  shifted2 = _mm_srli_si128(r_temp2, 1);
924  shifted2 = _mm_and_si128(shifted2, mask_stage0);
925  r_temp2 = _mm_xor_si128(shifted2, r_temp2);
926  r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
927 
928  shifted2 = _mm_srli_si128(r_temp3, 1);
929  shifted2 = _mm_and_si128(shifted2, mask_stage0);
930  r_temp3 = _mm_xor_si128(shifted2, r_temp3);
931  r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
932 
933  r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
934  _mm_store_si128((__m128i*)frame_ptr, r_frame2);
935 
936  r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
937  _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
938  frame_ptr += 16;
939  break;
940  }
941  r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
942  temp_ptr += 32;
943  r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
944  temp_ptr += 32;
945 
946  shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
947  shifted = _mm256_and_si256(shifted, mask_stage1);
948  r_temp0 = _mm256_xor_si256(shifted, r_temp0);
949  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
950 
951  shifted = _mm256_srli_si256(r_temp1, 1);
952  shifted = _mm256_and_si256(shifted, mask_stage1);
953  r_temp1 = _mm256_xor_si256(shifted, r_temp1);
954  r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
955 
956  r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
957  r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
958  r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
959  r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
960 
961  _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
962 
963  _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
964  frame_ptr += 32;
965  }
966 
967  frame_ptr += frame_half;
968  }
969  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
970 
971  num_branches = num_branches << 1;
972  frame_half = frame_half >> 1;
973  stage--;
974  }
975  }
976 
977  // This last part requires frames of at least 32 elements.
978  // Smaller frames gain nothing from SIMD anyway; just use the GENERIC kernel.
979 
980  // reset pointers to correct positions.
981  frame_ptr = frame;
982  temp_ptr = temp;
983 
984  // prefetch first chunk.
985  __VOLK_PREFETCH(temp_ptr);
986 
987  const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
988  8,
989  4,
990  12,
991  2,
992  10,
993  6,
994  14,
995  1,
996  9,
997  5,
998  13,
999  3,
1000  11,
1001  7,
1002  15,
1003  0,
1004  8,
1005  4,
1006  12,
1007  2,
1008  10,
1009  6,
1010  14,
1011  1,
1012  9,
1013  5,
1014  13,
1015  3,
1016  11,
1017  7,
1018  15);
1019  const __m256i mask_stage4 = _mm256_set_epi8(0x0,
1020  0x0,
1021  0x0,
1022  0x0,
1023  0x0,
1024  0x0,
1025  0x0,
1026  0x0,
1027  0xFF,
1028  0xFF,
1029  0xFF,
1030  0xFF,
1031  0xFF,
1032  0xFF,
1033  0xFF,
1034  0xFF,
1035  0x0,
1036  0x0,
1037  0x0,
1038  0x0,
1039  0x0,
1040  0x0,
1041  0x0,
1042  0x0,
1043  0xFF,
1044  0xFF,
1045  0xFF,
1046  0xFF,
1047  0xFF,
1048  0xFF,
1049  0xFF,
1050  0xFF);
1051  const __m256i mask_stage3 = _mm256_set_epi8(0x0,
1052  0x0,
1053  0x0,
1054  0x0,
1055  0xFF,
1056  0xFF,
1057  0xFF,
1058  0xFF,
1059  0x0,
1060  0x0,
1061  0x0,
1062  0x0,
1063  0xFF,
1064  0xFF,
1065  0xFF,
1066  0xFF,
1067  0x0,
1068  0x0,
1069  0x0,
1070  0x0,
1071  0xFF,
1072  0xFF,
1073  0xFF,
1074  0xFF,
1075  0x0,
1076  0x0,
1077  0x0,
1078  0x0,
1079  0xFF,
1080  0xFF,
1081  0xFF,
1082  0xFF);
1083  const __m256i mask_stage2 = _mm256_set_epi8(0x0,
1084  0x0,
1085  0xFF,
1086  0xFF,
1087  0x0,
1088  0x0,
1089  0xFF,
1090  0xFF,
1091  0x0,
1092  0x0,
1093  0xFF,
1094  0xFF,
1095  0x0,
1096  0x0,
1097  0xFF,
1098  0xFF,
1099  0x0,
1100  0x0,
1101  0xFF,
1102  0xFF,
1103  0x0,
1104  0x0,
1105  0xFF,
1106  0xFF,
1107  0x0,
1108  0x0,
1109  0xFF,
1110  0xFF,
1111  0x0,
1112  0x0,
1113  0xFF,
1114  0xFF);
1115 
1116  for (branch = 0; branch < num_branches / 2; ++branch) {
1117  r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
1118 
1119  // prefetch next chunk
1120  temp_ptr += 32;
1121  __VOLK_PREFETCH(temp_ptr);
1122 
1123  // shuffle once for bit-reversal.
1124  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
1125 
1126  shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
1127  shifted = _mm256_and_si256(shifted, mask_stage4);
1128  r_frame0 = _mm256_xor_si256(shifted, r_temp0);
1129 
1130  shifted = _mm256_srli_si256(r_frame0, 4);
1131  shifted = _mm256_and_si256(shifted, mask_stage3);
1132  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1133 
1134  shifted = _mm256_srli_si256(r_frame0, 2);
1135  shifted = _mm256_and_si256(shifted, mask_stage2);
1136  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1137 
1138  shifted = _mm256_srli_si256(r_frame0, 1);
1139  shifted = _mm256_and_si256(shifted, mask_stage1);
1140  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1141 
1142  // store result of chunk.
1143  _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
1144  frame_ptr += 32;
1145  }
1146 }
1147 #endif /* LV_HAVE_AVX2 */
1148 
1149 
1150 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
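/*
 * In practice these per-architecture variants are not called directly; the
 * VOLK dispatcher picks one at runtime based on CPU features and pointer
 * alignment. Hedged end-to-end sketch, assuming the usual volk.h API
 * (volk_malloc, volk_get_alignment, volk_free and the generated dispatcher
 * volk_8u_x2_encodeframepolar_8u); encode_one_frame is a hypothetical helper.
 */
#include <volk/volk.h>
#include <string.h>

static void encode_one_frame(const unsigned char* bits,
                             unsigned char* codeword,
                             unsigned int frame_size /* power of two */)
{
    /* aligned scratch so the dispatcher may select an aligned kernel;
     * ideally codeword comes from volk_malloc as well */
    unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    memcpy(temp, bits, frame_size); /* the kernel overwrites temp */
    volk_8u_x2_encodeframepolar_8u(codeword, temp, frame_size);
    volk_free(temp);
}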