Vector Optimized Library of Kernels 2.4
Architecture-tuned implementations of math kernels
volk_32f_binary_slicer_8i.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
#define INCLUDED_volk_32f_binary_slicer_8i_H


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_GENERIC */
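
/* Illustrative only (not part of the upstream header): a minimal check of the
 * mapping the generic kernel implements, assuming <stdint.h> (or an
 * equivalent) is already pulled in by the surrounding build. The helper name
 * is hypothetical. Note that -0.0f compares equal to 0.0f, so it also slices
 * to 1. */
#ifdef LV_HAVE_GENERIC
static inline int volk_32f_binary_slicer_8i_example_check(void)
{
    const float in[4] = { -1.5f, 0.0f, 2.0f, -0.0f };
    int8_t out[4];
    volk_32f_binary_slicer_8i_generic(out, in, 4);
    /* expect { 0, 1, 1, 1 } */
    return out[0] == 0 && out[1] == 1 && out[2] == 1 && out[3] == 1;
}
#endif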


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
                                                                const float* aVector,
                                                                unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_GENERIC */
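
/* In C, a relational expression such as (*aPtr++ >= 0) evaluates to exactly
 * 0 or 1, so the branchless variant above stores the comparison result
 * directly and avoids a data-dependent branch, which can help when the sign
 * pattern of the input is unpredictable to the branch predictor. */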


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_load_ps(aPtr);
        a1_val = _mm256_load_ps(aPtr + 8);
        a2_val = _mm256_load_ps(aPtr + 16);
        a3_val = _mm256_load_ps(aPtr + 24);

        // compare >= 0; the result is an all-ones or all-zeros float mask
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32-bit ints and >> 31, leaving 0 or 1 per element
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack into 8-bit results
        // res0: (after packs_epi32)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the 64-bit lanes
        // res0: (after packs_epi16)
        // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
        // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within 128-bit lanes
        // res0: (after permute4x64)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
        // res0: (after shuffle_epi8)
        // a0 ... a7, b0 ... b7
        // c0 ... c7, d0 ... d7

        _mm256_store_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif
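
/* Why the permute + byte shuffle are needed: _mm256_packs_epi32 and
 * _mm256_packs_epi16 interleave their two sources per 128-bit lane rather
 * than across the whole 256-bit register, so the packed bytes come out in
 * lane-crossed order. The permute4x64/shuffle_epi8 pair restores the natural
 * a0..a7, b0..b7, c0..c7, d0..d7 memory order. The SSE2 kernels below need
 * no such fixup because their pack instructions operate on whole 128-bit
 * registers. */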

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_loadu_ps(aPtr);
        a1_val = _mm256_loadu_ps(aPtr + 8);
        a2_val = _mm256_loadu_ps(aPtr + 16);
        a3_val = _mm256_loadu_ps(aPtr + 24);

        // compare >= 0; the result is an all-ones or all-zeros float mask
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32-bit ints and >> 31, leaving 0 or 1 per element
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack into 8-bit results
        // res0: (after packs_epi32)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the 64-bit lanes
        // res0: (after packs_epi16)
        // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
        // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within 128-bit lanes
        // res0: (after permute4x64)
        // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
        // res0: (after shuffle_epi8)
        // a0 ... a7, b0 ... b7
        // c0 ... c7, d0 ... d7

        _mm256_storeu_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif
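
/* The _a_ (aligned) and _u_ (unaligned) variants differ only in their
 * load/store intrinsics; the VOLK dispatcher picks between them based on
 * buffer alignment at run time. */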


#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    unsigned int n16points = num_points / 16;
    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    __m128 zero_val;
    zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_load_ps(aPtr);
        a1_val = _mm_load_ps(aPtr + 4);
        a2_val = _mm_load_ps(aPtr + 8);
        a3_val = _mm_load_ps(aPtr + 12);

        // compare >= 0; return float
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to 32i and >> 31
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_store_si128((__m128i*)cPtr, res0_i);

        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_SSE2 */
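
/* How the mask-to-0/1 conversion works in the SIMD kernels: the compare
 * intrinsics produce all-ones (0xFFFFFFFF) for true and all-zeros for false.
 * Reinterpreted as a float, all-ones is a NaN, and _mm_cvtps_epi32 /
 * _mm256_cvtps_epi32 convert NaN to the "integer indefinite" value
 * 0x80000000; a logical right shift by 31 then leaves exactly 1. A false
 * lane converts 0.0f to 0 and stays 0 after the shift. */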


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    unsigned int n16points = num_points / 16;
    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    __m128 zero_val;
    zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_loadu_ps(aPtr);
        a1_val = _mm_loadu_ps(aPtr + 4);
        a2_val = _mm_loadu_ps(aPtr + 8);
        a3_val = _mm_loadu_ps(aPtr + 12);

        // compare >= 0; return float
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to 32i and >> 31
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_storeu_si128((__m128i*)cPtr, res0_i);

        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    float32x4x2_t input_val0, input_val1;
    float32x4_t zero_val;
    uint32x4x2_t res0_u32, res1_u32;
    uint16x4x2_t res0_u16x4, res1_u16x4;
    uint16x8x2_t res_u16x8;
    uint8x8x2_t res_u8;
    uint8x8_t one;

    zero_val = vdupq_n_f32(0.0);
    one = vdup_n_u8(0x01);

    // TODO: this is a good candidate for asm because the vcombines
    // can be eliminated simply by picking dst registers that are
    // adjacent.
    for (number = 0; number < n16points; number++) {
        input_val0 = vld2q_f32(aPtr);
        input_val1 = vld2q_f32(aPtr + 8);

        // test against 0; return uint32
        res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
        res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
        res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
        res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

        // narrow uint32 -> uint16, then combine into 8-element vectors
        res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
        res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
        res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
        res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

        res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
        res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

        // narrow uint16x8 -> uint8x8
        res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
        res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
        // We *could* load twice as much data and do another vcombine here to
        // get a uint8x16x2 vector, and still only do 2 vandqs and a single
        // store, but that turns out to be ~16% slower than this version on a
        // zc702. It's possible that register contention in the GCC scheduler
        // slows it down, and a hand-written asm with quad-word u8 registers
        // is much faster.

        res_u8.val[0] = vand_u8(one, res_u8.val[0]);
        res_u8.val[1] = vand_u8(one, res_u8.val[1]);

        vst2_u8((unsigned char*)cPtr, res_u8);
        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_NEON */
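
#ifdef LV_HAVE_NEON
/* A sketch (not present in the original file) of the rejected alternative
 * described in the comment inside the kernel above: process 32 floats per
 * iteration, combine the narrowed results into quad-word u8 registers, and
 * finish with 2 vandqs and a single interleaving store. The function name is
 * hypothetical. Note that vld2q_f32 de-interleaves even/odd elements and
 * vst2q_u8 re-interleaves them, so element order is preserved end to end.
 * Per the comment above, this approach measured ~16% slower on a zc702. */
static inline void volk_32f_binary_slicer_8i_neon_q_sketch(int8_t* cPtr,
                                                           const float* aPtr)
{
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const uint8x16_t one_q = vdupq_n_u8(0x01);

    // de-interleaving loads of 32 floats
    float32x4x2_t in0 = vld2q_f32(aPtr);
    float32x4x2_t in1 = vld2q_f32(aPtr + 8);
    float32x4x2_t in2 = vld2q_f32(aPtr + 16);
    float32x4x2_t in3 = vld2q_f32(aPtr + 24);

    // >= 0 compares, then narrow u32 -> u16, keeping even/odd streams apart
    uint16x8_t even01 = vcombine_u16(vmovn_u32(vcgeq_f32(in0.val[0], zero)),
                                     vmovn_u32(vcgeq_f32(in1.val[0], zero)));
    uint16x8_t even23 = vcombine_u16(vmovn_u32(vcgeq_f32(in2.val[0], zero)),
                                     vmovn_u32(vcgeq_f32(in3.val[0], zero)));
    uint16x8_t odd01 = vcombine_u16(vmovn_u32(vcgeq_f32(in0.val[1], zero)),
                                    vmovn_u32(vcgeq_f32(in1.val[1], zero)));
    uint16x8_t odd23 = vcombine_u16(vmovn_u32(vcgeq_f32(in2.val[1], zero)),
                                    vmovn_u32(vcgeq_f32(in3.val[1], zero)));

    // narrow u16 -> u8, combine to quad-word registers, mask to 0/1
    uint8x16x2_t out;
    out.val[0] = vandq_u8(one_q, vcombine_u8(vmovn_u16(even01), vmovn_u16(even23)));
    out.val[1] = vandq_u8(one_q, vcombine_u8(vmovn_u16(odd01), vmovn_u16(odd23)));

    // single interleaving store of all 32 result bytes
    vst2q_u8((uint8_t*)cPtr, out);
}
#endif /* LV_HAVE_NEON */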


#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
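
For reference, a minimal, self-contained usage sketch (not part of the file above), assuming a standard VOLK installation: it calls the kernel through the generated dispatch entry point volk_32f_binary_slicer_8i(), which selects the best available implementation at run time, and uses volk_malloc()/volk_free() for correctly aligned buffers.

#include <volk/volk.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned int num_points = 64;
    size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(num_points * sizeof(float), alignment);
    int8_t* out = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);

    for (unsigned int i = 0; i < num_points; ++i)
        in[i] = (i % 2) ? 1.0f : -1.0f; /* alternating signs */

    volk_32f_binary_slicer_8i(out, in, num_points); /* out[i] = (in[i] >= 0) */

    for (unsigned int i = 0; i < 8; ++i)
        printf("%d ", out[i]);
    printf("\n"); /* prints: 0 1 0 1 0 1 0 1 */

    volk_free(in);
    volk_free(out);
    return 0;
}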