Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_avx2_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * This file is intended to hold AVX2 helper functions composed of several intrinsics.
25  * They should be used in VOLK kernels to avoid copy-paste.
26  */
27 
28 #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
29 #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
31 #include <immintrin.h>
32 
/*
 * Build a per-float sign mask from the lowest eight bytes of 'fbits'.
 *
 * 32-bit element i (i = 0..7) of the result is 0x80000000 when byte i of
 * 'fbits' is strictly greater than zero (signed comparison) and 0 otherwise.
 * Bytes 8..15 of 'fbits' do not influence the result.
 * XOR-ing a float vector with the returned mask flips the sign of the
 * selected elements.
 */
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    /*
     * Byte selector for _mm256_shuffle_epi8. A selector with its top bit set
     * (0xff) yields a zero byte; 0x00..0x07 pick that byte from the same
     * 128-bit half. Each flag byte therefore lands in the most significant
     * byte of one 32-bit element.
     */
    const __m256i flag_to_msb = _mm256_setr_epi8(
        0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01,
        0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03,
        0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05,
        0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);

    // 0xFF in every byte that is > 0, then reduced to only the top bit (0x80).
    const __m128i is_positive = _mm_cmpgt_epi8(fbits, _mm_set1_epi8(0x00));
    const __m128i flags = _mm_and_si128(is_positive, _mm_set1_epi8(0x80));

    // The byte shuffle works per 128-bit half, so both halves get a copy.
    __m256i doubled = _mm256_setzero_si256();
    doubled = _mm256_insertf128_si256(doubled, flags, 0);
    doubled = _mm256_insertf128_si256(doubled, flags, 1);

    const __m256i spread = _mm256_shuffle_epi8(doubled, flag_to_msb);
    return _mm256_castsi256_ps(spread);
}
79 
80 static inline __m256
81 _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
82 {
83  // prepare sign mask for correct +-
84  __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
85 
86  __m256 llr0, llr1;
87  _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
88 
89  // calculate result
90  llr0 = _mm256_xor_ps(llr0, sign_mask);
91  __m256 dst = _mm256_add_ps(llr0, llr1);
92  return dst;
93 }
94 
/*
 * Squared magnitude of eight complex floats stored as interleaved
 * (re, im) pairs: four in 'cplxValue0' followed by four in 'cplxValue1'.
 *
 * Returns re*re + im*im for each pair; elements 0..3 of the result belong to
 * 'cplxValue0' and elements 4..7 to 'cplxValue1'.
 */
static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256 sq0 = _mm256_mul_ps(cplxValue0, cplxValue0);
    const __m256 sq1 = _mm256_mul_ps(cplxValue1, cplxValue1);

    // hadd sums neighbours per 128-bit half, leaving the results as
    // [a0 a1 b0 b1 | a2 a3 b2 b3] (a = cplxValue0, b = cplxValue1).
    const __m256 pair_sums = _mm256_hadd_ps(sq0, sq1);

    // Cross-half permute restores the natural order [a0 a1 a2 a3 b0 b1 b2 b3].
    const __m256i natural_order = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
    return _mm256_permutevar8x32_ps(pair_sums, natural_order);
}
104 
105 static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
106  const __m256 symbols1,
107  const __m256 points0,
108  const __m256 points1,
109  const __m256 scalar)
110 {
111  /*
112  * Calculate: |y - x|^2 * SNR_lin
113  * Consider 'symbolsX' and 'pointsX' to be complex float
114  * 'symbolsX' are 'y' and 'pointsX' are 'x'
115  */
116  const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
117  const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
118  const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
119  return _mm256_mul_ps(norms, scalar);
120 }
121 
122 #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx2_intrinsics.h:105
static __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
Definition: volk_avx2_intrinsics.h:33
static __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1)
Definition: volk_avx2_intrinsics.h:95
static __m256 _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx2_intrinsics.h:81
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:158