Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32fc_s32f_magnitude_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
73 #ifdef LV_HAVE_GENERIC
74 #include <volk/volk_common.h>
75 
76 static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
77  const lv_32fc_t* complexVector,
78  const float scalar,
79  unsigned int num_points)
80 {
81  const float* complexVectorPtr = (float*)complexVector;
82  int16_t* magnitudeVectorPtr = magnitudeVector;
83  unsigned int number = 0;
84  for (number = 0; number < num_points; number++) {
85  __VOLK_VOLATILE float real = *complexVectorPtr++;
86  __VOLK_VOLATILE float imag = *complexVectorPtr++;
87  real *= real;
88  imag *= imag;
89  *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag));
90  }
91 }
92 #endif /* LV_HAVE_GENERIC */
93 
94 #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
95 #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
96 
97 #include <inttypes.h>
98 #include <math.h>
99 #include <stdio.h>
100 #include <volk/volk_common.h>
101 
102 #ifdef LV_HAVE_AVX2
103 #include <immintrin.h>
104 
105 static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
106  const lv_32fc_t* complexVector,
107  const float scalar,
108  unsigned int num_points)
109 {
110  unsigned int number = 0;
111  const unsigned int eighthPoints = num_points / 8;
112 
113  const float* complexVectorPtr = (const float*)complexVector;
114  int16_t* magnitudeVectorPtr = magnitudeVector;
115 
116  __m256 vScalar = _mm256_set1_ps(scalar);
117  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
118  __m256 cplxValue1, cplxValue2, result;
119  __m256i resultInt;
120  __m128i resultShort;
121 
122  for (; number < eighthPoints; number++) {
123  cplxValue1 = _mm256_load_ps(complexVectorPtr);
124  complexVectorPtr += 8;
125 
126  cplxValue2 = _mm256_load_ps(complexVectorPtr);
127  complexVectorPtr += 8;
128 
129  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
130  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
131 
132  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
133 
134  result = _mm256_sqrt_ps(result);
135 
136  result = _mm256_mul_ps(result, vScalar);
137 
138  resultInt = _mm256_cvtps_epi32(result);
139  resultInt = _mm256_packs_epi32(resultInt, resultInt);
140  resultInt = _mm256_permutevar8x32_epi32(
141  resultInt, idx); // permute to compensate for shuffling in hadd and packs
142  resultShort = _mm256_extracti128_si256(resultInt, 0);
143  _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
144  magnitudeVectorPtr += 8;
145  }
146 
147  number = eighthPoints * 8;
149  magnitudeVector + number, complexVector + number, scalar, num_points - number);
150 }
151 #endif /* LV_HAVE_AVX2 */
152 
153 #ifdef LV_HAVE_SSE3
154 #include <pmmintrin.h>
155 
156 static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
157  const lv_32fc_t* complexVector,
158  const float scalar,
159  unsigned int num_points)
160 {
161  unsigned int number = 0;
162  const unsigned int quarterPoints = num_points / 4;
163 
164  const float* complexVectorPtr = (const float*)complexVector;
165  int16_t* magnitudeVectorPtr = magnitudeVector;
166 
167  __m128 vScalar = _mm_set_ps1(scalar);
168 
169  __m128 cplxValue1, cplxValue2, result;
170 
171  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
172 
173  for (; number < quarterPoints; number++) {
174  cplxValue1 = _mm_load_ps(complexVectorPtr);
175  complexVectorPtr += 4;
176 
177  cplxValue2 = _mm_load_ps(complexVectorPtr);
178  complexVectorPtr += 4;
179 
180  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
181  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
182 
183  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
184 
185  result = _mm_sqrt_ps(result);
186 
187  result = _mm_mul_ps(result, vScalar);
188 
189  _mm_store_ps(floatBuffer, result);
190  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
191  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
192  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
193  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
194  }
195 
196  number = quarterPoints * 4;
198  magnitudeVector + number, complexVector + number, scalar, num_points - number);
199 }
200 #endif /* LV_HAVE_SSE3 */
201 
202 
203 #ifdef LV_HAVE_SSE
204 #include <xmmintrin.h>
205 
206 static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
207  const lv_32fc_t* complexVector,
208  const float scalar,
209  unsigned int num_points)
210 {
211  unsigned int number = 0;
212  const unsigned int quarterPoints = num_points / 4;
213 
214  const float* complexVectorPtr = (const float*)complexVector;
215  int16_t* magnitudeVectorPtr = magnitudeVector;
216 
217  __m128 vScalar = _mm_set_ps1(scalar);
218 
219  __m128 cplxValue1, cplxValue2, result;
220  __m128 iValue, qValue;
221 
222  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
223 
224  for (; number < quarterPoints; number++) {
225  cplxValue1 = _mm_load_ps(complexVectorPtr);
226  complexVectorPtr += 4;
227 
228  cplxValue2 = _mm_load_ps(complexVectorPtr);
229  complexVectorPtr += 4;
230 
231  // Arrange in i1i2i3i4 format
232  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
233  // Arrange in q1q2q3q4 format
234  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
235 
236  __VOLK_VOLATILE __m128 iValue2 =
237  _mm_mul_ps(iValue, iValue); // Square the I values
238  __VOLK_VOLATILE __m128 qValue2 =
239  _mm_mul_ps(qValue, qValue); // Square the Q Values
240 
241  result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
242 
243  result = _mm_sqrt_ps(result);
244 
245  result = _mm_mul_ps(result, vScalar);
246 
247  _mm_store_ps(floatBuffer, result);
248  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
249  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
250  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
251  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
252  }
253 
254  number = quarterPoints * 4;
256  magnitudeVector + number, complexVector + number, scalar, num_points - number);
257 }
258 #endif /* LV_HAVE_SSE */
259 
260 
261 #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
262 
263 #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
264 #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
265 
266 #include <inttypes.h>
267 #include <math.h>
268 #include <stdio.h>
269 #include <volk/volk_common.h>
270 
271 #ifdef LV_HAVE_AVX2
272 #include <immintrin.h>
273 
274 static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
275  const lv_32fc_t* complexVector,
276  const float scalar,
277  unsigned int num_points)
278 {
279  unsigned int number = 0;
280  const unsigned int eighthPoints = num_points / 8;
281 
282  const float* complexVectorPtr = (const float*)complexVector;
283  int16_t* magnitudeVectorPtr = magnitudeVector;
284 
285  __m256 vScalar = _mm256_set1_ps(scalar);
286  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
287  __m256 cplxValue1, cplxValue2, result;
288  __m256i resultInt;
289  __m128i resultShort;
290 
291  for (; number < eighthPoints; number++) {
292  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
293  complexVectorPtr += 8;
294 
295  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
296  complexVectorPtr += 8;
297 
298  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
299  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
300 
301  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
302 
303  result = _mm256_sqrt_ps(result);
304 
305  result = _mm256_mul_ps(result, vScalar);
306 
307  resultInt = _mm256_cvtps_epi32(result);
308  resultInt = _mm256_packs_epi32(resultInt, resultInt);
309  resultInt = _mm256_permutevar8x32_epi32(
310  resultInt, idx); // permute to compensate for shuffling in hadd and packs
311  resultShort = _mm256_extracti128_si256(resultInt, 0);
312  _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
313  magnitudeVectorPtr += 8;
314  }
315 
316  number = eighthPoints * 8;
318  magnitudeVector + number, complexVector + number, scalar, num_points - number);
319 }
320 #endif /* LV_HAVE_AVX2 */
321 
322 #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */
static float rintf(float x)
Definition: config.h:37
static void volk_32fc_s32f_magnitude_16i_generic(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:76
static void volk_32fc_s32f_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:206
static void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:156
#define __VOLK_VOLATILE
Definition: volk_common.h:64
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
float complex lv_32fc_t
Definition: volk_complex.h:70