Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
73 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
74 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
75 
76 #include <inttypes.h>
77 #include <stdio.h>
78 
79 static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
80 {
81  float min_val = INT8_MIN;
82  float max_val = INT8_MAX;
83  if (in > max_val) {
84  *out = (int8_t)(max_val);
85  } else if (in < min_val) {
86  *out = (int8_t)(min_val);
87  } else {
88  *out = (int8_t)(rintf(in));
89  }
90 }
91 
92 #ifdef LV_HAVE_AVX2
93 #include <immintrin.h>
94 
/*
 * Multiply each input float by `scalar`, clamp the product to
 * [INT8_MIN, INT8_MAX] and store it as int8_t.  AVX2 kernel using unaligned
 * loads and stores; 32 points are handled per vector iteration and the
 * remainder goes through volk_32f_s32f_convert_8i_single().
 *
 * outputVector  destination, num_points int8_t values
 * inputVector   source, num_points floats
 * scalar        factor applied to every input before conversion
 * num_points    number of points to convert
 */
static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 32;
    const float* src = inputVector;
    int8_t* dst = outputVector;

    const __m256 scale = _mm256_set1_ps(scalar);
    const __m256 lo = _mm256_set1_ps((float)INT8_MIN);
    const __m256 hi = _mm256_set1_ps((float)INT8_MAX);

    unsigned int i;

    for (i = 0; i < blocks; i++, src += 32, dst += 32) {
        /* scale four groups of eight floats */
        const __m256 s0 = _mm256_mul_ps(_mm256_loadu_ps(src), scale);
        const __m256 s1 = _mm256_mul_ps(_mm256_loadu_ps(src + 8), scale);
        const __m256 s2 = _mm256_mul_ps(_mm256_loadu_ps(src + 16), scale);
        const __m256 s3 = _mm256_mul_ps(_mm256_loadu_ps(src + 24), scale);

        /* clamp to the int8_t range, then convert to 32-bit integers */
        const __m256i q0 = _mm256_cvtps_epi32(_mm256_max_ps(_mm256_min_ps(s0, hi), lo));
        const __m256i q1 = _mm256_cvtps_epi32(_mm256_max_ps(_mm256_min_ps(s1, hi), lo));
        const __m256i q2 = _mm256_cvtps_epi32(_mm256_max_ps(_mm256_min_ps(s2, hi), lo));
        const __m256i q3 = _mm256_cvtps_epi32(_mm256_max_ps(_mm256_min_ps(s3, hi), lo));

        /*
         * The pack instructions work on each 128-bit half separately, so the
         * 64-bit quarters are reordered (0, 2, 1, 3 == 0xD8) after every pack
         * to put the elements back into input order.
         */
        const __m256i w01 = _mm256_permute4x64_epi64(_mm256_packs_epi32(q0, q1), 0xD8);
        const __m256i w23 = _mm256_permute4x64_epi64(_mm256_packs_epi32(q2, q3), 0xD8);
        const __m256i bytes = _mm256_permute4x64_epi64(_mm256_packs_epi16(w01, w23), 0xD8);

        _mm256_storeu_si256((__m256i*)dst, bytes);
    }

    /* scalar tail for the points that do not fill a whole vector block */
    for (i = blocks * 32; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
160 
161 #endif /* LV_HAVE_AVX2 */
162 
163 
164 #ifdef LV_HAVE_SSE2
165 #include <emmintrin.h>
166 
/*
 * Multiply each input float by `scalar`, clamp the product to
 * [INT8_MIN, INT8_MAX] and store it as int8_t.  SSE2 kernel using unaligned
 * loads and stores; 16 points are handled per vector iteration and the
 * remainder goes through volk_32f_s32f_convert_8i_single().
 *
 * outputVector  destination, num_points int8_t values
 * inputVector   source, num_points floats
 * scalar        factor applied to every input before conversion
 * num_points    number of points to convert
 */
static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;
    const float* src = inputVector;
    int8_t* dst = outputVector;

    const __m128 scale = _mm_set_ps1(scalar);
    const __m128 lo = _mm_set_ps1((float)INT8_MIN);
    const __m128 hi = _mm_set_ps1((float)INT8_MAX);

    unsigned int i;

    for (i = 0; i < blocks; i++, src += 16, dst += 16) {
        /* scale four groups of four floats */
        const __m128 s0 = _mm_mul_ps(_mm_loadu_ps(src), scale);
        const __m128 s1 = _mm_mul_ps(_mm_loadu_ps(src + 4), scale);
        const __m128 s2 = _mm_mul_ps(_mm_loadu_ps(src + 8), scale);
        const __m128 s3 = _mm_mul_ps(_mm_loadu_ps(src + 12), scale);

        /* clamp to the int8_t range, then convert to 32-bit integers */
        const __m128i q0 = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(s0, hi), lo));
        const __m128i q1 = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(s1, hi), lo));
        const __m128i q2 = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(s2, hi), lo));
        const __m128i q3 = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(s3, hi), lo));

        /* narrow 32 -> 16 -> 8 bits */
        const __m128i w01 = _mm_packs_epi32(q0, q1);
        const __m128i w23 = _mm_packs_epi32(q2, q3);
        const __m128i bytes = _mm_packs_epi16(w01, w23);

        _mm_storeu_si128((__m128i*)dst, bytes);
    }

    /* scalar tail for the points that do not fill a whole vector block */
    for (i = blocks * 16; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
228 
229 #endif /* LV_HAVE_SSE2 */
230 
231 
232 #ifdef LV_HAVE_SSE
233 #include <xmmintrin.h>
234 
235 static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
236  const float* inputVector,
237  const float scalar,
238  unsigned int num_points)
239 {
240  unsigned int number = 0;
241  size_t inner_loop;
242 
243  const unsigned int quarterPoints = num_points / 4;
244 
245  const float* inputVectorPtr = (const float*)inputVector;
246  int8_t* outputVectorPtr = outputVector;
247 
248  float min_val = INT8_MIN;
249  float max_val = INT8_MAX;
250  float r;
251 
252  __m128 vScalar = _mm_set_ps1(scalar);
253  __m128 ret;
254  __m128 vmin_val = _mm_set_ps1(min_val);
255  __m128 vmax_val = _mm_set_ps1(max_val);
256 
257  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
258 
259  for (; number < quarterPoints; number++) {
260  ret = _mm_loadu_ps(inputVectorPtr);
261  inputVectorPtr += 4;
262 
263  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
264 
265  _mm_store_ps(outputFloatBuffer, ret);
266  for (inner_loop = 0; inner_loop < 4; inner_loop++) {
267  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
268  }
269  }
270 
271  number = quarterPoints * 4;
272  for (; number < num_points; number++) {
273  r = inputVector[number] * scalar;
274  volk_32f_s32f_convert_8i_single(&outputVector[number], r);
275  }
276 }
277 
278 #endif /* LV_HAVE_SSE */
279 
280 
281 #ifdef LV_HAVE_GENERIC
282 
/*
 * Portable kernel: multiply each input float by `scalar` and convert the
 * product to int8_t via volk_32f_s32f_convert_8i_single().
 *
 * outputVector  destination, num_points int8_t values
 * inputVector   source, num_points floats
 * scalar        factor applied to every input before conversion
 * num_points    number of points to convert
 */
static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(outputVector + i, scaled);
    }
}
297 
298 #endif /* LV_HAVE_GENERIC */
299 
300 
301 #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
302 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
303 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
304 
305 #include <inttypes.h>
306 #include <stdio.h>
307 #include <volk/volk_common.h>
308 
309 #ifdef LV_HAVE_AVX2
310 #include <immintrin.h>
311 
/*
 * Multiply each input float by `scalar`, clamp the product to
 * [INT8_MIN, INT8_MAX] and store it as int8_t.  AVX2 kernel using aligned
 * loads (_mm256_load_ps) and aligned stores (_mm256_store_si256), so both
 * buffers must satisfy the alignment those intrinsics require.  32 points
 * are handled per vector iteration and the remainder goes through
 * volk_32f_s32f_convert_8i_single().
 *
 * outputVector  destination, num_points int8_t values
 * inputVector   source, num_points floats
 * scalar        factor applied to every input before conversion
 * num_points    number of points to convert
 */
static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 32;
    const float* src = inputVector;
    int8_t* dst = outputVector;

    const __m256 scale = _mm256_set1_ps(scalar);
    const __m256 lo = _mm256_set1_ps((float)INT8_MIN);
    const __m256 hi = _mm256_set1_ps((float)INT8_MAX);

    unsigned int i;

    for (i = 0; i < blocks; i++, src += 32, dst += 32) {
        /* scale four groups of eight floats */
        const __m256 s0 = _mm256_mul_ps(_mm256_load_ps(src), scale);
        const __m256 s1 = _mm256_mul_ps(_mm256_load_ps(src + 8), scale);
        const __m256 s2 = _mm256_mul_ps(_mm256_load_ps(src + 16), scale);
        const __m256 s3 = _mm256_mul_ps(_mm256_load_ps(src + 24), scale);

        /* clamp to the int8_t range, then convert to 32-bit integers */
        const __m256i q0 = _mm256_cvtps_epi32(_mm256_max_ps(_mm256_min_ps(s0, hi), lo));
        const __m256i q1 = _mm256_cvtps_epi32(_mm256_max_ps(_mm256_min_ps(s1, hi), lo));
        const __m256i q2 = _mm256_cvtps_epi32(_mm256_max_ps(_mm256_min_ps(s2, hi), lo));
        const __m256i q3 = _mm256_cvtps_epi32(_mm256_max_ps(_mm256_min_ps(s3, hi), lo));

        /*
         * The pack instructions work on each 128-bit half separately, so the
         * 64-bit quarters are reordered (0, 2, 1, 3 == 0xD8) after every pack
         * to put the elements back into input order.
         */
        const __m256i w01 = _mm256_permute4x64_epi64(_mm256_packs_epi32(q0, q1), 0xD8);
        const __m256i w23 = _mm256_permute4x64_epi64(_mm256_packs_epi32(q2, q3), 0xD8);
        const __m256i bytes = _mm256_permute4x64_epi64(_mm256_packs_epi16(w01, w23), 0xD8);

        _mm256_store_si256((__m256i*)dst, bytes);
    }

    /* scalar tail for the points that do not fill a whole vector block */
    for (i = blocks * 32; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
377 
378 #endif /* LV_HAVE_AVX2 */
379 
380 
381 #ifdef LV_HAVE_SSE2
382 #include <emmintrin.h>
383 
/*
 * Multiply each input float by `scalar`, clamp the product to
 * [INT8_MIN, INT8_MAX] and store it as int8_t.  SSE2 kernel using aligned
 * loads (_mm_load_ps) and aligned stores (_mm_store_si128), so both buffers
 * must satisfy the alignment those intrinsics require.  16 points are
 * handled per vector iteration and the remainder goes through
 * volk_32f_s32f_convert_8i_single().
 *
 * outputVector  destination, num_points int8_t values
 * inputVector   source, num_points floats
 * scalar        factor applied to every input before conversion
 * num_points    number of points to convert
 */
static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;
    const float* src = inputVector;
    int8_t* dst = outputVector;

    const __m128 scale = _mm_set_ps1(scalar);
    const __m128 lo = _mm_set_ps1((float)INT8_MIN);
    const __m128 hi = _mm_set_ps1((float)INT8_MAX);

    unsigned int i;

    for (i = 0; i < blocks; i++, src += 16, dst += 16) {
        /* scale four groups of four floats */
        const __m128 s0 = _mm_mul_ps(_mm_load_ps(src), scale);
        const __m128 s1 = _mm_mul_ps(_mm_load_ps(src + 4), scale);
        const __m128 s2 = _mm_mul_ps(_mm_load_ps(src + 8), scale);
        const __m128 s3 = _mm_mul_ps(_mm_load_ps(src + 12), scale);

        /* clamp to the int8_t range, then convert to 32-bit integers */
        const __m128i q0 = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(s0, hi), lo));
        const __m128i q1 = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(s1, hi), lo));
        const __m128i q2 = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(s2, hi), lo));
        const __m128i q3 = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(s3, hi), lo));

        /* narrow 32 -> 16 -> 8 bits */
        const __m128i w01 = _mm_packs_epi32(q0, q1);
        const __m128i w23 = _mm_packs_epi32(q2, q3);
        const __m128i bytes = _mm_packs_epi16(w01, w23);

        _mm_store_si128((__m128i*)dst, bytes);
    }

    /* scalar tail for the points that do not fill a whole vector block */
    for (i = blocks * 16; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
445 #endif /* LV_HAVE_SSE2 */
446 
447 
448 #ifdef LV_HAVE_SSE
449 #include <xmmintrin.h>
450 
451 static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
452  const float* inputVector,
453  const float scalar,
454  unsigned int num_points)
455 {
456  unsigned int number = 0;
457  size_t inner_loop;
458 
459  const unsigned int quarterPoints = num_points / 4;
460 
461  const float* inputVectorPtr = (const float*)inputVector;
462 
463  float min_val = INT8_MIN;
464  float max_val = INT8_MAX;
465  float r;
466 
467  int8_t* outputVectorPtr = outputVector;
468  __m128 vScalar = _mm_set_ps1(scalar);
469  __m128 ret;
470  __m128 vmin_val = _mm_set_ps1(min_val);
471  __m128 vmax_val = _mm_set_ps1(max_val);
472 
473  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
474 
475  for (; number < quarterPoints; number++) {
476  ret = _mm_load_ps(inputVectorPtr);
477  inputVectorPtr += 4;
478 
479  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
480 
481  _mm_store_ps(outputFloatBuffer, ret);
482  for (inner_loop = 0; inner_loop < 4; inner_loop++) {
483  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
484  }
485  }
486 
487  number = quarterPoints * 4;
488  for (; number < num_points; number++) {
489  r = inputVector[number] * scalar;
490  volk_32f_s32f_convert_8i_single(&outputVector[number], r);
491  }
492 }
493 
494 #endif /* LV_HAVE_SSE */
495 
496 
497 #ifdef LV_HAVE_GENERIC
498 
/*
 * Portable kernel registered under the aligned ("_a") name: multiply each
 * input float by `scalar` and convert the product to int8_t via
 * volk_32f_s32f_convert_8i_single().  The body performs plain element
 * accesses only.
 *
 * outputVector  destination, num_points int8_t values
 * inputVector   source, num_points floats
 * scalar        factor applied to every input before conversion
 * num_points    number of points to convert
 */
static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(outputVector + i, scaled);
    }
}
513 
514 #endif /* LV_HAVE_GENERIC */
515 
516 
517 #endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
static float rintf(float x)
Definition: config.h:37
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:384
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition: volk_32f_s32f_convert_8i.h:79
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:235
static void volk_32f_s32f_convert_8i_a_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:499
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:451
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:283
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:167
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56