Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_8i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
55 #define INCLUDED_volk_8i_s32f_convert_32f_u_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 #ifdef LV_HAVE_AVX2
61 #include <immintrin.h>
62 
/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar (AVX2,
 *        unaligned buffers).
 *
 * Each output element is (float)inputVector[i] * (1.0 / scalar). Only
 * unaligned load/store intrinsics are used, so neither buffer needs any
 * particular alignment.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t samples are read.
 * \param scalar       Divisor applied to every sample (its reciprocal is
 *                     computed once and multiplied in).
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    const __m256 invScalar = _mm256_set1_ps(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m256 ret;
    __m128i inputVal128;
    __m256i interimVal;

    for (; number < sixteenthPoints; number++) {
        /* One 128-bit load fetches 16 samples; keep the pointer const-correct. */
        inputVal128 = _mm_loadu_si128((const __m128i*)inputVectorPtr);

        /* Samples 0..7: sign-extend to 32 bits, convert to float, scale. */
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        /* Shift the upper 8 bytes down so samples 8..15 sit in the low half. */
        inputVal128 = _mm_srli_si128(inputVal128, 8);
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
103 #endif /* LV_HAVE_AVX2 */
104 
105 
106 #ifdef LV_HAVE_SSE4_1
107 #include <smmintrin.h>
108 
/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar (SSE4.1,
 *        unaligned buffers).
 *
 * Each output element is (float)inputVector[i] * (1.0 / scalar). Only
 * unaligned load/store intrinsics are used, so neither buffer needs any
 * particular alignment.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t samples are read.
 * \param scalar       Divisor applied to every sample (its reciprocal is
 *                     computed once and multiplied in).
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    const __m128 invScalar = _mm_set_ps1(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m128 ret;
    __m128i inputVal;
    __m128i interimVal;

    for (; number < sixteenthPoints; number++) {
        /* One 128-bit load fetches 16 samples; keep the pointer const-correct. */
        inputVal = _mm_loadu_si128((const __m128i*)inputVectorPtr);

        /* Samples 0..3: sign-extend to 32 bits, convert to float, scale. */
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Each 4-byte shift brings the next four samples into the low lanes. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
163 #endif /* LV_HAVE_SSE4_1 */
164 
165 #ifdef LV_HAVE_GENERIC
166 
/*!
 * \brief Portable reference conversion of signed 8-bit samples to floats
 *        scaled by 1/scalar.
 *
 * Each output element is (float)inputVector[i] * (1.0 / scalar).
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t samples are read.
 * \param scalar       Divisor applied to every sample (its reciprocal is
 *                     computed once and multiplied in).
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Compute the reciprocal once so the loop only multiplies. */
    const float reciprocal = 1.0 / scalar;
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        const float sample = (float)inputVector[idx];
        outputVector[idx] = sample * reciprocal;
    }
}
181 #endif /* LV_HAVE_GENERIC */
182 
183 
184 #endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
185 
186 #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
187 #define INCLUDED_volk_8i_s32f_convert_32f_a_H
188 
189 #include <inttypes.h>
190 #include <stdio.h>
191 
192 #ifdef LV_HAVE_AVX2
193 #include <immintrin.h>
194 
/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar (AVX2,
 *        aligned buffers).
 *
 * Each output element is (float)inputVector[i] * (1.0 / scalar). The vector
 * loop uses the aligned intrinsics _mm_load_si128 and _mm256_store_ps, so
 * inputVector must be 16-byte aligned and outputVector 32-byte aligned.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t samples are read.
 * \param scalar       Divisor applied to every sample (its reciprocal is
 *                     computed once and multiplied in).
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    const __m256 invScalar = _mm256_set1_ps(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m256 ret;
    __m128i inputVal128;
    __m256i interimVal;

    for (; number < sixteenthPoints; number++) {
        /* One aligned 128-bit load fetches 16 samples; keep the pointer
         * const-correct. */
        inputVal128 = _mm_load_si128((const __m128i*)inputVectorPtr);

        /* Samples 0..7: sign-extend to 32 bits, convert to float, scale. */
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        /* Shift the upper 8 bytes down so samples 8..15 sit in the low half. */
        inputVal128 = _mm_srli_si128(inputVal128, 8);
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
235 #endif /* LV_HAVE_AVX2 */
236 
237 #ifdef LV_HAVE_SSE4_1
238 #include <smmintrin.h>
239 
/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar (SSE4.1,
 *        aligned buffers).
 *
 * Each output element is (float)inputVector[i] * (1.0 / scalar). The vector
 * loop uses the aligned intrinsics _mm_load_si128 and _mm_store_ps, so both
 * inputVector and outputVector must be 16-byte aligned.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t samples are read.
 * \param scalar       Divisor applied to every sample (its reciprocal is
 *                     computed once and multiplied in).
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    const __m128 invScalar = _mm_set_ps1(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m128 ret;
    __m128i inputVal;
    __m128i interimVal;

    for (; number < sixteenthPoints; number++) {
        /* One aligned 128-bit load fetches 16 samples; keep the pointer
         * const-correct. */
        inputVal = _mm_load_si128((const __m128i*)inputVectorPtr);

        /* Samples 0..3: sign-extend to 32 bits, convert to float, scale. */
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Each 4-byte shift brings the next four samples into the low lanes. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
294 #endif /* LV_HAVE_SSE4_1 */
295 
296 #ifdef LV_HAVE_NEON
297 #include <arm_neon.h>
298 
/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar (NEON).
 *
 * Each output element is (float)inputVector[i] * (1.0 / scalar). Sixteen
 * samples are handled per vector iteration; leftovers go through a scalar
 * loop.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t samples are read.
 * \param scalar       Divisor applied to every sample (its reciprocal is
 *                     computed once and multiplied in).
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;

    const float iScalar = 1.0 / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);

    int8x16_t inputVal;
    int16x8_t tmp;
    float32x4_t outputFloat;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;
    for (; number < sixteenthPoints; number++) {
        __VOLK_PREFETCH(inputVectorPtr + 16);

        /* A single contiguous load keeps the 16 samples in memory order, so
         * no de-interleave / re-interleave round trip is needed. */
        inputVal = vld1q_s8(inputVectorPtr);
        inputVectorPtr += 16;

        /* Samples 0..7 widened to 16 bits. */
        tmp = vmovl_s8(vget_low_s8(inputVal));

        outputFloat = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
        outputFloat = vmulq_f32(outputFloat, qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
        outputFloat = vmulq_f32(outputFloat, qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        /* Samples 8..15 widened to 16 bits. */
        tmp = vmovl_s8(vget_high_s8(inputVal));

        outputFloat = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
        outputFloat = vmulq_f32(outputFloat, qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
        outputFloat = vmulq_f32(outputFloat, qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;
    }

    /* Scalar tail for the remaining (num_points % 16) samples; the pointers
     * already sit just past the vectorized region. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
351 
352 #endif /* LV_HAVE_NEON */
353 
354 #ifdef LV_HAVE_GENERIC
355 
/*!
 * \brief Portable reference conversion of signed 8-bit samples to floats
 *        scaled by 1/scalar (entry used alongside the aligned kernels).
 *
 * Each output element is (float)inputVector[i] * (1.0 / scalar). The code
 * itself performs plain scalar accesses only.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t samples are read.
 * \param scalar       Divisor applied to every sample (its reciprocal is
 *                     computed once and multiplied in).
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
                                                      const int8_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    /* Compute the reciprocal once so the loop only multiplies. */
    const float reciprocal = 1.0 / scalar;
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        const float sample = (float)inputVector[idx];
        outputVector[idx] = sample * reciprocal;
    }
}
370 #endif /* LV_HAVE_GENERIC */
371 
372 
373 #ifdef LV_HAVE_ORC
/* ORC-backed implementation, defined outside this header. */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points);

/*!
 * \brief ORC entry point: forwards to the ORC implementation with the
 *        reciprocal of scalar.
 *
 * The reciprocal (1.0 / scalar) is computed here and handed over in place of
 * scalar; presumably the ORC routine multiplies each sample by the value it
 * receives (confirm against the .orc source).
 *
 * \param outputVector Destination buffer, passed through unchanged.
 * \param inputVector  Source buffer, passed through unchanged.
 * \param scalar       Divisor; its reciprocal is what the ORC routine gets.
 * \param num_points   Number of samples, passed through unchanged.
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const float reciprocal = 1.0 / scalar;

    volk_8i_s32f_convert_32f_a_orc_impl(
        outputVector, inputVector, reciprocal, num_points);
}
387 #endif /* LV_HAVE_ORC */
388 
389 
390 #endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
static void volk_8i_s32f_convert_32f_a_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:356
static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:167
static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:299
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62