Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_s32f_32f_fm_detect_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
57 #ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
58 #define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
59 
60 #include <inttypes.h>
61 #include <stdio.h>
62 
63 #ifdef LV_HAVE_AVX
64 #include <immintrin.h>
65 
/*!
 * \brief AVX (aligned) kernel: difference of consecutive input samples,
 *        folded once by 2*bound when it leaves [-bound, bound].
 *
 * outputVector[0] is inputVector[0] - *saveValue; every later output is
 * inputVector[i] - inputVector[i - 1]. A difference greater than bound has
 * 2*bound subtracted; a difference less than -bound has 2*bound added. At
 * most one correction in each direction is applied, so inputs further than
 * 3*bound apart are not brought fully back into range.
 *
 * The work is split into three stages:
 *   1. the first min(8, num_points) samples, scalar (the very first one
 *      needs *saveValue as its predecessor);
 *   2. whole groups of 8 samples with AVX, using aligned load/store on
 *      inputVector + 8k and outputVector + 8k;
 *   3. whatever is left, scalar.
 *
 * \param outputVector receives num_points results; must be 32-byte aligned
 *                     because stage 2 uses _mm256_store_ps
 * \param inputVector  num_points input samples; must be 32-byte aligned
 *                     because stage 2 uses _mm256_load_ps
 * \param bound        fold limit
 * \param saveValue    in: the sample that precedes inputVector[0];
 *                     out: inputVector[num_points - 1]
 * \param num_points   number of samples; when 0 nothing is read or written
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    const float span = 2 * bound;
    const unsigned int headCount = (num_points < 8) ? num_points : 8;
    // The original code notes that (num_points - 1) rather than num_points
    // was needed to keep Fedora 7's gcc from crashing. A side effect is that
    // when num_points is an exact multiple of 8 the final 8 samples are
    // handled by the scalar tail instead of the vector loop.
    const unsigned int eighthPoints = (num_points - 1) / 8;
    unsigned int i;
    unsigned int block;

    // Stage 1: scalar head. Sample 0 takes its predecessor from *saveValue.
    for (i = 0; i < headCount; i++) {
        const float previous = (i == 0) ? *saveValue : inputVector[i - 1];
        float delta = inputVector[i] - previous;
        if (delta > bound) {
            delta -= span;
        }
        if (delta < -bound) {
            delta += span;
        }
        outputVector[i] = delta;
    }

    // Stage 2: vector body. Group 0 was covered by the head, so start at 1.
    const __m256 upperBound = _mm256_set1_ps(bound);
    const __m256 lowerBound = _mm256_set1_ps(-bound);
    const __m256 minusSpan = _mm256_set1_ps(-span); // added when above bound
    const __m256 plusSpan = _mm256_set1_ps(span);   // added when below -bound

    for (block = 1; block < eighthPoints; block++) {
        const float* in = inputVector + 8 * block;
        float* out = outputVector + 8 * block;

        // Predecessors sit one float earlier, hence the unaligned load.
        const __m256 previous = _mm256_loadu_ps(in - 1);
        const __m256 current = _mm256_load_ps(in);
        const __m256 delta = _mm256_sub_ps(current, previous);

        // Each compare yields an all-ones lane mask; AND-ing it with the
        // correction keeps the correction only in the lanes that need it.
        // Both compares look at the uncorrected difference; for bound >= 0
        // this matches the sequential scalar checks.
        const __m256 tooHigh =
            _mm256_and_ps(_mm256_cmp_ps(delta, upperBound, _CMP_GT_OS), minusSpan);
        const __m256 tooLow =
            _mm256_and_ps(_mm256_cmp_ps(delta, lowerBound, _CMP_LT_OS), plusSpan);
        const __m256 correction = _mm256_or_ps(tooLow, tooHigh);

        _mm256_store_ps(out, _mm256_add_ps(delta, correction));
    }

    // Stage 3: scalar tail, resuming where the vector loop stopped (never
    // before index 8, which the head already produced).
    for (i = (eighthPoints > 1) ? 8 * eighthPoints : 8; i < num_points; i++) {
        float delta = inputVector[i] - inputVector[i - 1];
        if (delta > bound) {
            delta -= span;
        }
        if (delta < -bound) {
            delta += span;
        }
        outputVector[i] = delta;
    }

    *saveValue = inputVector[num_points - 1];
}
140 #endif /* LV_HAVE_AVX */
141 
142 
143 #ifdef LV_HAVE_SSE
144 #include <xmmintrin.h>
145 
/*!
 * \brief SSE (aligned) kernel: difference of consecutive input samples,
 *        folded once by 2*bound when it leaves [-bound, bound].
 *
 * outputVector[0] is inputVector[0] - *saveValue; every later output is
 * inputVector[i] - inputVector[i - 1]. A difference greater than bound has
 * 2*bound subtracted; a difference less than -bound has 2*bound added. At
 * most one correction in each direction is applied.
 *
 * The work is split into three stages:
 *   1. the first min(4, num_points) samples, scalar (the very first one
 *      needs *saveValue as its predecessor);
 *   2. whole groups of 4 samples with SSE, using aligned load/store on
 *      inputVector + 4k and outputVector + 4k;
 *   3. whatever is left, scalar.
 *
 * \param outputVector receives num_points results; must be 16-byte aligned
 *                     because stage 2 uses _mm_store_ps
 * \param inputVector  num_points input samples; must be 16-byte aligned
 *                     because stage 2 uses _mm_load_ps
 * \param bound        fold limit
 * \param saveValue    in: the sample that precedes inputVector[0];
 *                     out: inputVector[num_points - 1]
 * \param num_points   number of samples; when 0 nothing is read or written
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    const float span = 2 * bound;
    const unsigned int headCount = (num_points < 4) ? num_points : 4;
    // The original code notes that (num_points - 1) rather than num_points
    // was needed to keep Fedora 7's gcc from crashing. A side effect is that
    // when num_points is an exact multiple of 4 the final 4 samples are
    // handled by the scalar tail instead of the vector loop.
    const unsigned int quarterPoints = (num_points - 1) / 4;
    unsigned int i;
    unsigned int block;

    // Stage 1: scalar head. Sample 0 takes its predecessor from *saveValue.
    for (i = 0; i < headCount; i++) {
        const float previous = (i == 0) ? *saveValue : inputVector[i - 1];
        float delta = inputVector[i] - previous;
        if (delta > bound) {
            delta -= span;
        }
        if (delta < -bound) {
            delta += span;
        }
        outputVector[i] = delta;
    }

    // Stage 2: vector body. Group 0 was covered by the head, so start at 1.
    const __m128 upperBound = _mm_set_ps1(bound);
    const __m128 lowerBound = _mm_set_ps1(-bound);
    const __m128 minusSpan = _mm_set_ps1(-span); // added when above bound
    const __m128 plusSpan = _mm_set_ps1(span);   // added when below -bound

    for (block = 1; block < quarterPoints; block++) {
        const float* in = inputVector + 4 * block;
        float* out = outputVector + 4 * block;

        // Predecessors sit one float earlier, hence the unaligned load.
        const __m128 previous = _mm_loadu_ps(in - 1);
        const __m128 current = _mm_load_ps(in);
        const __m128 delta = _mm_sub_ps(current, previous);

        // Each compare yields an all-ones lane mask; AND-ing it with the
        // correction keeps the correction only in the lanes that need it.
        // Both compares look at the uncorrected difference; for bound >= 0
        // this matches the sequential scalar checks.
        const __m128 tooHigh = _mm_and_ps(_mm_cmpgt_ps(delta, upperBound), minusSpan);
        const __m128 tooLow = _mm_and_ps(_mm_cmplt_ps(delta, lowerBound), plusSpan);
        const __m128 correction = _mm_or_ps(tooLow, tooHigh);

        _mm_store_ps(out, _mm_add_ps(delta, correction));
    }

    // Stage 3: scalar tail, resuming where the vector loop stopped (never
    // before index 4, which the head already produced).
    for (i = (quarterPoints > 1) ? 4 * quarterPoints : 4; i < num_points; i++) {
        float delta = inputVector[i] - inputVector[i - 1];
        if (delta > bound) {
            delta -= span;
        }
        if (delta < -bound) {
            delta += span;
        }
        outputVector[i] = delta;
    }

    *saveValue = inputVector[num_points - 1];
}
221 #endif /* LV_HAVE_SSE */
222 
223 #ifdef LV_HAVE_GENERIC
224 
/*!
 * \brief Portable C kernel: difference of consecutive input samples, folded
 *        once by 2*bound when it leaves [-bound, bound].
 *
 * outputVector[0] is inputVector[0] - *saveValue; every later output is
 * inputVector[i] - inputVector[i - 1]. A difference greater than bound has
 * 2*bound subtracted; after that, a value less than -bound has 2*bound
 * added. At most one correction in each direction is applied.
 *
 * \param outputVector receives num_points results
 * \param inputVector  num_points input samples
 * \param bound        fold limit
 * \param saveValue    in: the sample that precedes inputVector[0];
 *                     out: inputVector[num_points - 1]
 * \param num_points   number of samples; when 0 nothing is read or written
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
                                                           const float* inputVector,
                                                           const float bound,
                                                           float* saveValue,
                                                           unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    const float span = 2 * bound;
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        // Only the first sample reaches back to the value carried over from
        // the previous call; all others use their in-buffer predecessor.
        const float previous = (i == 0) ? *saveValue : inputVector[i - 1];
        float delta = inputVector[i] - previous;
        if (delta > bound) {
            delta -= span;
        }
        if (delta < -bound) {
            delta += span;
        }
        outputVector[i] = delta;
    }

    // Carry the last input over so the next call can continue the sequence.
    *saveValue = inputVector[num_points - 1];
}
259 #endif /* LV_HAVE_GENERIC */
260 
261 
262 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
263 
264 
265 #ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
266 #define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
267 
268 #include <inttypes.h>
269 #include <stdio.h>
270 
271 #ifdef LV_HAVE_AVX
272 #include <immintrin.h>
273 
/*!
 * \brief AVX (unaligned) kernel: difference of consecutive input samples,
 *        folded once by 2*bound when it leaves [-bound, bound].
 *
 * outputVector[0] is inputVector[0] - *saveValue; every later output is
 * inputVector[i] - inputVector[i - 1]. A difference greater than bound has
 * 2*bound subtracted; a difference less than -bound has 2*bound added. At
 * most one correction in each direction is applied.
 *
 * The work is split into three stages:
 *   1. the first min(8, num_points) samples, scalar (the very first one
 *      needs *saveValue as its predecessor);
 *   2. whole groups of 8 samples with AVX, using only unaligned load/store,
 *      so the buffers need no particular alignment;
 *   3. whatever is left, scalar.
 *
 * \param outputVector receives num_points results
 * \param inputVector  num_points input samples
 * \param bound        fold limit
 * \param saveValue    in: the sample that precedes inputVector[0];
 *                     out: inputVector[num_points - 1]
 * \param num_points   number of samples; when 0 nothing is read or written
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    const float span = 2 * bound;
    const unsigned int headCount = (num_points < 8) ? num_points : 8;
    // The original code notes that (num_points - 1) rather than num_points
    // was needed to keep Fedora 7's gcc from crashing. A side effect is that
    // when num_points is an exact multiple of 8 the final 8 samples are
    // handled by the scalar tail instead of the vector loop.
    const unsigned int eighthPoints = (num_points - 1) / 8;
    unsigned int i;
    unsigned int block;

    // Stage 1: scalar head. Sample 0 takes its predecessor from *saveValue.
    for (i = 0; i < headCount; i++) {
        const float previous = (i == 0) ? *saveValue : inputVector[i - 1];
        float delta = inputVector[i] - previous;
        if (delta > bound) {
            delta -= span;
        }
        if (delta < -bound) {
            delta += span;
        }
        outputVector[i] = delta;
    }

    // Stage 2: vector body. Group 0 was covered by the head, so start at 1.
    const __m256 upperBound = _mm256_set1_ps(bound);
    const __m256 lowerBound = _mm256_set1_ps(-bound);
    const __m256 minusSpan = _mm256_set1_ps(-span); // added when above bound
    const __m256 plusSpan = _mm256_set1_ps(span);   // added when below -bound

    for (block = 1; block < eighthPoints; block++) {
        const float* in = inputVector + 8 * block;
        float* out = outputVector + 8 * block;

        const __m256 previous = _mm256_loadu_ps(in - 1);
        const __m256 current = _mm256_loadu_ps(in);
        const __m256 delta = _mm256_sub_ps(current, previous);

        // Each compare yields an all-ones lane mask; AND-ing it with the
        // correction keeps the correction only in the lanes that need it.
        // Both compares look at the uncorrected difference; for bound >= 0
        // this matches the sequential scalar checks.
        const __m256 tooHigh =
            _mm256_and_ps(_mm256_cmp_ps(delta, upperBound, _CMP_GT_OS), minusSpan);
        const __m256 tooLow =
            _mm256_and_ps(_mm256_cmp_ps(delta, lowerBound, _CMP_LT_OS), plusSpan);
        const __m256 correction = _mm256_or_ps(tooLow, tooHigh);

        _mm256_storeu_ps(out, _mm256_add_ps(delta, correction));
    }

    // Stage 3: scalar tail, resuming where the vector loop stopped (never
    // before index 8, which the head already produced).
    for (i = (eighthPoints > 1) ? 8 * eighthPoints : 8; i < num_points; i++) {
        float delta = inputVector[i] - inputVector[i - 1];
        if (delta > bound) {
            delta -= span;
        }
        if (delta < -bound) {
            delta += span;
        }
        outputVector[i] = delta;
    }

    *saveValue = inputVector[num_points - 1];
}
348 #endif /* LV_HAVE_AVX */
349 
350 
351 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
static void volk_32f_s32f_32f_fm_detect_32f_a_avx(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:66
static void volk_32f_s32f_32f_fm_detect_32f_u_avx(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:274
static void volk_32f_s32f_32f_fm_detect_32f_a_sse(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:146
static void volk_32f_s32f_32f_fm_detect_32f_generic(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:225