Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_stddev_and_mean_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
69 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
70 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
71 
72 #include <inttypes.h>
73 #include <math.h>
74 #include <stdio.h>
75 #include <volk/volk_common.h>
76 
77 #ifdef LV_HAVE_AVX
78 #include <immintrin.h>
79 
80 static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
81  float* mean,
82  const float* inputBuffer,
83  unsigned int num_points)
84 {
85  float stdDev = 0;
86  float newMean = 0;
87  if (num_points > 0) {
88  unsigned int number = 0;
89  const unsigned int thirtySecondthPoints = num_points / 32;
90 
91  const float* aPtr = inputBuffer;
92  __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
93  __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
94 
95  __m256 accumulator = _mm256_setzero_ps();
96  __m256 squareAccumulator = _mm256_setzero_ps();
97  __m256 aVal1, aVal2, aVal3, aVal4;
98  __m256 cVal1, cVal2, cVal3, cVal4;
99  for (; number < thirtySecondthPoints; number++) {
100  aVal1 = _mm256_load_ps(aPtr);
101  aPtr += 8;
102  cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
103  accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
104 
105  aVal2 = _mm256_load_ps(aPtr);
106  aPtr += 8;
107  cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
108  accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
109 
110  aVal3 = _mm256_load_ps(aPtr);
111  aPtr += 8;
112  cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
113  accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
114 
115  aVal4 = _mm256_load_ps(aPtr);
116  aPtr += 8;
117  cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
118  accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
119 
120  cVal1 = _mm256_or_ps(cVal1, cVal2);
121  cVal3 = _mm256_or_ps(cVal3, cVal4);
122  cVal1 = _mm256_or_ps(cVal1, cVal3);
123 
124  squareAccumulator =
125  _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
126  }
127  _mm256_store_ps(meanBuffer,
128  accumulator); // Store the results back into the C container
129  _mm256_store_ps(squareBuffer,
130  squareAccumulator); // Store the results back into the C container
131  newMean = meanBuffer[0];
132  newMean += meanBuffer[1];
133  newMean += meanBuffer[2];
134  newMean += meanBuffer[3];
135  newMean += meanBuffer[4];
136  newMean += meanBuffer[5];
137  newMean += meanBuffer[6];
138  newMean += meanBuffer[7];
139  stdDev = squareBuffer[0];
140  stdDev += squareBuffer[1];
141  stdDev += squareBuffer[2];
142  stdDev += squareBuffer[3];
143  stdDev += squareBuffer[4];
144  stdDev += squareBuffer[5];
145  stdDev += squareBuffer[6];
146  stdDev += squareBuffer[7];
147 
148  number = thirtySecondthPoints * 32;
149  for (; number < num_points; number++) {
150  stdDev += (*aPtr) * (*aPtr);
151  newMean += *aPtr++;
152  }
153  newMean /= num_points;
154  stdDev /= num_points;
155  stdDev -= (newMean * newMean);
156  stdDev = sqrtf(stdDev);
157  }
158  *stddev = stdDev;
159  *mean = newMean;
160 }
161 #endif /* LV_HAVE_AVX */
162 
163 
164 #ifdef LV_HAVE_AVX
165 #include <immintrin.h>
166 
167 static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
168  float* mean,
169  const float* inputBuffer,
170  unsigned int num_points)
171 {
172  float stdDev = 0;
173  float newMean = 0;
174  if (num_points > 0) {
175  unsigned int number = 0;
176  const unsigned int thirtySecondthPoints = num_points / 32;
177 
178  const float* aPtr = inputBuffer;
179  __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
180  __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
181 
182  __m256 accumulator = _mm256_setzero_ps();
183  __m256 squareAccumulator = _mm256_setzero_ps();
184  __m256 aVal1, aVal2, aVal3, aVal4;
185  __m256 cVal1, cVal2, cVal3, cVal4;
186  for (; number < thirtySecondthPoints; number++) {
187  aVal1 = _mm256_loadu_ps(aPtr);
188  aPtr += 8;
189  cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
190  accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
191 
192  aVal2 = _mm256_loadu_ps(aPtr);
193  aPtr += 8;
194  cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
195  accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
196 
197  aVal3 = _mm256_loadu_ps(aPtr);
198  aPtr += 8;
199  cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
200  accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
201 
202  aVal4 = _mm256_loadu_ps(aPtr);
203  aPtr += 8;
204  cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
205  accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
206 
207  cVal1 = _mm256_or_ps(cVal1, cVal2);
208  cVal3 = _mm256_or_ps(cVal3, cVal4);
209  cVal1 = _mm256_or_ps(cVal1, cVal3);
210 
211  squareAccumulator =
212  _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
213  }
214  _mm256_store_ps(meanBuffer,
215  accumulator); // Store the results back into the C container
216  _mm256_store_ps(squareBuffer,
217  squareAccumulator); // Store the results back into the C container
218  newMean = meanBuffer[0];
219  newMean += meanBuffer[1];
220  newMean += meanBuffer[2];
221  newMean += meanBuffer[3];
222  newMean += meanBuffer[4];
223  newMean += meanBuffer[5];
224  newMean += meanBuffer[6];
225  newMean += meanBuffer[7];
226  stdDev = squareBuffer[0];
227  stdDev += squareBuffer[1];
228  stdDev += squareBuffer[2];
229  stdDev += squareBuffer[3];
230  stdDev += squareBuffer[4];
231  stdDev += squareBuffer[5];
232  stdDev += squareBuffer[6];
233  stdDev += squareBuffer[7];
234 
235  number = thirtySecondthPoints * 32;
236  for (; number < num_points; number++) {
237  stdDev += (*aPtr) * (*aPtr);
238  newMean += *aPtr++;
239  }
240  newMean /= num_points;
241  stdDev /= num_points;
242  stdDev -= (newMean * newMean);
243  stdDev = sqrtf(stdDev);
244  }
245  *stddev = stdDev;
246  *mean = newMean;
247 }
248 #endif /* LV_HAVE_AVX */
249 
250 
251 #ifdef LV_HAVE_SSE4_1
252 #include <smmintrin.h>
253 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev,
254  float* mean,
255  const float* inputBuffer,
256  unsigned int num_points)
257 {
258  float returnValue = 0;
259  float newMean = 0;
260  if (num_points > 0) {
261  unsigned int number = 0;
262  const unsigned int sixteenthPoints = num_points / 16;
263 
264  const float* aPtr = inputBuffer;
265  __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
266  __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
267 
268  __m128 accumulator = _mm_setzero_ps();
269  __m128 squareAccumulator = _mm_setzero_ps();
270  __m128 aVal1, aVal2, aVal3, aVal4;
271  __m128 cVal1, cVal2, cVal3, cVal4;
272  for (; number < sixteenthPoints; number++) {
273  aVal1 = _mm_load_ps(aPtr);
274  aPtr += 4;
275  cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
276  accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
277 
278  aVal2 = _mm_load_ps(aPtr);
279  aPtr += 4;
280  cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
281  accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
282 
283  aVal3 = _mm_load_ps(aPtr);
284  aPtr += 4;
285  cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
286  accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
287 
288  aVal4 = _mm_load_ps(aPtr);
289  aPtr += 4;
290  cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
291  accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
292 
293  cVal1 = _mm_or_ps(cVal1, cVal2);
294  cVal3 = _mm_or_ps(cVal3, cVal4);
295  cVal1 = _mm_or_ps(cVal1, cVal3);
296 
297  squareAccumulator =
298  _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
299  }
300  _mm_store_ps(meanBuffer,
301  accumulator); // Store the results back into the C container
302  _mm_store_ps(squareBuffer,
303  squareAccumulator); // Store the results back into the C container
304  newMean = meanBuffer[0];
305  newMean += meanBuffer[1];
306  newMean += meanBuffer[2];
307  newMean += meanBuffer[3];
308  returnValue = squareBuffer[0];
309  returnValue += squareBuffer[1];
310  returnValue += squareBuffer[2];
311  returnValue += squareBuffer[3];
312 
313  number = sixteenthPoints * 16;
314  for (; number < num_points; number++) {
315  returnValue += (*aPtr) * (*aPtr);
316  newMean += *aPtr++;
317  }
318  newMean /= num_points;
319  returnValue /= num_points;
320  returnValue -= (newMean * newMean);
321  returnValue = sqrtf(returnValue);
322  }
323  *stddev = returnValue;
324  *mean = newMean;
325 }
326 #endif /* LV_HAVE_SSE4_1 */
327 
328 
329 #ifdef LV_HAVE_SSE
330 #include <xmmintrin.h>
331 
332 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
333  float* mean,
334  const float* inputBuffer,
335  unsigned int num_points)
336 {
337  float returnValue = 0;
338  float newMean = 0;
339  if (num_points > 0) {
340  unsigned int number = 0;
341  const unsigned int quarterPoints = num_points / 4;
342 
343  const float* aPtr = inputBuffer;
344  __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
345  __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
346 
347  __m128 accumulator = _mm_setzero_ps();
348  __m128 squareAccumulator = _mm_setzero_ps();
349  __m128 aVal = _mm_setzero_ps();
350  for (; number < quarterPoints; number++) {
351  aVal = _mm_load_ps(aPtr); // aVal = x
352  accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
353  aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
354  squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
355  aPtr += 4;
356  }
357  _mm_store_ps(meanBuffer,
358  accumulator); // Store the results back into the C container
359  _mm_store_ps(squareBuffer,
360  squareAccumulator); // Store the results back into the C container
361  newMean = meanBuffer[0];
362  newMean += meanBuffer[1];
363  newMean += meanBuffer[2];
364  newMean += meanBuffer[3];
365  returnValue = squareBuffer[0];
366  returnValue += squareBuffer[1];
367  returnValue += squareBuffer[2];
368  returnValue += squareBuffer[3];
369 
370  number = quarterPoints * 4;
371  for (; number < num_points; number++) {
372  returnValue += (*aPtr) * (*aPtr);
373  newMean += *aPtr++;
374  }
375  newMean /= num_points;
376  returnValue /= num_points;
377  returnValue -= (newMean * newMean);
378  returnValue = sqrtf(returnValue);
379  }
380  *stddev = returnValue;
381  *mean = newMean;
382 }
383 #endif /* LV_HAVE_SSE */
384 
385 
386 #ifdef LV_HAVE_GENERIC
387 
388 static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
389  float* mean,
390  const float* inputBuffer,
391  unsigned int num_points)
392 {
393  float returnValue = 0;
394  float newMean = 0;
395  if (num_points > 0) {
396  const float* aPtr = inputBuffer;
397  unsigned int number = 0;
398 
399  for (number = 0; number < num_points; number++) {
400  returnValue += (*aPtr) * (*aPtr);
401  newMean += *aPtr++;
402  }
403  newMean /= num_points;
404  returnValue /= num_points;
405  returnValue -= (newMean * newMean);
406  returnValue = sqrtf(returnValue);
407  }
408  *stddev = returnValue;
409  *mean = newMean;
410 }
411 #endif /* LV_HAVE_GENERIC */
412 
413 
414 #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:167
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:388
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:80
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:332
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56