Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_cos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
72 #include <inttypes.h>
73 #include <math.h>
74 #include <stdio.h>
75 
76 #ifndef INCLUDED_volk_32f_cos_32f_a_H
77 #define INCLUDED_volk_32f_cos_32f_a_H
78 
79 #if LV_HAVE_AVX2 && LV_HAVE_FMA
80 #include <immintrin.h>
81 
82 static inline void
83 volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
84 {
85  float* bPtr = bVector;
86  const float* aPtr = aVector;
87 
88  unsigned int number = 0;
89  unsigned int eighthPoints = num_points / 8;
90  unsigned int i = 0;
91 
92  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
93  fones, fzeroes;
94  __m256 sine, cosine;
95  __m256i q, ones, twos, fours;
96 
97  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
98  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
99  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
100  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
101  ffours = _mm256_set1_ps(4.0);
102  ftwos = _mm256_set1_ps(2.0);
103  fones = _mm256_set1_ps(1.0);
104  fzeroes = _mm256_setzero_ps();
105  __m256i zeroes = _mm256_set1_epi32(0);
106  ones = _mm256_set1_epi32(1);
107  __m256i allones = _mm256_set1_epi32(0xffffffff);
108  twos = _mm256_set1_epi32(2);
109  fours = _mm256_set1_epi32(4);
110 
111  cp1 = _mm256_set1_ps(1.0);
112  cp2 = _mm256_set1_ps(0.08333333333333333);
113  cp3 = _mm256_set1_ps(0.002777777777777778);
114  cp4 = _mm256_set1_ps(4.96031746031746e-05);
115  cp5 = _mm256_set1_ps(5.511463844797178e-07);
116  union bit256 condition1;
117  union bit256 condition3;
118 
119  for (; number < eighthPoints; number++) {
120 
121  aVal = _mm256_load_ps(aPtr);
122  // s = fabs(aVal)
123  s = _mm256_sub_ps(aVal,
124  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
125  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
126  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
127  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
128  // r = q + q&1, q indicates quadrant, r gives
129  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
130 
131  s = _mm256_fnmadd_ps(r, pio4A, s);
132  s = _mm256_fnmadd_ps(r, pio4B, s);
133  s = _mm256_fnmadd_ps(r, pio4C, s);
134 
135  s = _mm256_div_ps(
136  s,
137  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
138  s = _mm256_mul_ps(s, s);
139  // Evaluate Taylor series
140  s = _mm256_mul_ps(
141  _mm256_fmadd_ps(
142  _mm256_fmsub_ps(
143  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
144  s,
145  cp1),
146  s);
147 
148  for (i = 0; i < 3; i++)
149  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
150  s = _mm256_div_ps(s, ftwos);
151 
152  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
153  cosine = _mm256_sub_ps(fones, s);
154 
155  // if(((q+1)&2) != 0) { cosine=sine;}
156  condition1.int_vec =
157  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
158  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
159 
160  // if(((q+2)&4) != 0) { cosine = -cosine;}
161  condition3.int_vec = _mm256_cmpeq_epi32(
162  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
163  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
164 
165  cosine = _mm256_add_ps(
166  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
167  cosine = _mm256_sub_ps(cosine,
168  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
169  condition3.float_vec));
170  _mm256_store_ps(bPtr, cosine);
171  aPtr += 8;
172  bPtr += 8;
173  }
174 
175  number = eighthPoints * 8;
176  for (; number < num_points; number++) {
177  *bPtr++ = cos(*aPtr++);
178  }
179 }
180 
181 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
182 
183 #ifdef LV_HAVE_AVX2
184 #include <immintrin.h>
185 
186 static inline void
187 volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
188 {
189  float* bPtr = bVector;
190  const float* aPtr = aVector;
191 
192  unsigned int number = 0;
193  unsigned int eighthPoints = num_points / 8;
194  unsigned int i = 0;
195 
196  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
197  fones, fzeroes;
198  __m256 sine, cosine;
199  __m256i q, ones, twos, fours;
200 
201  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
202  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
203  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
204  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
205  ffours = _mm256_set1_ps(4.0);
206  ftwos = _mm256_set1_ps(2.0);
207  fones = _mm256_set1_ps(1.0);
208  fzeroes = _mm256_setzero_ps();
209  __m256i zeroes = _mm256_set1_epi32(0);
210  ones = _mm256_set1_epi32(1);
211  __m256i allones = _mm256_set1_epi32(0xffffffff);
212  twos = _mm256_set1_epi32(2);
213  fours = _mm256_set1_epi32(4);
214 
215  cp1 = _mm256_set1_ps(1.0);
216  cp2 = _mm256_set1_ps(0.08333333333333333);
217  cp3 = _mm256_set1_ps(0.002777777777777778);
218  cp4 = _mm256_set1_ps(4.96031746031746e-05);
219  cp5 = _mm256_set1_ps(5.511463844797178e-07);
220  union bit256 condition1;
221  union bit256 condition3;
222 
223  for (; number < eighthPoints; number++) {
224 
225  aVal = _mm256_load_ps(aPtr);
226  // s = fabs(aVal)
227  s = _mm256_sub_ps(aVal,
228  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
229  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
230  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
231  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
232  // r = q + q&1, q indicates quadrant, r gives
233  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
234 
235  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
236  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
237  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
238 
239  s = _mm256_div_ps(
240  s,
241  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
242  s = _mm256_mul_ps(s, s);
243  // Evaluate Taylor series
244  s = _mm256_mul_ps(
245  _mm256_add_ps(
246  _mm256_mul_ps(
247  _mm256_sub_ps(
248  _mm256_mul_ps(
249  _mm256_add_ps(
250  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
251  s),
252  cp3),
253  s),
254  cp2),
255  s),
256  cp1),
257  s);
258 
259  for (i = 0; i < 3; i++)
260  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
261  s = _mm256_div_ps(s, ftwos);
262 
263  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
264  cosine = _mm256_sub_ps(fones, s);
265 
266  // if(((q+1)&2) != 0) { cosine=sine;}
267  condition1.int_vec =
268  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
269  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
270 
271  // if(((q+2)&4) != 0) { cosine = -cosine;}
272  condition3.int_vec = _mm256_cmpeq_epi32(
273  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
274  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
275 
276  cosine = _mm256_add_ps(
277  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
278  cosine = _mm256_sub_ps(cosine,
279  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
280  condition3.float_vec));
281  _mm256_store_ps(bPtr, cosine);
282  aPtr += 8;
283  bPtr += 8;
284  }
285 
286  number = eighthPoints * 8;
287  for (; number < num_points; number++) {
288  *bPtr++ = cos(*aPtr++);
289  }
290 }
291 
292 #endif /* LV_HAVE_AVX2 for aligned */
293 
294 #ifdef LV_HAVE_SSE4_1
295 #include <smmintrin.h>
296 
297 static inline void
298 volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
299 {
300  float* bPtr = bVector;
301  const float* aPtr = aVector;
302 
303  unsigned int number = 0;
304  unsigned int quarterPoints = num_points / 4;
305  unsigned int i = 0;
306 
307  __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
308  fones, fzeroes;
309  __m128 sine, cosine;
310  __m128i q, ones, twos, fours;
311 
312  m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
313  pio4A = _mm_set1_ps(0.7853981554508209228515625);
314  pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
315  pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
316  ffours = _mm_set1_ps(4.0);
317  ftwos = _mm_set1_ps(2.0);
318  fones = _mm_set1_ps(1.0);
319  fzeroes = _mm_setzero_ps();
320  __m128i zeroes = _mm_set1_epi32(0);
321  ones = _mm_set1_epi32(1);
322  __m128i allones = _mm_set1_epi32(0xffffffff);
323  twos = _mm_set1_epi32(2);
324  fours = _mm_set1_epi32(4);
325 
326  cp1 = _mm_set1_ps(1.0);
327  cp2 = _mm_set1_ps(0.08333333333333333);
328  cp3 = _mm_set1_ps(0.002777777777777778);
329  cp4 = _mm_set1_ps(4.96031746031746e-05);
330  cp5 = _mm_set1_ps(5.511463844797178e-07);
331  union bit128 condition1;
332  union bit128 condition3;
333 
334  for (; number < quarterPoints; number++) {
335 
336  aVal = _mm_load_ps(aPtr);
337  // s = fabs(aVal)
338  s = _mm_sub_ps(aVal,
339  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
340  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
341  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
342  // r = q + q&1, q indicates quadrant, r gives
343  r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
344 
345  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
346  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
347  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
348 
349  s = _mm_div_ps(
350  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
351  s = _mm_mul_ps(s, s);
352  // Evaluate Taylor series
353  s = _mm_mul_ps(
354  _mm_add_ps(
355  _mm_mul_ps(
356  _mm_sub_ps(
357  _mm_mul_ps(
358  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
359  cp3),
360  s),
361  cp2),
362  s),
363  cp1),
364  s);
365 
366  for (i = 0; i < 3; i++)
367  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
368  s = _mm_div_ps(s, ftwos);
369 
370  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
371  cosine = _mm_sub_ps(fones, s);
372 
373  // if(((q+1)&2) != 0) { cosine=sine;}
374  condition1.int_vec =
375  _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
376  condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
377 
378  // if(((q+2)&4) != 0) { cosine = -cosine;}
379  condition3.int_vec =
380  _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
381  condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
382 
383  cosine = _mm_add_ps(cosine,
384  _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
385  cosine = _mm_sub_ps(
386  cosine,
387  _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
388  _mm_store_ps(bPtr, cosine);
389  aPtr += 4;
390  bPtr += 4;
391  }
392 
393  number = quarterPoints * 4;
394  for (; number < num_points; number++) {
395  *bPtr++ = cosf(*aPtr++);
396  }
397 }
398 
399 #endif /* LV_HAVE_SSE4_1 for aligned */
400 
401 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
402 
403 
404 #ifndef INCLUDED_volk_32f_cos_32f_u_H
405 #define INCLUDED_volk_32f_cos_32f_u_H
406 
407 #if LV_HAVE_AVX2 && LV_HAVE_FMA
408 #include <immintrin.h>
409 
410 static inline void
411 volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
412 {
413  float* bPtr = bVector;
414  const float* aPtr = aVector;
415 
416  unsigned int number = 0;
417  unsigned int eighthPoints = num_points / 8;
418  unsigned int i = 0;
419 
420  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
421  fones, fzeroes;
422  __m256 sine, cosine;
423  __m256i q, ones, twos, fours;
424 
425  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
426  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
427  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
428  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
429  ffours = _mm256_set1_ps(4.0);
430  ftwos = _mm256_set1_ps(2.0);
431  fones = _mm256_set1_ps(1.0);
432  fzeroes = _mm256_setzero_ps();
433  __m256i zeroes = _mm256_set1_epi32(0);
434  ones = _mm256_set1_epi32(1);
435  __m256i allones = _mm256_set1_epi32(0xffffffff);
436  twos = _mm256_set1_epi32(2);
437  fours = _mm256_set1_epi32(4);
438 
439  cp1 = _mm256_set1_ps(1.0);
440  cp2 = _mm256_set1_ps(0.08333333333333333);
441  cp3 = _mm256_set1_ps(0.002777777777777778);
442  cp4 = _mm256_set1_ps(4.96031746031746e-05);
443  cp5 = _mm256_set1_ps(5.511463844797178e-07);
444  union bit256 condition1;
445  union bit256 condition3;
446 
447  for (; number < eighthPoints; number++) {
448 
449  aVal = _mm256_loadu_ps(aPtr);
450  // s = fabs(aVal)
451  s = _mm256_sub_ps(aVal,
452  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
453  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
454  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
455  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
456  // r = q + q&1, q indicates quadrant, r gives
457  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
458 
459  s = _mm256_fnmadd_ps(r, pio4A, s);
460  s = _mm256_fnmadd_ps(r, pio4B, s);
461  s = _mm256_fnmadd_ps(r, pio4C, s);
462 
463  s = _mm256_div_ps(
464  s,
465  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
466  s = _mm256_mul_ps(s, s);
467  // Evaluate Taylor series
468  s = _mm256_mul_ps(
469  _mm256_fmadd_ps(
470  _mm256_fmsub_ps(
471  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
472  s,
473  cp1),
474  s);
475 
476  for (i = 0; i < 3; i++)
477  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
478  s = _mm256_div_ps(s, ftwos);
479 
480  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
481  cosine = _mm256_sub_ps(fones, s);
482 
483  // if(((q+1)&2) != 0) { cosine=sine;}
484  condition1.int_vec =
485  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
486  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
487 
488  // if(((q+2)&4) != 0) { cosine = -cosine;}
489  condition3.int_vec = _mm256_cmpeq_epi32(
490  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
491  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
492 
493  cosine = _mm256_add_ps(
494  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
495  cosine = _mm256_sub_ps(cosine,
496  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
497  condition3.float_vec));
498  _mm256_storeu_ps(bPtr, cosine);
499  aPtr += 8;
500  bPtr += 8;
501  }
502 
503  number = eighthPoints * 8;
504  for (; number < num_points; number++) {
505  *bPtr++ = cos(*aPtr++);
506  }
507 }
508 
509 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
510 
511 #ifdef LV_HAVE_AVX2
512 #include <immintrin.h>
513 
514 static inline void
515 volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
516 {
517  float* bPtr = bVector;
518  const float* aPtr = aVector;
519 
520  unsigned int number = 0;
521  unsigned int eighthPoints = num_points / 8;
522  unsigned int i = 0;
523 
524  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
525  fones, fzeroes;
526  __m256 sine, cosine;
527  __m256i q, ones, twos, fours;
528 
529  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
530  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
531  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
532  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
533  ffours = _mm256_set1_ps(4.0);
534  ftwos = _mm256_set1_ps(2.0);
535  fones = _mm256_set1_ps(1.0);
536  fzeroes = _mm256_setzero_ps();
537  __m256i zeroes = _mm256_set1_epi32(0);
538  ones = _mm256_set1_epi32(1);
539  __m256i allones = _mm256_set1_epi32(0xffffffff);
540  twos = _mm256_set1_epi32(2);
541  fours = _mm256_set1_epi32(4);
542 
543  cp1 = _mm256_set1_ps(1.0);
544  cp2 = _mm256_set1_ps(0.08333333333333333);
545  cp3 = _mm256_set1_ps(0.002777777777777778);
546  cp4 = _mm256_set1_ps(4.96031746031746e-05);
547  cp5 = _mm256_set1_ps(5.511463844797178e-07);
548  union bit256 condition1;
549  union bit256 condition3;
550 
551  for (; number < eighthPoints; number++) {
552 
553  aVal = _mm256_loadu_ps(aPtr);
554  // s = fabs(aVal)
555  s = _mm256_sub_ps(aVal,
556  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
557  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
558  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
559  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
560  // r = q + q&1, q indicates quadrant, r gives
561  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
562 
563  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
564  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
565  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
566 
567  s = _mm256_div_ps(
568  s,
569  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
570  s = _mm256_mul_ps(s, s);
571  // Evaluate Taylor series
572  s = _mm256_mul_ps(
573  _mm256_add_ps(
574  _mm256_mul_ps(
575  _mm256_sub_ps(
576  _mm256_mul_ps(
577  _mm256_add_ps(
578  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
579  s),
580  cp3),
581  s),
582  cp2),
583  s),
584  cp1),
585  s);
586 
587  for (i = 0; i < 3; i++)
588  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
589  s = _mm256_div_ps(s, ftwos);
590 
591  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
592  cosine = _mm256_sub_ps(fones, s);
593 
594  // if(((q+1)&2) != 0) { cosine=sine;}
595  condition1.int_vec =
596  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
597  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
598 
599  // if(((q+2)&4) != 0) { cosine = -cosine;}
600  condition3.int_vec = _mm256_cmpeq_epi32(
601  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
602  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
603 
604  cosine = _mm256_add_ps(
605  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
606  cosine = _mm256_sub_ps(cosine,
607  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
608  condition3.float_vec));
609  _mm256_storeu_ps(bPtr, cosine);
610  aPtr += 8;
611  bPtr += 8;
612  }
613 
614  number = eighthPoints * 8;
615  for (; number < num_points; number++) {
616  *bPtr++ = cos(*aPtr++);
617  }
618 }
619 
620 #endif /* LV_HAVE_AVX2 for unaligned */
621 
622 #ifdef LV_HAVE_SSE4_1
623 #include <smmintrin.h>
624 
625 static inline void
626 volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
627 {
628  float* bPtr = bVector;
629  const float* aPtr = aVector;
630 
631  unsigned int number = 0;
632  unsigned int quarterPoints = num_points / 4;
633  unsigned int i = 0;
634 
635  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
636  fzeroes;
637  __m128 sine, cosine, condition1, condition3;
638  __m128i q, r, ones, twos, fours;
639 
640  m4pi = _mm_set1_ps(1.273239545);
641  pio4A = _mm_set1_ps(0.78515625);
642  pio4B = _mm_set1_ps(0.241876e-3);
643  ffours = _mm_set1_ps(4.0);
644  ftwos = _mm_set1_ps(2.0);
645  fones = _mm_set1_ps(1.0);
646  fzeroes = _mm_setzero_ps();
647  ones = _mm_set1_epi32(1);
648  twos = _mm_set1_epi32(2);
649  fours = _mm_set1_epi32(4);
650 
651  cp1 = _mm_set1_ps(1.0);
652  cp2 = _mm_set1_ps(0.83333333e-1);
653  cp3 = _mm_set1_ps(0.2777778e-2);
654  cp4 = _mm_set1_ps(0.49603e-4);
655  cp5 = _mm_set1_ps(0.551e-6);
656 
657  for (; number < quarterPoints; number++) {
658  aVal = _mm_loadu_ps(aPtr);
659  s = _mm_sub_ps(aVal,
660  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
661  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
662  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
663 
664  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
665  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
666 
667  s = _mm_div_ps(
668  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
669  s = _mm_mul_ps(s, s);
670  // Evaluate Taylor series
671  s = _mm_mul_ps(
672  _mm_add_ps(
673  _mm_mul_ps(
674  _mm_sub_ps(
675  _mm_mul_ps(
676  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
677  cp3),
678  s),
679  cp2),
680  s),
681  cp1),
682  s);
683 
684  for (i = 0; i < 3; i++) {
685  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
686  }
687  s = _mm_div_ps(s, ftwos);
688 
689  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
690  cosine = _mm_sub_ps(fones, s);
691 
692  condition1 = _mm_cmpneq_ps(
693  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
694 
695  condition3 = _mm_cmpneq_ps(
696  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
697 
698  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
699  cosine = _mm_sub_ps(
700  cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
701  _mm_storeu_ps(bPtr, cosine);
702  aPtr += 4;
703  bPtr += 4;
704  }
705 
706  number = quarterPoints * 4;
707  for (; number < num_points; number++) {
708  *bPtr++ = cosf(*aPtr++);
709  }
710 }
711 
712 #endif /* LV_HAVE_SSE4_1 for unaligned */
713 
714 
715 #ifdef LV_HAVE_GENERIC
716 
717 /*
718  * For derivation see
719  * Shibata, Naoki, "Efficient evaluation methods of elementary functions
720  * suitable for SIMD computation," in Springer-Verlag 2010
721  */
722 static inline void volk_32f_cos_32f_generic_fast(float* bVector,
723  const float* aVector,
724  unsigned int num_points)
725 {
726  float* bPtr = bVector;
727  const float* aPtr = aVector;
728 
729  float m4pi = 1.273239544735162542821171882678754627704620361328125;
730  float pio4A = 0.7853981554508209228515625;
731  float pio4B = 0.794662735614792836713604629039764404296875e-8;
732  float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
733  int N = 3; // order of argument reduction
734 
735  unsigned int number;
736  for (number = 0; number < num_points; number++) {
737  float s = fabs(*aPtr);
738  int q = (int)(s * m4pi);
739  int r = q + (q & 1);
740  s -= r * pio4A;
741  s -= r * pio4B;
742  s -= r * pio4C;
743 
744  s = s * 0.125; // 2^-N (<--3)
745  s = s * s;
746  s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
747  1.0) *
748  s;
749 
750  int i;
751  for (i = 0; i < N; ++i) {
752  s = (4.0 - s) * s;
753  }
754  s = s / 2.0;
755 
756  float sine = sqrt((2.0 - s) * s);
757  float cosine = 1 - s;
758 
759  if (((q + 1) & 2) != 0) {
760  s = cosine;
761  cosine = sine;
762  sine = s;
763  }
764  if (((q + 2) & 4) != 0) {
765  cosine = -cosine;
766  }
767  *bPtr = cosine;
768  bPtr++;
769  aPtr++;
770  }
771 }
772 
773 #endif /* LV_HAVE_GENERIC */
774 
775 
776 #ifdef LV_HAVE_GENERIC
777 
778 static inline void
779 volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
780 {
781  float* bPtr = bVector;
782  const float* aPtr = aVector;
783  unsigned int number = 0;
784 
785  for (; number < num_points; number++) {
786  *bPtr++ = cosf(*aPtr++);
787  }
788 }
789 
790 #endif /* LV_HAVE_GENERIC */
791 
792 
793 #ifdef LV_HAVE_NEON
794 #include <arm_neon.h>
796 
797 static inline void
798 volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
799 {
800  unsigned int number = 0;
801  unsigned int quarter_points = num_points / 4;
802  float* bVectorPtr = bVector;
803  const float* aVectorPtr = aVector;
804 
805  float32x4_t b_vec;
806  float32x4_t a_vec;
807 
808  for (number = 0; number < quarter_points; number++) {
809  a_vec = vld1q_f32(aVectorPtr);
810  // Prefetch next one, speeds things up
811  __VOLK_PREFETCH(aVectorPtr + 4);
812  b_vec = _vcosq_f32(a_vec);
813  vst1q_f32(bVectorPtr, b_vec);
814  // move pointers ahead
815  bVectorPtr += 4;
816  aVectorPtr += 4;
817  }
818 
819  // Deal with the rest
820  for (number = quarter_points * 4; number < num_points; number++) {
821  *bVectorPtr++ = cosf(*aVectorPtr++);
822  }
823 }
824 
825 #endif /* LV_HAVE_NEON */
826 
827 
828 #endif /* INCLUDED_volk_32f_cos_32f_u_H */
Definition: volk_common.h:111
Definition: volk_common.h:128
__m256 float_vec
Definition: volk_common.h:136
__m256i int_vec
Definition: volk_common.h:137
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:779
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:798
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:722
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:268