Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_tan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
74 #include <inttypes.h>
75 #include <math.h>
76 #include <stdio.h>
77 
78 #ifndef INCLUDED_volk_32f_tan_32f_a_H
79 #define INCLUDED_volk_32f_tan_32f_a_H
80 
81 #if LV_HAVE_AVX2 && LV_HAVE_FMA
82 #include <immintrin.h>
83 
/*
 * Computes the tangent of each element of aVector and writes it to bVector
 * (AVX2 + FMA, aligned-pointer version; processes 8 floats per iteration).
 *
 * Method: fold the input to a non-negative value, compute the octant index
 * q = floor(x * 4/pi), reduce the argument by r * pi/4 using a two-constant
 * (Cody-Waite style) split, evaluate a polynomial on the argument divided by
 * 8, undo the division with three double-angle steps, reconstruct sin/cos,
 * fix quadrant signs, and divide. Leftover elements use a scalar tail.
 */
static inline void
volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm256_set1_ps(0.241876e-3); /* low part of pi/4 */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* Series coefficients, evaluated on s = (x/8)^2 below */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0 */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* q = octant index; r = q rounded up to the next even value */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* Two-step argument reduction: s -= r * pi/4 (FMA keeps precision) */
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        /* Undo the /8: three double-angle iterations s' = s*(4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1: lanes where sine and cosine must be swapped */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        /* condition2: lanes where sine changes sign (quadrant XOR input sign) */
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        /* condition3: lanes where cosine changes sign */
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* tanf (not tan): keep the scalar tail in single precision, consistent
         * with the SSE4.1 and generic kernels */
        *bPtr++ = tanf(*aPtr++);
    }
}
182 
183 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
184 
185 #ifdef LV_HAVE_AVX2
186 #include <immintrin.h>
187 
/*
 * Computes the tangent of each element of aVector and writes it to bVector
 * (AVX2 without FMA, aligned-pointer version; 8 floats per iteration).
 *
 * Same algorithm as the FMA variant: octant computation, two-constant
 * reduction by multiples of pi/4, polynomial on the argument divided by 8,
 * three double-angle steps, sin/cos reconstruction with quadrant sign
 * fix-up, then tangent = sine / cosine. Scalar tail handles leftovers.
 */
static inline void
volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm256_set1_ps(0.241876e-3); /* low part of pi/4 */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* Series coefficients, evaluated on s = (x/8)^2 below */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0 */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* q = octant index; r = q rounded up to the next even value */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* Two-step argument reduction: s -= r * pi/4 */
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* Undo the /8: three double-angle iterations s' = s*(4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1: lanes where sine and cosine must be swapped */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        /* condition2: lanes where sine changes sign (quadrant XOR input sign) */
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        /* condition3: lanes where cosine changes sign */
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* tanf (not tan): keep the scalar tail in single precision, consistent
         * with the SSE4.1 and generic kernels */
        *bPtr++ = tanf(*aPtr++);
    }
}
293 
294 #endif /* LV_HAVE_AVX2 for aligned */
295 
296 #ifdef LV_HAVE_SSE4_1
297 #include <smmintrin.h>
298 
/*
 * Computes the tangent of each element of aVector and writes it to bVector
 * (SSE4.1, aligned-pointer version; processes 4 floats per iteration).
 *
 * Method: fold the input to a non-negative value, compute the octant index
 * q = floor(x * 4/pi), reduce by multiples of pi/4 with a two-constant
 * split, evaluate a polynomial on the argument divided by 8, undo the
 * division with three double-angle steps, reconstruct sin/cos, fix quadrant
 * signs, and divide. Leftover elements fall through to a tanf() tail.
 */
static inline void
volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm_set1_ps(0.241876e-3); /* low part of pi/4 */
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    /* Series coefficients, evaluated on s = (x/8)^2 below */
    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0 */
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        /* q = octant index; r = q rounded up to the next even value */
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        /* Two-step argument reduction: s -= r * pi/4 */
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* Undo the /8: three double-angle iterations s' = s*(4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        /* condition1: lanes where sine and cosine must be swapped */
        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        /* condition2: lanes where sine changes sign (quadrant XOR input sign) */
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        /* condition3: lanes where cosine changes sign */
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_store_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the remaining (num_points % 4) elements */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
392 
393 #endif /* LV_HAVE_SSE4_1 for aligned */
394 
395 
396 #endif /* INCLUDED_volk_32f_tan_32f_a_H */
397 
398 #ifndef INCLUDED_volk_32f_tan_32f_u_H
399 #define INCLUDED_volk_32f_tan_32f_u_H
400 
401 #if LV_HAVE_AVX2 && LV_HAVE_FMA
402 #include <immintrin.h>
403 
/*
 * Computes the tangent of each element of aVector and writes it to bVector
 * (AVX2 + FMA, unaligned-pointer version; 8 floats per iteration).
 *
 * Identical algorithm to the aligned AVX2+FMA kernel, but uses unaligned
 * loads/stores so the caller need not guarantee 32-byte alignment.
 */
static inline void
volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm256_set1_ps(0.241876e-3); /* low part of pi/4 */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* Series coefficients, evaluated on s = (x/8)^2 below */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0 */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* q = octant index; r = q rounded up to the next even value */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* Two-step argument reduction: s -= r * pi/4 (FMA keeps precision) */
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        /* Undo the /8: three double-angle iterations s' = s*(4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1: lanes where sine and cosine must be swapped */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        /* condition2: lanes where sine changes sign (quadrant XOR input sign) */
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        /* condition3: lanes where cosine changes sign */
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* tanf (not tan): keep the scalar tail in single precision, consistent
         * with the SSE4.1 and generic kernels */
        *bPtr++ = tanf(*aPtr++);
    }
}
502 
503 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
504 
505 #ifdef LV_HAVE_AVX2
506 #include <immintrin.h>
507 
/*
 * Computes the tangent of each element of aVector and writes it to bVector
 * (AVX2 without FMA, unaligned-pointer version; 8 floats per iteration).
 *
 * Identical algorithm to the aligned AVX2 kernel, but uses unaligned
 * loads/stores so the caller need not guarantee 32-byte alignment.
 */
static inline void
volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm256_set1_ps(0.241876e-3); /* low part of pi/4 */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* Series coefficients, evaluated on s = (x/8)^2 below */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0 */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* q = octant index; r = q rounded up to the next even value */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* Two-step argument reduction: s -= r * pi/4 */
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* Undo the /8: three double-angle iterations s' = s*(4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1: lanes where sine and cosine must be swapped */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        /* condition2: lanes where sine changes sign (quadrant XOR input sign) */
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        /* condition3: lanes where cosine changes sign */
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* tanf (not tan): keep the scalar tail in single precision, consistent
         * with the SSE4.1 and generic kernels */
        *bPtr++ = tanf(*aPtr++);
    }
}
613 
614 #endif /* LV_HAVE_AVX2 for unaligned */
615 
616 
617 #ifdef LV_HAVE_SSE4_1
618 #include <smmintrin.h>
619 
/*
 * Computes the tangent of each element of aVector and writes it to bVector
 * (SSE4.1, unaligned-pointer version; processes 4 floats per iteration).
 *
 * Identical algorithm to the aligned SSE4.1 kernel — octant computation,
 * two-constant reduction by multiples of pi/4, polynomial on the argument
 * divided by 8, three double-angle steps, sin/cos reconstruction with
 * quadrant sign fix-up — but uses unaligned loads/stores so the caller
 * need not guarantee 16-byte alignment.
 */
static inline void
volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm_set1_ps(0.78515625);  /* high part of pi/4 */
    pio4B = _mm_set1_ps(0.241876e-3); /* low part of pi/4 */
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    /* Series coefficients, evaluated on s = (x/8)^2 below */
    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        /* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0 */
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        /* q = octant index; r = q rounded up to the next even value */
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        /* Two-step argument reduction: s -= r * pi/4 */
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* Undo the /8: three double-angle iterations s' = s*(4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        /* condition1: lanes where sine and cosine must be swapped */
        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        /* condition2: lanes where sine changes sign (quadrant XOR input sign) */
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        /* condition3: lanes where cosine changes sign */
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_storeu_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the remaining (num_points % 4) elements */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
713 
714 #endif /* LV_HAVE_SSE4_1 for unaligned */
715 
716 
717 #ifdef LV_HAVE_GENERIC
718 
719 static inline void
720 volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
721 {
722  float* bPtr = bVector;
723  const float* aPtr = aVector;
724  unsigned int number = 0;
725 
726  for (; number < num_points; number++) {
727  *bPtr++ = tanf(*aPtr++);
728  }
729 }
730 #endif /* LV_HAVE_GENERIC */
731 
732 
733 #ifdef LV_HAVE_NEON
734 #include <arm_neon.h>
736 
/*
 * NEON implementation: computes the tangent of each element of aVector
 * four lanes at a time via _vtanq_f32(), finishing any remainder with a
 * scalar tanf() loop.
 */
static inline void
volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    const float* inPtr = aVector;
    float* outPtr = bVector;
    const unsigned int quarter_points = num_points / 4;
    unsigned int n;

    for (n = 0; n < quarter_points; n++) {
        float32x4_t in_vec = vld1q_f32(inPtr);
        // Prefetch the next batch; measurably speeds up the loop
        __VOLK_PREFETCH(inPtr + 4);
        vst1q_f32(outPtr, _vtanq_f32(in_vec));
        inPtr += 4;
        outPtr += 4;
    }

    // Scalar tail for the remaining (num_points % 4) elements
    for (n = quarter_points * 4; n < num_points; n++) {
        *outPtr++ = tanf(*inPtr++);
    }
}
764 #endif /* LV_HAVE_NEON */
765 
766 
767 #endif /* INCLUDED_volk_32f_tan_32f_u_H */
static void volk_32f_tan_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tan_32f.h:738
static void volk_32f_tan_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tan_32f.h:720
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vtanq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:274