78 #ifndef INCLUDED_volk_32f_asin_32f_a_H
79 #define INCLUDED_volk_32f_asin_32f_a_H
81 #if LV_HAVE_AVX2 && LV_HAVE_FMA
82 #include <immintrin.h>
/*
 * Aligned AVX2 + FMA arcsine kernel.
 *
 * Handles num_points / 8 groups of eight floats with 256-bit aligned
 * loads/stores (_mm256_load_ps / _mm256_store_ps), then finishes the
 * remaining points one at a time with the scalar libm routine.
 *
 * bVector    - output buffer, written through bPtr.
 * num_points - number of floats to process.
 *
 * NOTE(review): aVector is read below but is not declared in the visible
 * parameter list, and several other lines (braces, the declaration of i/j,
 * some assignments) are not visible in this listing. The notes below mark
 * each spot; confirm against the upstream header.
 */
84 static inline void volk_32f_asin_32f_a_avx2_fma(
float* bVector,
86 unsigned int num_points)
88 float* bPtr = bVector;
89 const float* aPtr = aVector;
91 unsigned int number = 0;
/* Number of full 8-float groups handled by the vector loop. */
92 unsigned int eighthPoints = num_points / 8;
95 __m256 aVal, pio2, x, y, z, arcsine;
96 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4 in every lane. */
98 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
99 fzeroes = _mm256_setzero_ps();
100 fones = _mm256_set1_ps(1.0);
101 ftwos = _mm256_set1_ps(2.0);
102 ffours = _mm256_set1_ps(4.0);
104 for (; number < eighthPoints; number++) {
105 aVal = _mm256_load_ps(aPtr);
/* aVal <- a / sqrt((1 + a) * (1 - a)), which equals tan(asin(a)); the rest
 * of the loop body presumably evaluates an arctangent of this value. */
106 aVal = _mm256_div_ps(aVal,
107 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
108 _mm256_sub_ps(fones, aVal))));
/* NOTE(review): z is read here but no assignment to it is visible before
 * this point (presumably z = aVal on a dropped line).
 * Lanes with z < 0 get 2*z subtracted, i.e. z <- |z|. */
110 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
111 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* Lanes with z < 1: the masked term (1/z - z) is added to z, giving ~1/z.
 * NOTE(review): the opening of this call is not visible; the SSE variant
 * in this file assigns the same expression to x via an add. */
112 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
114 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two passes of x <- x + sqrt(x*x + 1). This matches the half-angle identity
 * cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the two passes appear to quarter
 * the angle; the factor 4 applied further down would undo that. */
116 for (
i = 0;
i < 2;
i++) {
117 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
/* x <- 1 / x. */
119 x = _mm256_div_ps(fones, x);
/* Argument list of a fused multiply-add style step: y, x*x and the scalar
 * coefficient (-1)^j / (2j + 1) broadcast to all lanes. The coefficient is
 * computed with pow() at run time.
 * NOTE(review): the call opening, the loop over j and the initialisation of
 * y are not visible in this listing. */
123 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* y <- y * (4 * x). */
126 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/* Lanes with z > 1: add (pi/2 - 2*y), i.e. y <- pi/2 - y. */
127 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
129 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
/* Lanes whose transformed input is negative: subtract 2*arcsine, i.e. negate.
 * NOTE(review): no assignment to arcsine is visible before this use
 * (presumably arcsine = y on a dropped line). */
131 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
132 arcsine = _mm256_sub_ps(arcsine,
133 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop;
 * confirm the pointers are stepped by 8 per iteration upstream. */
135 _mm256_store_ps(bPtr, arcsine);
/* Scalar tail: resume at the first point not covered by a full group. */
140 number = eighthPoints * 8;
141 for (; number < num_points; number++) {
/* asin() is the double-precision libm routine; the result is narrowed to
 * float on store. NOTE(review): the SSE4.1 and generic variants in this
 * file call asinf() here - confirm whether the difference is intended. */
142 *bPtr++ = asin(*aPtr++);
150 #include <immintrin.h>
/*
 * Body of an aligned AVX arcsine kernel (no FMA intrinsics are used here).
 * NOTE(review): the function signature is not visible in this listing;
 * presumably this is volk_32f_asin_32f_a_avx(bVector, aVector, num_points).
 * Braces, the declaration of i/j and some assignments are also not visible;
 * the notes below mark each spot.
 *
 * Handles num_points / 8 groups of eight floats with aligned 256-bit
 * loads/stores, then finishes the remainder with the scalar libm routine.
 */
155 float* bPtr = bVector;
156 const float* aPtr = aVector;
158 unsigned int number = 0;
/* Number of full 8-float groups handled by the vector loop. */
159 unsigned int eighthPoints = num_points / 8;
162 __m256 aVal, pio2, x, y, z, arcsine;
163 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4 in every lane. */
165 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
166 fzeroes = _mm256_setzero_ps();
167 fones = _mm256_set1_ps(1.0);
168 ftwos = _mm256_set1_ps(2.0);
169 ffours = _mm256_set1_ps(4.0);
171 for (; number < eighthPoints; number++) {
172 aVal = _mm256_load_ps(aPtr);
/* aVal <- a / sqrt((1 + a) * (1 - a)), which equals tan(asin(a)). */
173 aVal = _mm256_div_ps(aVal,
174 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
175 _mm256_sub_ps(fones, aVal))));
/* NOTE(review): z is read here but no assignment to it is visible before
 * this point (presumably z = aVal on a dropped line).
 * Lanes with z < 0 get 2*z subtracted, i.e. z <- |z|. */
177 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
178 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* Lanes with z < 1: the masked term (1/z - z) is added to z, giving ~1/z.
 * NOTE(review): the opening of this call is not visible in this listing. */
179 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
181 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Loop of two passes. The visible line is the sqrt(1 + x*x) operand of an
 * enclosing call; NOTE(review): that call's opening (presumably
 * x = x + sqrt(...)) is not visible. */
183 for (
i = 0;
i < 2;
i++) {
185 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
/* x <- 1 / x. */
187 x = _mm256_div_ps(fones, x);
/* One polynomial step: y <- y * x*x + (-1)^j / (2j + 1). The coefficient
 * is computed with pow() at run time and broadcast to all lanes.
 * NOTE(review): the loop over j and the initialisation of y are not
 * visible in this listing. */
190 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
191 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* y <- y * (4 * x). */
194 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/* Lanes with z > 1: the masked term (pi/2 - 2*y) is combined with y,
 * presumably by an add whose opening is not visible, giving pi/2 - y. */
195 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
198 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
/* Lanes whose transformed input is negative: subtract 2*arcsine, i.e. negate.
 * NOTE(review): no assignment to arcsine is visible before this use. */
200 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
201 arcsine = _mm256_sub_ps(arcsine,
202 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop. */
204 _mm256_store_ps(bPtr, arcsine);
/* Scalar tail: resume at the first point not covered by a full group. */
209 number = eighthPoints * 8;
210 for (; number < num_points; number++) {
/* asin() is the double-precision libm routine; the result is narrowed to
 * float on store. NOTE(review): the SSE4.1 and generic variants in this
 * file call asinf() here - confirm whether the difference is intended. */
211 *bPtr++ = asin(*aPtr++);
217 #ifdef LV_HAVE_SSE4_1
218 #include <smmintrin.h>
/*
 * Aligned SSE arcsine kernel.
 *
 * Handles num_points / 4 groups of four floats with 128-bit aligned
 * loads/stores (_mm_load_ps / _mm_store_ps), then finishes the remaining
 * points one at a time with asinf().
 *
 * bVector    - output buffer, written through bPtr.
 * aVector    - input buffer, read through aPtr.
 * num_points - number of floats to process.
 *
 * NOTE(review): the return type / storage class, braces, the declaration of
 * i/j and some assignments are not visible in this listing; the notes below
 * mark each spot.
 */
221 volk_32f_asin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
223 float* bPtr = bVector;
224 const float* aPtr = aVector;
226 unsigned int number = 0;
/* Number of full 4-float groups handled by the vector loop. */
227 unsigned int quarterPoints = num_points / 4;
230 __m128 aVal, pio2, x, y, z, arcsine;
231 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4 in every lane. */
233 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
234 fzeroes = _mm_setzero_ps();
235 fones = _mm_set1_ps(1.0);
236 ftwos = _mm_set1_ps(2.0);
237 ffours = _mm_set1_ps(4.0);
239 for (; number < quarterPoints; number++) {
240 aVal = _mm_load_ps(aPtr);
/* sqrt((1 + aVal) * (1 - aVal)), closing an enclosing call.
 * NOTE(review): the opening of that call is not visible; the AVX variants
 * in this file use this value as the divisor of aVal. */
243 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
/* NOTE(review): z is read here but no assignment to it is visible before
 * this point (presumably z = aVal on a dropped line).
 * Lanes with z < 0 get 2*z subtracted, i.e. z <- |z|. */
245 condition = _mm_cmplt_ps(z, fzeroes);
246 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
/* x <- z + (1/z - z) where z < 1 (i.e. ~1/z), otherwise x <- z. */
247 condition = _mm_cmplt_ps(z, fones);
248 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
/* Two passes of x <- x + sqrt(1 + x*x). This matches the half-angle identity
 * cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the two passes appear to quarter
 * the angle; the factor 4 applied further down would undo that. */
250 for (
i = 0;
i < 2;
i++) {
251 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
/* x <- 1 / x. */
253 x = _mm_div_ps(fones, x);
/* One polynomial step: y <- y * x*x + (-1)^j / (2j + 1). The coefficient
 * is computed with pow() at run time and broadcast to all lanes.
 * NOTE(review): the loop over j and the initialisation of y are not
 * visible in this listing. */
256 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
257 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
/* y <- y * (4 * x). */
260 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
/* Lanes with z > 1: add (pi/2 - 2*y), i.e. y <- pi/2 - y. */
261 condition = _mm_cmpgt_ps(z, fones);
263 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
/* Lanes with aVal < 0: subtract 2*arcsine, i.e. negate.
 * NOTE(review): no assignment to arcsine is visible before this use
 * (presumably arcsine = y on a dropped line). */
265 condition = _mm_cmplt_ps(aVal, fzeroes);
266 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop. */
268 _mm_store_ps(bPtr, arcsine);
/* Scalar tail: resume at the first point not covered by a full group. */
273 number = quarterPoints * 4;
274 for (; number < num_points; number++) {
275 *bPtr++ = asinf(*aPtr++);
283 #ifndef INCLUDED_volk_32f_asin_32f_u_H
284 #define INCLUDED_volk_32f_asin_32f_u_H
286 #if LV_HAVE_AVX2 && LV_HAVE_FMA
287 #include <immintrin.h>
/*
 * Unaligned AVX2 + FMA arcsine kernel.
 *
 * Handles num_points / 8 groups of eight floats with 256-bit unaligned
 * loads/stores (_mm256_loadu_ps / _mm256_storeu_ps), then finishes the
 * remaining points one at a time with the scalar libm routine.
 *
 * bVector    - output buffer, written through bPtr.
 * aVector    - input buffer, read through aPtr.
 * num_points - number of floats to process.
 *
 * NOTE(review): braces, the declaration of i/j and some assignments are not
 * visible in this listing; the notes below mark each spot.
 */
289 static inline void volk_32f_asin_32f_u_avx2_fma(
float* bVector,
290 const float* aVector,
291 unsigned int num_points)
293 float* bPtr = bVector;
294 const float* aPtr = aVector;
296 unsigned int number = 0;
/* Number of full 8-float groups handled by the vector loop. */
297 unsigned int eighthPoints = num_points / 8;
300 __m256 aVal, pio2, x, y, z, arcsine;
301 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4 in every lane. */
303 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
304 fzeroes = _mm256_setzero_ps();
305 fones = _mm256_set1_ps(1.0);
306 ftwos = _mm256_set1_ps(2.0);
307 ffours = _mm256_set1_ps(4.0);
309 for (; number < eighthPoints; number++) {
310 aVal = _mm256_loadu_ps(aPtr);
/* aVal <- a / sqrt((1 + a) * (1 - a)), which equals tan(asin(a)). */
311 aVal = _mm256_div_ps(aVal,
312 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
313 _mm256_sub_ps(fones, aVal))));
/* NOTE(review): z is read here but no assignment to it is visible before
 * this point (presumably z = aVal on a dropped line).
 * Lanes with z < 0 get 2*z subtracted, i.e. z <- |z|. */
315 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
316 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* Lanes with z < 1: the masked term (1/z - z) is added to z, giving ~1/z.
 * NOTE(review): the opening of this call is not visible in this listing. */
317 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
319 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two passes of x <- x + sqrt(x*x + 1). This matches the half-angle identity
 * cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the two passes appear to quarter
 * the angle; the factor 4 applied further down would undo that. */
321 for (
i = 0;
i < 2;
i++) {
322 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
/* x <- 1 / x. */
324 x = _mm256_div_ps(fones, x);
/* Argument list of a fused multiply-add style step: y, x*x and the scalar
 * coefficient (-1)^j / (2j + 1) broadcast to all lanes. The coefficient is
 * computed with pow() at run time.
 * NOTE(review): the call opening, the loop over j and the initialisation of
 * y are not visible in this listing. */
328 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* y <- y * (4 * x). */
331 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/* Lanes with z > 1: add (pi/2 - 2*y), i.e. y <- pi/2 - y. */
332 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
334 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
/* Lanes whose transformed input is negative: subtract 2*arcsine, i.e. negate.
 * NOTE(review): no assignment to arcsine is visible before this use
 * (presumably arcsine = y on a dropped line). */
336 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
337 arcsine = _mm256_sub_ps(arcsine,
338 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop. */
340 _mm256_storeu_ps(bPtr, arcsine);
/* Scalar tail: resume at the first point not covered by a full group. */
345 number = eighthPoints * 8;
346 for (; number < num_points; number++) {
/* asin() is the double-precision libm routine; the result is narrowed to
 * float on store. NOTE(review): the SSE4.1 and generic variants in this
 * file call asinf() here - confirm whether the difference is intended. */
347 *bPtr++ = asin(*aPtr++);
355 #include <immintrin.h>
/*
 * Body of an unaligned AVX arcsine kernel (no FMA intrinsics are used here).
 * NOTE(review): the function signature is not visible in this listing;
 * presumably this is volk_32f_asin_32f_u_avx(bVector, aVector, num_points).
 * Braces, the declaration of i/j and some assignments are also not visible;
 * the notes below mark each spot.
 *
 * Handles num_points / 8 groups of eight floats with unaligned 256-bit
 * loads/stores, then finishes the remainder with the scalar libm routine.
 */
360 float* bPtr = bVector;
361 const float* aPtr = aVector;
363 unsigned int number = 0;
/* Number of full 8-float groups handled by the vector loop. */
364 unsigned int eighthPoints = num_points / 8;
367 __m256 aVal, pio2, x, y, z, arcsine;
368 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4 in every lane. */
370 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
371 fzeroes = _mm256_setzero_ps();
372 fones = _mm256_set1_ps(1.0);
373 ftwos = _mm256_set1_ps(2.0);
374 ffours = _mm256_set1_ps(4.0);
376 for (; number < eighthPoints; number++) {
377 aVal = _mm256_loadu_ps(aPtr);
/* aVal <- a / sqrt((1 + a) * (1 - a)), which equals tan(asin(a)). */
378 aVal = _mm256_div_ps(aVal,
379 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
380 _mm256_sub_ps(fones, aVal))));
/* NOTE(review): z is read here but no assignment to it is visible before
 * this point (presumably z = aVal on a dropped line).
 * Lanes with z < 0 get 2*z subtracted, i.e. z <- |z|. */
382 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
383 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* Lanes with z < 1: the masked term (1/z - z) is added to z, giving ~1/z.
 * NOTE(review): the opening of this call is not visible in this listing. */
384 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
386 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Loop of two passes. The visible line is the sqrt(1 + x*x) operand of an
 * enclosing call; NOTE(review): that call's opening (presumably
 * x = x + sqrt(...)) is not visible. */
388 for (
i = 0;
i < 2;
i++) {
390 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
/* x <- 1 / x. */
392 x = _mm256_div_ps(fones, x);
/* One polynomial step: y <- y * x*x + (-1)^j / (2j + 1). The coefficient
 * is computed with pow() at run time and broadcast to all lanes.
 * NOTE(review): the loop over j and the initialisation of y are not
 * visible in this listing. */
395 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
396 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* y <- y * (4 * x). */
399 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/* Lanes with z > 1: the masked term (pi/2 - 2*y) is combined with y,
 * presumably by an add whose opening is not visible, giving pi/2 - y. */
400 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
403 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
/* Lanes whose transformed input is negative: subtract 2*arcsine, i.e. negate.
 * NOTE(review): no assignment to arcsine is visible before this use. */
405 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
406 arcsine = _mm256_sub_ps(arcsine,
407 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop. */
409 _mm256_storeu_ps(bPtr, arcsine);
/* Scalar tail: resume at the first point not covered by a full group. */
414 number = eighthPoints * 8;
415 for (; number < num_points; number++) {
/* asin() is the double-precision libm routine; the result is narrowed to
 * float on store. NOTE(review): the SSE4.1 and generic variants in this
 * file call asinf() here - confirm whether the difference is intended. */
416 *bPtr++ = asin(*aPtr++);
423 #ifdef LV_HAVE_SSE4_1
424 #include <smmintrin.h>
/*
 * Unaligned SSE arcsine kernel.
 *
 * Handles num_points / 4 groups of four floats with 128-bit unaligned
 * loads/stores (_mm_loadu_ps / _mm_storeu_ps), then finishes the remaining
 * points one at a time with asinf().
 *
 * bVector    - output buffer, written through bPtr.
 * aVector    - input buffer, read through aPtr.
 * num_points - number of floats to process.
 *
 * NOTE(review): the return type / storage class, braces, the declaration of
 * i/j and some assignments are not visible in this listing; the notes below
 * mark each spot.
 */
427 volk_32f_asin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
429 float* bPtr = bVector;
430 const float* aPtr = aVector;
432 unsigned int number = 0;
/* Number of full 4-float groups handled by the vector loop. */
433 unsigned int quarterPoints = num_points / 4;
436 __m128 aVal, pio2, x, y, z, arcsine;
437 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4 in every lane. */
439 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
440 fzeroes = _mm_setzero_ps();
441 fones = _mm_set1_ps(1.0);
442 ftwos = _mm_set1_ps(2.0);
443 ffours = _mm_set1_ps(4.0);
445 for (; number < quarterPoints; number++) {
446 aVal = _mm_loadu_ps(aPtr);
/* sqrt((1 + aVal) * (1 - aVal)), closing an enclosing call.
 * NOTE(review): the opening of that call is not visible; the AVX variants
 * in this file use this value as the divisor of aVal. */
449 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
/* NOTE(review): z is read here but no assignment to it is visible before
 * this point (presumably z = aVal on a dropped line).
 * Lanes with z < 0 get 2*z subtracted, i.e. z <- |z|. */
451 condition = _mm_cmplt_ps(z, fzeroes);
452 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
/* x <- z + (1/z - z) where z < 1 (i.e. ~1/z), otherwise x <- z. */
453 condition = _mm_cmplt_ps(z, fones);
454 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
/* Two passes of x <- x + sqrt(1 + x*x). This matches the half-angle identity
 * cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the two passes appear to quarter
 * the angle; the factor 4 applied further down would undo that. */
456 for (
i = 0;
i < 2;
i++) {
457 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
/* x <- 1 / x. */
459 x = _mm_div_ps(fones, x);
/* One polynomial step: y <- y * x*x + (-1)^j / (2j + 1). The coefficient
 * is computed with pow() at run time and broadcast to all lanes.
 * NOTE(review): the loop over j and the initialisation of y are not
 * visible in this listing. */
462 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
463 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
/* y <- y * (4 * x). */
466 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
/* Lanes with z > 1: add (pi/2 - 2*y), i.e. y <- pi/2 - y. */
467 condition = _mm_cmpgt_ps(z, fones);
469 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
/* Lanes with aVal < 0: subtract 2*arcsine, i.e. negate.
 * NOTE(review): no assignment to arcsine is visible before this use
 * (presumably arcsine = y on a dropped line). */
471 condition = _mm_cmplt_ps(aVal, fzeroes);
472 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop. */
474 _mm_storeu_ps(bPtr, arcsine);
/* Scalar tail: resume at the first point not covered by a full group. */
479 number = quarterPoints * 4;
480 for (; number < num_points; number++) {
481 *bPtr++ = asinf(*aPtr++);
487 #ifdef LV_HAVE_GENERIC
/*
 * Body of the portable scalar kernel: writes asinf() of each of the
 * num_points input floats to the output buffer, in order.
 * NOTE(review): the function signature and braces are not visible in this
 * listing; presumably this is
 * volk_32f_asin_32f_u_generic(bVector, aVector, num_points).
 */
492 float* bPtr = bVector;
493 const float* aPtr = aVector;
/* The initial value is overwritten by the loop initialiser below. */
494 unsigned int number = 0;
496 for (number = 0; number < num_points; number++) {
497 *bPtr++ = asinf(*aPtr++);
#define ASIN_TERMS
Definition: volk_32f_asin_32f.h:76
static void volk_32f_asin_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:358
static void volk_32f_asin_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:153
static void volk_32f_asin_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:490
for i
Definition: volk_config_fixed.tmpl.h:25