#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_GENERIC

static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
                                                        const short* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{
    static const int N_UNROLL = 4;

    lv_32fc_t acc0 = 0;
    lv_32fc_t acc1 = 0;
    lv_32fc_t acc2 = 0;
    lv_32fc_t acc3 = 0;

    unsigned i = 0;
    unsigned n = (num_points / N_UNROLL) * N_UNROLL;

    /* Unrolled loop: four independent accumulators hide the FP add latency. */
    for (i = 0; i < n; i += N_UNROLL) {
        acc0 += taps[i + 0] * (float)input[i + 0];
        acc1 += taps[i + 1] * (float)input[i + 1];
        acc2 += taps[i + 2] * (float)input[i + 2];
        acc3 += taps[i + 3] * (float)input[i + 3];
    }

    /* Tail loop for the remaining 0..3 points. */
    for (; i < num_points; i++) {
        acc0 += taps[i] * (float)input[i];
    }

    *result = acc0 + acc1 + acc2 + acc3;
}

#endif /* LV_HAVE_GENERIC */
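/* Illustrative usage sketch (not part of the upstream header): how one might
 * call the generic kernel directly. The buffer sizes and tap values below are
 * assumptions chosen for demonstration only; lv_cmake() comes from
 * volk_complex.h.
 *
 *   short input[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
 *   lv_32fc_t taps[8];
 *   lv_32fc_t result;
 *   for (unsigned k = 0; k < 8; k++)
 *       taps[k] = lv_cmake(0.5f, -0.5f);   // constant complex tap
 *   volk_16i_32fc_dot_prod_32fc_generic(&result, input, taps, 8);
 *   // result now holds the sum over k of taps[k] * (float)input[k]
 */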
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned ii;
    unsigned quarter_points = num_points / 4;
    lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
    short* inputPtr = (short*)input;
    lv_32fc_t accumulator_vec[4];

    float32x4x2_t tapsVal, accumulator_val;
    int16x4_t input16;
    int32x4_t input32;
    float32x4_t input_float, prod_re, prod_im;

    accumulator_val.val[0] = vdupq_n_f32(0.0);
    accumulator_val.val[1] = vdupq_n_f32(0.0);

    for (ii = 0; ii < quarter_points; ++ii) {
        /* De-interleave four complex taps into real/imag lanes. */
        tapsVal = vld2q_f32((float*)tapsPtr);
        input16 = vld1_s16(inputPtr);
        /* Widen int16 -> int32, then convert to float. */
        input32 = vmovl_s16(input16);
        input_float = vcvtq_f32_s32(input32);

        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
        prod_im = vmulq_f32(input_float, tapsVal.val[1]);

        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

        tapsPtr += 4;
        inputPtr += 4;
    }

    /* Horizontal reduction of the four complex partial sums. */
    vst2q_f32((float*)accumulator_vec, accumulator_val);
    accumulator_vec[0] += accumulator_vec[1];
    accumulator_vec[2] += accumulator_vec[3];
    accumulator_vec[0] += accumulator_vec[2];

    /* Tail loop for the remaining 0..3 points. */
    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
    }

    *result = accumulator_vec[0];
}

#endif /* LV_HAVE_NEON */
#if LV_HAVE_SSE && LV_HAVE_MMX

#include <xmmintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 8;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        /* Load eight shorts, convert to float and duplicate each value so it
           lines up with the interleaved real/imag taps. */
        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];

    number = sixteenthPoints * 8;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_SSE && LV_HAVE_MMX */
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                           const short* input,
                                                           const lv_32fc_t* taps,
                                                           unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        /* Load sixteen shorts, widen to int32 and convert to float. */
        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Duplicate each sample so it lines up with the interleaved re/im taps. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
                                                       const short* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        /* Load sixteen shorts, widen to int32 and convert to float. */
        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Duplicate each sample so it lines up with the interleaved re/im taps. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_SSE && LV_HAVE_MMX

#include <xmmintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 8;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        /* Load eight shorts, convert to float and duplicate each value so it
           lines up with the interleaved real/imag taps. */
        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];

    number = sixteenthPoints * 8;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_SSE && LV_HAVE_MMX */
#ifdef LV_HAVE_AVX2

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
                                                       const short* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        /* Load sixteen shorts (aligned), widen to int32 and convert to float. */
        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Duplicate each sample so it lines up with the interleaved re/im taps. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                           const short* input,
                                                           const lv_32fc_t* taps,
                                                           unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        /* Load sixteen shorts (aligned), widen to int32 and convert to float. */
        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Duplicate each sample so it lines up with the interleaved re/im taps. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#endif /* INCLUDED_volk_16i_32fc_dot_prod_32fc_H */