Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
72 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
73 
74 #include <stdio.h>
75 #include <volk/volk_common.h>
76 
77 
78 #ifdef LV_HAVE_GENERIC
79 
80 
/*
 * Scalar reference dot product.
 *
 * Accumulates input[i] * taps[i] for i in [0, num_points) in a single float
 * and writes the total to *result. When num_points is 0 the stored value
 * is 0.
 *
 * result      - receives the single float sum
 * input, taps - the two vectors being multiplied element-wise
 * num_points  - number of elements read from each vector
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float acc = 0;
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        acc += input[i] * taps[i];
    }

    *result = acc;
}
98 
99 #endif /*LV_HAVE_GENERIC*/
100 
101 
102 #ifdef LV_HAVE_SSE
103 
104 
105 static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
106  const float* input,
107  const float* taps,
108  unsigned int num_points)
109 {
110 
111  unsigned int number = 0;
112  const unsigned int sixteenthPoints = num_points / 16;
113 
114  float dotProduct = 0;
115  const float* aPtr = input;
116  const float* bPtr = taps;
117 
118  __m128 a0Val, a1Val, a2Val, a3Val;
119  __m128 b0Val, b1Val, b2Val, b3Val;
120  __m128 c0Val, c1Val, c2Val, c3Val;
121 
122  __m128 dotProdVal0 = _mm_setzero_ps();
123  __m128 dotProdVal1 = _mm_setzero_ps();
124  __m128 dotProdVal2 = _mm_setzero_ps();
125  __m128 dotProdVal3 = _mm_setzero_ps();
126 
127  for (; number < sixteenthPoints; number++) {
128 
129  a0Val = _mm_loadu_ps(aPtr);
130  a1Val = _mm_loadu_ps(aPtr + 4);
131  a2Val = _mm_loadu_ps(aPtr + 8);
132  a3Val = _mm_loadu_ps(aPtr + 12);
133  b0Val = _mm_loadu_ps(bPtr);
134  b1Val = _mm_loadu_ps(bPtr + 4);
135  b2Val = _mm_loadu_ps(bPtr + 8);
136  b3Val = _mm_loadu_ps(bPtr + 12);
137 
138  c0Val = _mm_mul_ps(a0Val, b0Val);
139  c1Val = _mm_mul_ps(a1Val, b1Val);
140  c2Val = _mm_mul_ps(a2Val, b2Val);
141  c3Val = _mm_mul_ps(a3Val, b3Val);
142 
143  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
144  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
145  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
146  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
147 
148  aPtr += 16;
149  bPtr += 16;
150  }
151 
152  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
153  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
154  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
155 
156  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
157 
158  _mm_store_ps(dotProductVector,
159  dotProdVal0); // Store the results back into the dot product vector
160 
161  dotProduct = dotProductVector[0];
162  dotProduct += dotProductVector[1];
163  dotProduct += dotProductVector[2];
164  dotProduct += dotProductVector[3];
165 
166  number = sixteenthPoints * 16;
167  for (; number < num_points; number++) {
168  dotProduct += ((*aPtr++) * (*bPtr++));
169  }
170 
171  *result = dotProduct;
172 }
173 
174 #endif /*LV_HAVE_SSE*/
175 
176 #ifdef LV_HAVE_SSE3
177 
178 #include <pmmintrin.h>
179 
180 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
181  const float* input,
182  const float* taps,
183  unsigned int num_points)
184 {
185  unsigned int number = 0;
186  const unsigned int sixteenthPoints = num_points / 16;
187 
188  float dotProduct = 0;
189  const float* aPtr = input;
190  const float* bPtr = taps;
191 
192  __m128 a0Val, a1Val, a2Val, a3Val;
193  __m128 b0Val, b1Val, b2Val, b3Val;
194  __m128 c0Val, c1Val, c2Val, c3Val;
195 
196  __m128 dotProdVal0 = _mm_setzero_ps();
197  __m128 dotProdVal1 = _mm_setzero_ps();
198  __m128 dotProdVal2 = _mm_setzero_ps();
199  __m128 dotProdVal3 = _mm_setzero_ps();
200 
201  for (; number < sixteenthPoints; number++) {
202 
203  a0Val = _mm_loadu_ps(aPtr);
204  a1Val = _mm_loadu_ps(aPtr + 4);
205  a2Val = _mm_loadu_ps(aPtr + 8);
206  a3Val = _mm_loadu_ps(aPtr + 12);
207  b0Val = _mm_loadu_ps(bPtr);
208  b1Val = _mm_loadu_ps(bPtr + 4);
209  b2Val = _mm_loadu_ps(bPtr + 8);
210  b3Val = _mm_loadu_ps(bPtr + 12);
211 
212  c0Val = _mm_mul_ps(a0Val, b0Val);
213  c1Val = _mm_mul_ps(a1Val, b1Val);
214  c2Val = _mm_mul_ps(a2Val, b2Val);
215  c3Val = _mm_mul_ps(a3Val, b3Val);
216 
217  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
218  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
219  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
220  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
221 
222  aPtr += 16;
223  bPtr += 16;
224  }
225 
226  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
227  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
228  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
229 
230  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
231  _mm_store_ps(dotProductVector,
232  dotProdVal0); // Store the results back into the dot product vector
233 
234  dotProduct = dotProductVector[0];
235  dotProduct += dotProductVector[1];
236  dotProduct += dotProductVector[2];
237  dotProduct += dotProductVector[3];
238 
239  number = sixteenthPoints * 16;
240  for (; number < num_points; number++) {
241  dotProduct += ((*aPtr++) * (*bPtr++));
242  }
243 
244  *result = dotProduct;
245 }
246 
247 #endif /*LV_HAVE_SSE3*/
248 
249 #ifdef LV_HAVE_SSE4_1
250 
251 #include <smmintrin.h>
252 
253 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
254  const float* input,
255  const float* taps,
256  unsigned int num_points)
257 {
258  unsigned int number = 0;
259  const unsigned int sixteenthPoints = num_points / 16;
260 
261  float dotProduct = 0;
262  const float* aPtr = input;
263  const float* bPtr = taps;
264 
265  __m128 aVal1, bVal1, cVal1;
266  __m128 aVal2, bVal2, cVal2;
267  __m128 aVal3, bVal3, cVal3;
268  __m128 aVal4, bVal4, cVal4;
269 
270  __m128 dotProdVal = _mm_setzero_ps();
271 
272  for (; number < sixteenthPoints; number++) {
273 
274  aVal1 = _mm_loadu_ps(aPtr);
275  aPtr += 4;
276  aVal2 = _mm_loadu_ps(aPtr);
277  aPtr += 4;
278  aVal3 = _mm_loadu_ps(aPtr);
279  aPtr += 4;
280  aVal4 = _mm_loadu_ps(aPtr);
281  aPtr += 4;
282 
283  bVal1 = _mm_loadu_ps(bPtr);
284  bPtr += 4;
285  bVal2 = _mm_loadu_ps(bPtr);
286  bPtr += 4;
287  bVal3 = _mm_loadu_ps(bPtr);
288  bPtr += 4;
289  bVal4 = _mm_loadu_ps(bPtr);
290  bPtr += 4;
291 
292  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
293  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
294  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
295  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
296 
297  cVal1 = _mm_or_ps(cVal1, cVal2);
298  cVal3 = _mm_or_ps(cVal3, cVal4);
299  cVal1 = _mm_or_ps(cVal1, cVal3);
300 
301  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
302  }
303 
304  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
305  _mm_store_ps(dotProductVector,
306  dotProdVal); // Store the results back into the dot product vector
307 
308  dotProduct = dotProductVector[0];
309  dotProduct += dotProductVector[1];
310  dotProduct += dotProductVector[2];
311  dotProduct += dotProductVector[3];
312 
313  number = sixteenthPoints * 16;
314  for (; number < num_points; number++) {
315  dotProduct += ((*aPtr++) * (*bPtr++));
316  }
317 
318  *result = dotProduct;
319 }
320 
321 #endif /*LV_HAVE_SSE4_1*/
322 
323 #ifdef LV_HAVE_AVX
324 
325 #include <immintrin.h>
326 
327 static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
328  const float* input,
329  const float* taps,
330  unsigned int num_points)
331 {
332 
333  unsigned int number = 0;
334  const unsigned int sixteenthPoints = num_points / 16;
335 
336  float dotProduct = 0;
337  const float* aPtr = input;
338  const float* bPtr = taps;
339 
340  __m256 a0Val, a1Val;
341  __m256 b0Val, b1Val;
342  __m256 c0Val, c1Val;
343 
344  __m256 dotProdVal0 = _mm256_setzero_ps();
345  __m256 dotProdVal1 = _mm256_setzero_ps();
346 
347  for (; number < sixteenthPoints; number++) {
348 
349  a0Val = _mm256_loadu_ps(aPtr);
350  a1Val = _mm256_loadu_ps(aPtr + 8);
351  b0Val = _mm256_loadu_ps(bPtr);
352  b1Val = _mm256_loadu_ps(bPtr + 8);
353 
354  c0Val = _mm256_mul_ps(a0Val, b0Val);
355  c1Val = _mm256_mul_ps(a1Val, b1Val);
356 
357  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
358  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
359 
360  aPtr += 16;
361  bPtr += 16;
362  }
363 
364  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
365 
366  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
367 
368  _mm256_storeu_ps(dotProductVector,
369  dotProdVal0); // Store the results back into the dot product vector
370 
371  dotProduct = dotProductVector[0];
372  dotProduct += dotProductVector[1];
373  dotProduct += dotProductVector[2];
374  dotProduct += dotProductVector[3];
375  dotProduct += dotProductVector[4];
376  dotProduct += dotProductVector[5];
377  dotProduct += dotProductVector[6];
378  dotProduct += dotProductVector[7];
379 
380  number = sixteenthPoints * 16;
381  for (; number < num_points; number++) {
382  dotProduct += ((*aPtr++) * (*bPtr++));
383  }
384 
385  *result = dotProduct;
386 }
387 
388 #endif /*LV_HAVE_AVX*/
389 
390 #if LV_HAVE_AVX2 && LV_HAVE_FMA
391 #include <immintrin.h>
392 static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
393  const float* input,
394  const float* taps,
395  unsigned int num_points)
396 {
397  unsigned int number;
398  const unsigned int eighthPoints = num_points / 8;
399 
400  const float* aPtr = input;
401  const float* bPtr = taps;
402 
403  __m256 dotProdVal = _mm256_setzero_ps();
404  __m256 aVal1, bVal1;
405 
406  for (number = 0; number < eighthPoints; number++) {
407 
408  aVal1 = _mm256_loadu_ps(aPtr);
409  bVal1 = _mm256_loadu_ps(bPtr);
410  aPtr += 8;
411  bPtr += 8;
412 
413  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
414  }
415 
416  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
417  _mm256_storeu_ps(dotProductVector,
418  dotProdVal); // Store the results back into the dot product vector
419  _mm256_zeroupper();
420 
421  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
422  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
423  dotProductVector[6] + dotProductVector[7];
424 
425  for (number = eighthPoints * 8; number < num_points; number++) {
426  dotProduct += ((*aPtr++) * (*bPtr++));
427  }
428 
429  *result = dotProduct;
430 }
431 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
432 
433 #if LV_HAVE_AVX512F
434 #include <immintrin.h>
435 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
436  const float* input,
437  const float* taps,
438  unsigned int num_points)
439 {
440  unsigned int number;
441  const unsigned int sixteenthPoints = num_points / 16;
442 
443  const float* aPtr = input;
444  const float* bPtr = taps;
445 
446  __m512 dotProdVal = _mm512_setzero_ps();
447  __m512 aVal1, bVal1;
448 
449  for (number = 0; number < sixteenthPoints; number++) {
450 
451  aVal1 = _mm512_loadu_ps(aPtr);
452  bVal1 = _mm512_loadu_ps(bPtr);
453  aPtr += 16;
454  bPtr += 16;
455 
456  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
457  }
458 
459  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
460  _mm512_storeu_ps(dotProductVector,
461  dotProdVal); // Store the results back into the dot product vector
462 
463  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
464  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
465  dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
466  dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
467  dotProductVector[12] + dotProductVector[13] +
468  dotProductVector[14] + dotProductVector[15];
469 
470  for (number = sixteenthPoints * 16; number < num_points; number++) {
471  dotProduct += ((*aPtr++) * (*bPtr++));
472  }
473 
474  *result = dotProduct;
475 }
476 #endif /* LV_HAVE_AVX512F */
477 
478 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
479 
480 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
481 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
482 
483 #include <stdio.h>
484 #include <volk/volk_common.h>
485 
486 
487 #ifdef LV_HAVE_GENERIC
488 
489 
/*
 * Scalar dot product declared in the aligned-kernel section of this header.
 *
 * Accumulates input[i] * taps[i] for i in [0, num_points) in a single float
 * and writes the total to *result. The code itself uses only plain scalar
 * reads. When num_points is 0 the stored value is 0.
 *
 * result      - receives the single float sum
 * input, taps - the two vectors being multiplied element-wise
 * num_points  - number of elements read from each vector
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
                                                      const float* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
    float acc = 0;
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        acc += input[i] * taps[i];
    }

    *result = acc;
}
507 
508 #endif /*LV_HAVE_GENERIC*/
509 
510 
511 #ifdef LV_HAVE_SSE
512 
513 
514 static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
515  const float* input,
516  const float* taps,
517  unsigned int num_points)
518 {
519 
520  unsigned int number = 0;
521  const unsigned int sixteenthPoints = num_points / 16;
522 
523  float dotProduct = 0;
524  const float* aPtr = input;
525  const float* bPtr = taps;
526 
527  __m128 a0Val, a1Val, a2Val, a3Val;
528  __m128 b0Val, b1Val, b2Val, b3Val;
529  __m128 c0Val, c1Val, c2Val, c3Val;
530 
531  __m128 dotProdVal0 = _mm_setzero_ps();
532  __m128 dotProdVal1 = _mm_setzero_ps();
533  __m128 dotProdVal2 = _mm_setzero_ps();
534  __m128 dotProdVal3 = _mm_setzero_ps();
535 
536  for (; number < sixteenthPoints; number++) {
537 
538  a0Val = _mm_load_ps(aPtr);
539  a1Val = _mm_load_ps(aPtr + 4);
540  a2Val = _mm_load_ps(aPtr + 8);
541  a3Val = _mm_load_ps(aPtr + 12);
542  b0Val = _mm_load_ps(bPtr);
543  b1Val = _mm_load_ps(bPtr + 4);
544  b2Val = _mm_load_ps(bPtr + 8);
545  b3Val = _mm_load_ps(bPtr + 12);
546 
547  c0Val = _mm_mul_ps(a0Val, b0Val);
548  c1Val = _mm_mul_ps(a1Val, b1Val);
549  c2Val = _mm_mul_ps(a2Val, b2Val);
550  c3Val = _mm_mul_ps(a3Val, b3Val);
551 
552  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
553  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
554  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
555  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
556 
557  aPtr += 16;
558  bPtr += 16;
559  }
560 
561  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
562  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
563  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
564 
565  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
566 
567  _mm_store_ps(dotProductVector,
568  dotProdVal0); // Store the results back into the dot product vector
569 
570  dotProduct = dotProductVector[0];
571  dotProduct += dotProductVector[1];
572  dotProduct += dotProductVector[2];
573  dotProduct += dotProductVector[3];
574 
575  number = sixteenthPoints * 16;
576  for (; number < num_points; number++) {
577  dotProduct += ((*aPtr++) * (*bPtr++));
578  }
579 
580  *result = dotProduct;
581 }
582 
583 #endif /*LV_HAVE_SSE*/
584 
585 #ifdef LV_HAVE_SSE3
586 
587 #include <pmmintrin.h>
588 
589 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
590  const float* input,
591  const float* taps,
592  unsigned int num_points)
593 {
594  unsigned int number = 0;
595  const unsigned int sixteenthPoints = num_points / 16;
596 
597  float dotProduct = 0;
598  const float* aPtr = input;
599  const float* bPtr = taps;
600 
601  __m128 a0Val, a1Val, a2Val, a3Val;
602  __m128 b0Val, b1Val, b2Val, b3Val;
603  __m128 c0Val, c1Val, c2Val, c3Val;
604 
605  __m128 dotProdVal0 = _mm_setzero_ps();
606  __m128 dotProdVal1 = _mm_setzero_ps();
607  __m128 dotProdVal2 = _mm_setzero_ps();
608  __m128 dotProdVal3 = _mm_setzero_ps();
609 
610  for (; number < sixteenthPoints; number++) {
611 
612  a0Val = _mm_load_ps(aPtr);
613  a1Val = _mm_load_ps(aPtr + 4);
614  a2Val = _mm_load_ps(aPtr + 8);
615  a3Val = _mm_load_ps(aPtr + 12);
616  b0Val = _mm_load_ps(bPtr);
617  b1Val = _mm_load_ps(bPtr + 4);
618  b2Val = _mm_load_ps(bPtr + 8);
619  b3Val = _mm_load_ps(bPtr + 12);
620 
621  c0Val = _mm_mul_ps(a0Val, b0Val);
622  c1Val = _mm_mul_ps(a1Val, b1Val);
623  c2Val = _mm_mul_ps(a2Val, b2Val);
624  c3Val = _mm_mul_ps(a3Val, b3Val);
625 
626  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
627  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
628  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
629  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
630 
631  aPtr += 16;
632  bPtr += 16;
633  }
634 
635  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
636  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
637  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
638 
639  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
640  _mm_store_ps(dotProductVector,
641  dotProdVal0); // Store the results back into the dot product vector
642 
643  dotProduct = dotProductVector[0];
644  dotProduct += dotProductVector[1];
645  dotProduct += dotProductVector[2];
646  dotProduct += dotProductVector[3];
647 
648  number = sixteenthPoints * 16;
649  for (; number < num_points; number++) {
650  dotProduct += ((*aPtr++) * (*bPtr++));
651  }
652 
653  *result = dotProduct;
654 }
655 
656 #endif /*LV_HAVE_SSE3*/
657 
658 #ifdef LV_HAVE_SSE4_1
659 
660 #include <smmintrin.h>
661 
662 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
663  const float* input,
664  const float* taps,
665  unsigned int num_points)
666 {
667  unsigned int number = 0;
668  const unsigned int sixteenthPoints = num_points / 16;
669 
670  float dotProduct = 0;
671  const float* aPtr = input;
672  const float* bPtr = taps;
673 
674  __m128 aVal1, bVal1, cVal1;
675  __m128 aVal2, bVal2, cVal2;
676  __m128 aVal3, bVal3, cVal3;
677  __m128 aVal4, bVal4, cVal4;
678 
679  __m128 dotProdVal = _mm_setzero_ps();
680 
681  for (; number < sixteenthPoints; number++) {
682 
683  aVal1 = _mm_load_ps(aPtr);
684  aPtr += 4;
685  aVal2 = _mm_load_ps(aPtr);
686  aPtr += 4;
687  aVal3 = _mm_load_ps(aPtr);
688  aPtr += 4;
689  aVal4 = _mm_load_ps(aPtr);
690  aPtr += 4;
691 
692  bVal1 = _mm_load_ps(bPtr);
693  bPtr += 4;
694  bVal2 = _mm_load_ps(bPtr);
695  bPtr += 4;
696  bVal3 = _mm_load_ps(bPtr);
697  bPtr += 4;
698  bVal4 = _mm_load_ps(bPtr);
699  bPtr += 4;
700 
701  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
702  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
703  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
704  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
705 
706  cVal1 = _mm_or_ps(cVal1, cVal2);
707  cVal3 = _mm_or_ps(cVal3, cVal4);
708  cVal1 = _mm_or_ps(cVal1, cVal3);
709 
710  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
711  }
712 
713  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
714  _mm_store_ps(dotProductVector,
715  dotProdVal); // Store the results back into the dot product vector
716 
717  dotProduct = dotProductVector[0];
718  dotProduct += dotProductVector[1];
719  dotProduct += dotProductVector[2];
720  dotProduct += dotProductVector[3];
721 
722  number = sixteenthPoints * 16;
723  for (; number < num_points; number++) {
724  dotProduct += ((*aPtr++) * (*bPtr++));
725  }
726 
727  *result = dotProduct;
728 }
729 
730 #endif /*LV_HAVE_SSE4_1*/
731 
732 #ifdef LV_HAVE_AVX
733 
734 #include <immintrin.h>
735 
736 static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
737  const float* input,
738  const float* taps,
739  unsigned int num_points)
740 {
741 
742  unsigned int number = 0;
743  const unsigned int sixteenthPoints = num_points / 16;
744 
745  float dotProduct = 0;
746  const float* aPtr = input;
747  const float* bPtr = taps;
748 
749  __m256 a0Val, a1Val;
750  __m256 b0Val, b1Val;
751  __m256 c0Val, c1Val;
752 
753  __m256 dotProdVal0 = _mm256_setzero_ps();
754  __m256 dotProdVal1 = _mm256_setzero_ps();
755 
756  for (; number < sixteenthPoints; number++) {
757 
758  a0Val = _mm256_load_ps(aPtr);
759  a1Val = _mm256_load_ps(aPtr + 8);
760  b0Val = _mm256_load_ps(bPtr);
761  b1Val = _mm256_load_ps(bPtr + 8);
762 
763  c0Val = _mm256_mul_ps(a0Val, b0Val);
764  c1Val = _mm256_mul_ps(a1Val, b1Val);
765 
766  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
767  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
768 
769  aPtr += 16;
770  bPtr += 16;
771  }
772 
773  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
774 
775  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
776 
777  _mm256_store_ps(dotProductVector,
778  dotProdVal0); // Store the results back into the dot product vector
779 
780  dotProduct = dotProductVector[0];
781  dotProduct += dotProductVector[1];
782  dotProduct += dotProductVector[2];
783  dotProduct += dotProductVector[3];
784  dotProduct += dotProductVector[4];
785  dotProduct += dotProductVector[5];
786  dotProduct += dotProductVector[6];
787  dotProduct += dotProductVector[7];
788 
789  number = sixteenthPoints * 16;
790  for (; number < num_points; number++) {
791  dotProduct += ((*aPtr++) * (*bPtr++));
792  }
793 
794  *result = dotProduct;
795 }
796 #endif /*LV_HAVE_AVX*/
797 
798 
799 #if LV_HAVE_AVX2 && LV_HAVE_FMA
800 #include <immintrin.h>
801 static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
802  const float* input,
803  const float* taps,
804  unsigned int num_points)
805 {
806  unsigned int number;
807  const unsigned int eighthPoints = num_points / 8;
808 
809  const float* aPtr = input;
810  const float* bPtr = taps;
811 
812  __m256 dotProdVal = _mm256_setzero_ps();
813  __m256 aVal1, bVal1;
814 
815  for (number = 0; number < eighthPoints; number++) {
816 
817  aVal1 = _mm256_load_ps(aPtr);
818  bVal1 = _mm256_load_ps(bPtr);
819  aPtr += 8;
820  bPtr += 8;
821 
822  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
823  }
824 
825  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
826  _mm256_store_ps(dotProductVector,
827  dotProdVal); // Store the results back into the dot product vector
828  _mm256_zeroupper();
829 
830  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
831  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
832  dotProductVector[6] + dotProductVector[7];
833 
834  for (number = eighthPoints * 8; number < num_points; number++) {
835  dotProduct += ((*aPtr++) * (*bPtr++));
836  }
837 
838  *result = dotProduct;
839 }
840 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
841 
842 #if LV_HAVE_AVX512F
843 #include <immintrin.h>
844 static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
845  const float* input,
846  const float* taps,
847  unsigned int num_points)
848 {
849  unsigned int number;
850  const unsigned int sixteenthPoints = num_points / 16;
851 
852  const float* aPtr = input;
853  const float* bPtr = taps;
854 
855  __m512 dotProdVal = _mm512_setzero_ps();
856  __m512 aVal1, bVal1;
857 
858  for (number = 0; number < sixteenthPoints; number++) {
859 
860  aVal1 = _mm512_load_ps(aPtr);
861  bVal1 = _mm512_load_ps(bPtr);
862  aPtr += 16;
863  bPtr += 16;
864 
865  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
866  }
867 
868  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
869  _mm512_store_ps(dotProductVector,
870  dotProdVal); // Store the results back into the dot product vector
871 
872  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
873  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
874  dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
875  dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
876  dotProductVector[12] + dotProductVector[13] +
877  dotProductVector[14] + dotProductVector[15];
878 
879  for (number = sixteenthPoints * 16; number < num_points; number++) {
880  dotProduct += ((*aPtr++) * (*bPtr++));
881  }
882 
883  *result = dotProduct;
884 }
885 #endif /* LV_HAVE_AVX512F */
886 
887 #ifdef LV_HAVE_NEON
888 #include <arm_neon.h>
889 
890 static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
891  const float* input,
892  const float* taps,
893  unsigned int num_points)
894 {
895 
896  unsigned int quarter_points = num_points / 16;
897  float dotProduct = 0;
898  const float* aPtr = input;
899  const float* bPtr = taps;
900  unsigned int number = 0;
901 
902  float32x4x4_t a_val, b_val, accumulator0;
903  accumulator0.val[0] = vdupq_n_f32(0);
904  accumulator0.val[1] = vdupq_n_f32(0);
905  accumulator0.val[2] = vdupq_n_f32(0);
906  accumulator0.val[3] = vdupq_n_f32(0);
907  // factor of 4 loop unroll with independent accumulators
908  // uses 12 out of 16 neon q registers
909  for (number = 0; number < quarter_points; ++number) {
910  a_val = vld4q_f32(aPtr);
911  b_val = vld4q_f32(bPtr);
912  accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
913  accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
914  accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
915  accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
916  aPtr += 16;
917  bPtr += 16;
918  }
919  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
920  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
921  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
922  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
923  vst1q_f32(accumulator, accumulator0.val[0]);
924  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
925 
926  for (number = quarter_points * 16; number < num_points; number++) {
927  dotProduct += ((*aPtr++) * (*bPtr++));
928  }
929 
930  *result = dotProduct;
931 }
932 
933 #endif
934 
935 
936 #ifdef LV_HAVE_NEON
937 static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
938  const float* input,
939  const float* taps,
940  unsigned int num_points)
941 {
942 
943  unsigned int quarter_points = num_points / 8;
944  float dotProduct = 0;
945  const float* aPtr = input;
946  const float* bPtr = taps;
947  unsigned int number = 0;
948 
949  float32x4x2_t a_val, b_val, accumulator_val;
950  accumulator_val.val[0] = vdupq_n_f32(0);
951  accumulator_val.val[1] = vdupq_n_f32(0);
952  // factor of 2 loop unroll with independent accumulators
953  for (number = 0; number < quarter_points; ++number) {
954  a_val = vld2q_f32(aPtr);
955  b_val = vld2q_f32(bPtr);
956  accumulator_val.val[0] =
957  vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
958  accumulator_val.val[1] =
959  vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
960  aPtr += 8;
961  bPtr += 8;
962  }
963  accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
964  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
965  vst1q_f32(accumulator, accumulator_val.val[0]);
966  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
967 
968  for (number = quarter_points * 8; number < num_points; number++) {
969  dotProduct += ((*aPtr++) * (*bPtr++));
970  }
971 
972  *result = dotProduct;
973 }
974 
975 #endif /* LV_HAVE_NEON */
976 
977 #ifdef LV_HAVE_NEONV7
/*
 * Prototype only: the routine is defined outside this header and is declared
 * here when LV_HAVE_NEONV7 is set. It takes a writable float pointer
 * (cVector), two read-only float pointers (aVector, bVector) and an element
 * count (num_points).
 * NOTE(review): presumably the hand-written ARMv7 NEON assembly version of
 * this dot product, with cVector receiving the scalar result -- confirm
 * against the implementing source file.
 */
978 extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
979  const float* aVector,
980  const float* bVector,
981  unsigned int num_points);
982 #endif /* LV_HAVE_NEONV7 */
983 
984 #ifdef LV_HAVE_NEONV7
/*
 * Prototype only: the routine is defined outside this header and is declared
 * here when LV_HAVE_NEONV7 is set. It takes a writable float pointer
 * (cVector), two read-only float pointers (aVector, bVector) and an element
 * count (num_points).
 * NOTE(review): presumably a further-optimised ARMv7 NEON assembly version
 * of this dot product, with cVector receiving the scalar result -- confirm
 * against the implementing source file.
 */
985 extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
986  const float* aVector,
987  const float* bVector,
988  unsigned int num_points);
989 #endif /* LV_HAVE_NEONV7 */
990 
991 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:736
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:514
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:105
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:327
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:81
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:180
static void volk_32f_x2_dot_prod_32f_a_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:490
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:589
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:890
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:937
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56