#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        result[n] = in_a[n] * in_b[n];
    }
}
#endif /* LV_HAVE_GENERIC */
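
/* For reference, a scalar sketch of the per-element operation that the
 * protokernels in this header implement: the standard complex product, with
 * each 16-bit component computed in plain integer arithmetic. The helper
 * below is illustrative only (not part of the VOLK API); it assumes the
 * lv_cmake/lv_creal/lv_cimag helpers from volk_complex.h. */
static inline lv_16sc_t multiply_one_16ic_example(lv_16sc_t a, lv_16sc_t b)
{
    return lv_cmake(
        (int16_t)(lv_creal(a) * lv_creal(b) - lv_cimag(a) * lv_cimag(b)),
        (int16_t)(lv_creal(a) * lv_cimag(b) + lv_cimag(a) * lv_creal(b)));
}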

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        // aligned load of (2 byte imag, 2 byte real) x 4 into a 128-bit register
        a = _mm_load_si128((__m128i*)_in_a);
        b = _mm_load_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a.r*b.r and a.i*b.i products in adjacent lanes

        c_sr = _mm_srli_si128(c, 2); // shift right by 2 bytes, shifting in zeros

        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real, mask_real); // a.r*b.r - a.i*b.i in the real lanes

        b_sl = _mm_slli_si128(b, 2);
        a_sl = _mm_slli_si128(a, 2);

        imag1 = _mm_mullo_epi16(a, b_sl); // a.i*b.r products
        imag2 = _mm_mullo_epi16(b, a_sl); // b.i*a.r products

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a.r*b.i + a.i*b.r in the imag lanes

        result = _mm_or_si128(real, imag);

        _mm_store_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */
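
/* Note on overflow behavior: in the SIMD paths the per-lane 16-bit products
 * from _mm_mullo_epi16 keep only the low 16 bits (wrap-around), while the
 * final cross-term add and real-part subtract use the saturating
 * _mm_adds_epi16/_mm_subs_epi16, so results can differ from the generic
 * kernel when intermediate values overflow 16 bits. */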

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        // unaligned load of (2 byte imag, 2 byte real) x 4 into a 128-bit register
        a = _mm_loadu_si128((__m128i*)_in_a);
        b = _mm_loadu_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b);

        c_sr = _mm_srli_si128(c, 2); // shift right by 2 bytes, shifting in zeros

        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real, mask_real); // a.r*b.r - a.i*b.i in the real lanes

        b_sl = _mm_slli_si128(b, 2);
        a_sl = _mm_slli_si128(a, 2);

        imag1 = _mm_mullo_epi16(a, b_sl);
        imag2 = _mm_mullo_epi16(b, a_sl);

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a.r*b.i + a.i*b.r in the imag lanes

        result = _mm_or_si128(real, imag);

        _mm_storeu_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        // unaligned load of (2 byte imag, 2 byte real) x 8 into a 256-bit register
        a = _mm256_loadu_si256((__m256i*)_in_a);
        b = _mm256_loadu_si256((__m256i*)_in_b);

        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift right by 2 bytes within each 128-bit lane

        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(real, mask_real); // a.r*b.r - a.i*b.i in the real lanes

        b_sl = _mm256_slli_si256(b, 2);
        a_sl = _mm256_slli_si256(a, 2);

        imag1 = _mm256_mullo_epi16(a, b_sl);
        imag2 = _mm256_mullo_epi16(b, a_sl);

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a.r*b.i + a.i*b.r in the imag lanes

        result = _mm256_or_si256(real, imag);

        _mm256_storeu_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        // aligned load of (2 byte imag, 2 byte real) x 8 into a 256-bit register
        a = _mm256_load_si256((__m256i*)_in_a);
        b = _mm256_load_si256((__m256i*)_in_b);

        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift right by 2 bytes within each 128-bit lane

        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(real, mask_real); // a.r*b.r - a.i*b.i in the real lanes

        b_sl = _mm256_slli_si256(b, 2);
        a_sl = _mm256_slli_si256(a, 2);

        imag1 = _mm256_mullo_epi16(a, b_sl);
        imag2 = _mm256_mullo_epi16(b, a_sl);

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a.r*b.i + a.i*b.r in the imag lanes

        result = _mm256_or_si256(real, imag);

        _mm256_store_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // deinterleave: val[0] holds reals, val[1] imags
        b_val = vld2_s16((int16_t*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply real*real and imag*imag to form the real part
        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

        // multiply the cross terms to form the imaginary part
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // combine and store the results
        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
        vst2_s16((int16_t*)out, c_val);

        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_16ic_x2_multiply_16ic_H */
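
/* Example usage through the VOLK dispatcher (a sketch, not taken from this
 * header; it assumes the generated volk_16ic_x2_multiply_16ic dispatcher and
 * the volk_malloc/volk_free helpers from <volk/volk.h>):
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int num_points = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 *   lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 *   lv_16sc_t* out = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 *
 *   // ... fill in_a and in_b ...
 *
 *   volk_16ic_x2_multiply_16ic(out, in_a, in_b, num_points);
 *
 *   volk_free(in_a);
 *   volk_free(in_b);
 *   volk_free(out);
 */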