86 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
87 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
95 #ifdef LV_HAVE_GENERIC
/*
 * Fragment of volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic.
 * For every point it writes a + conj(b) * scalar to the output, reading and
 * writing through the aPtr/bPtr/cPtr cursors.
 * NOTE(review): this listing is an incomplete extract; the head of the
 * signature, the aPtr/bPtr/cPtr declarations and the closing braces are not
 * visible here.
 */
102 unsigned int num_points)
// Count of points still to be processed; starts at the full length.
107 unsigned int number = num_points;
// Manually unrolled path: eight points per pass while at least eight remain.
// NOTE(review): the statement that reduces `number` inside this loop is not
// visible in this extract (presumably a decrement by 8) - confirm.
110 while (number >= 8) {
111 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
112 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
113 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
114 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
115 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
116 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
117 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
118 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
// Leftover path: one point per pass until the counter reaches zero.
123 while (number-- > 0) {
124 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
131 #include <immintrin.h>
/*
 * Fragment of volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx.
 * Computes c = a + conj(b) * scalar, four complex points per 256-bit vector,
 * using unaligned loads and stores; leftover points use the scalar formula.
 * NOTE(review): this listing is an incomplete extract; the head of the
 * signature, the declarations of i/x/y/z/s/a/b/c, the advance of a/b/c inside
 * the vector loop and the closing braces are not visible here.
 */
139 unsigned int num_points)
141 unsigned int number = 0;
// Number of full four-point vector iterations.
143 const unsigned int quarterPoints = num_points / 4;
// Despite the name, this is num_points mod 4: the count of points left over
// after the vector loop (0..3), used as the length of the tail loop below.
144 unsigned int isodd = num_points & 3;
// Four copies of the scalar so it fills one 256-bit register.
147 lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
154 s = _mm256_loadu_ps((
float*)v_scalar);
156 for (; number < quarterPoints; number++) {
// x <- four points of b, y <- four points of a (no alignment requirement).
157 x = _mm256_loadu_ps((
float*)b);
158 y = _mm256_loadu_ps((
float*)a);
// NOTE(review): the statement that sets z before this add (listing line 159)
// is not visible; presumably the conjugate-multiply of x by s - confirm.
160 z = _mm256_add_ps(y, z);
161 _mm256_storeu_ps((
float*)c, z);
// Tail: process the last `isodd` points one at a time.
168 for (
i = num_points - isodd;
i < num_points;
i++) {
169 *c++ = (*a++) +
lv_conj(*b++) * scalar;
176 #include <pmmintrin.h>
/*
 * Fragment of volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3.
 * Computes c = a + conj(b) * scalar, two complex points per 128-bit vector,
 * using unaligned loads and stores; a single leftover point uses the scalar
 * formula.
 * NOTE(review): this listing is an incomplete extract; the head of the
 * signature, the declarations of x/y/z/s/a/b/c, the advance of a/b/c inside
 * the vector loop and the closing braces are not visible here.
 */
184 unsigned int num_points)
186 unsigned int number = 0;
// Number of full two-point vector iterations.
187 const unsigned int halfPoints = num_points / 2;
// Two copies of the scalar so it fills one 128-bit register.
190 lv_32fc_t v_scalar[2] = { scalar, scalar };
197 s = _mm_loadu_ps((
float*)v_scalar);
199 for (; number < halfPoints; number++) {
// x <- two points of b, y <- two points of a (no alignment requirement).
200 x = _mm_loadu_ps((
float*)b);
201 y = _mm_loadu_ps((
float*)a);
// NOTE(review): the statement that sets z before this add (listing line 202)
// is not visible; presumably the conjugate-multiply of x by s - confirm.
203 z = _mm_add_ps(y, z);
204 _mm_storeu_ps((
float*)c, z);
// Odd length: one point remains after the vector loop.
211 if ((num_points % 2) != 0) {
212 *c = *a +
lv_conj(*b) * scalar;
219 #include <immintrin.h>
/*
 * Fragment of volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx.
 * Same computation as the unaligned AVX fragment (c = a + conj(b) * scalar,
 * four complex points per 256-bit vector), but a, b and c are accessed with
 * the aligned load/store intrinsics, which require 32-byte-aligned addresses.
 * NOTE(review): this listing is an incomplete extract; the head of the
 * signature, the declarations of i/x/y/z/s/a/b/c, the advance of a/b/c inside
 * the vector loop and the closing braces are not visible here.
 */
227 unsigned int num_points)
229 unsigned int number = 0;
// Number of full four-point vector iterations.
231 const unsigned int quarterPoints = num_points / 4;
// Despite the name, this is num_points mod 4: the count of points left over
// after the vector loop (0..3), used as the length of the tail loop below.
232 unsigned int isodd = num_points & 3;
// Four copies of the scalar so it fills one 256-bit register.
235 lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
// The local array is read with the unaligned load even in this aligned variant.
242 s = _mm256_loadu_ps((
float*)v_scalar);
244 for (; number < quarterPoints; number++) {
// Aligned loads: b and a must be 32-byte aligned here.
245 x = _mm256_load_ps((
float*)b);
246 y = _mm256_load_ps((
float*)a);
// NOTE(review): the statement that sets z before this add (listing line 247)
// is not visible; presumably the conjugate-multiply of x by s - confirm.
248 z = _mm256_add_ps(y, z);
// Aligned store: c must be 32-byte aligned here.
249 _mm256_store_ps((
float*)c, z);
// Tail: process the last `isodd` points one at a time.
256 for (
i = num_points - isodd;
i < num_points;
i++) {
257 *c++ = (*a++) +
lv_conj(*b++) * scalar;
264 #include <pmmintrin.h>
/*
 * Fragment of volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3.
 * Same computation as the unaligned SSE3 fragment (c = a + conj(b) * scalar,
 * two complex points per 128-bit vector), but a, b and c are accessed with
 * the aligned load/store intrinsics, which require 16-byte-aligned addresses.
 * NOTE(review): this listing is an incomplete extract; the head of the
 * signature, the declarations of x/y/z/s/a/b/c, the advance of a/b/c inside
 * the vector loop and the closing braces are not visible here.
 */
272 unsigned int num_points)
274 unsigned int number = 0;
// Number of full two-point vector iterations.
275 const unsigned int halfPoints = num_points / 2;
// Two copies of the scalar so it fills one 128-bit register.
278 lv_32fc_t v_scalar[2] = { scalar, scalar };
// The local array is read with the unaligned load even in this aligned variant.
285 s = _mm_loadu_ps((
float*)v_scalar);
287 for (; number < halfPoints; number++) {
// Aligned loads: b and a must be 16-byte aligned here.
288 x = _mm_load_ps((
float*)b);
289 y = _mm_load_ps((
float*)a);
// NOTE(review): the statement that sets z before this add (listing line 290)
// is not visible; presumably the conjugate-multiply of x by s - confirm.
291 z = _mm_add_ps(y, z);
// Aligned store: c must be 16-byte aligned here.
292 _mm_store_ps((
float*)c, z);
// Odd length: one point remains after the vector loop.
299 if ((num_points % 2) != 0) {
300 *c = *a +
lv_conj(*b) * scalar;
307 #include <arm_neon.h>
/*
 * Fragment of volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon.
 * Computes c = a + conj(b) * scalar, four complex points per iteration, with
 * the real and imaginary parts held in separate 4-lane registers; leftover
 * points use the scalar formula.
 * NOTE(review): this listing is an incomplete extract; the head of the
 * signature, the aPtr/bPtr/cPtr declarations, the advance of those pointers
 * inside the vector loop and the closing braces are not visible here.
 */
314 unsigned int num_points)
// This initial value is overwritten by the loop initialisers below.
319 unsigned int number = num_points;
// Number of full four-point vector iterations.
320 unsigned int quarter_points = num_points / 4;
322 float32x4x2_t a_val, b_val, c_val, scalar_val;
323 float32x4x2_t tmp_val;
// Broadcast the first float of `scalar` to all lanes of val[0] and the second
// float to all lanes of val[1] (real and imaginary part respectively, given
// lv_32fc_t is `float complex`).
325 scalar_val.val[0] = vld1q_dup_f32((
const float*)&scalar);
326 scalar_val.val[1] = vld1q_dup_f32(((
const float*)&scalar) + 1);
328 for (number = 0; number < quarter_points; ++number) {
// De-interleaving loads: val[0] receives the four real parts, val[1] the four
// imaginary parts.
329 a_val = vld2q_f32((
float*)aPtr);
330 b_val = vld2q_f32((
float*)bPtr);
// Negate the imaginary lanes of b, i.e. form conj(b).
331 b_val.val[1] = vnegq_f32(b_val.val[1]);
// Complex product conj(b) * scalar, built in two steps:
//   imag = b_im' * s_re, real = b_re * s_re   (b_im' is the negated imaginary part)
335 tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
336 tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
//   imag += b_re * s_im, real -= b_im' * s_im
338 tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
339 tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
// Add a to the product, component-wise.
341 c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
342 c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
// Interleaving store: writes the four results back as (re, im) pairs.
344 vst2q_f32((
float*)cPtr, c_val);
// Tail: process the remaining num_points mod 4 points one at a time.
351 for (number = quarter_points * 4; number < num_points; number++) {
352 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:223
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:98
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:180
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:268
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:135
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:310
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:51
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define lv_conj(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:70
for i
Definition: volk_config_fixed.tmpl.h:25
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:44