76 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
77 #define INCLUDED_volk_32fc_index_max_16u_a_H
86 #include <immintrin.h>
/* AVX2 kernel using aligned 256-bit loads and stores. From the visible lines it
 * tracks, per SIMD lane, the largest pairwise sum of squares seen so far together
 * with the index it came from, then scans the eight lanes with scalar code.
 * NOTE(review): this listing is incomplete. The return type, the braces, the
 * declarations of xmm4, xmm5, holderf, holderi, i, max and index, any advance of
 * src0 inside the vector loop, and the final write to target are not visible
 * here; confirm against the full header. */
89 volk_32fc_index_max_16u_a_avx2(uint16_t* target,
lv_32fc_t* src0, uint32_t num_points)
/* target points to uint16_t, so the point count is capped at USHRT_MAX. */
91 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Each point is counted as 8 bytes. */
92 const uint32_t num_bytes = num_points * 8;
101 __m256 xmm1, xmm2, xmm3;
102 __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
/* Zero the two mask unions and the two spill unions before first use. */
104 xmm5.
int_vec = _mm256_setzero_si256();
105 xmm4.int_vec = _mm256_setzero_si256();
106 holderf.int_vec = _mm256_setzero_si256();
107 holderi.int_vec = _mm256_setzero_si256();
/* Number of whole 64-byte groups, i.e. 8 points per vector iteration. */
109 int bound = num_bytes >> 6;
/* xmm8: indices of the 8 points handled by the current iteration. */
112 xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/* xmm9: per-lane index of the running maximum, starting at 0. */
113 xmm9 = _mm256_setzero_si256();
/* xmm10: amount added to every index after each iteration. */
114 xmm10 = _mm256_set1_epi32(8);
/* xmm3: per-lane running maximum, starting at 0.0f. */
115 xmm3 = _mm256_setzero_ps();
/* _mm256_hadd_ps works within each 128-bit half, so its output interleaves
 * results from the two inputs; this permutation puts them back in point order. */
117 __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
/* Vector loop. The initialisation of i is not visible in this excerpt. */
118 for (;
i < bound; ++
i) {
/* Aligned loads of 8 floats each: points 0..3 and points 4..7 relative to src0. */
119 xmm1 = _mm256_load_ps((
float*)src0);
120 xmm2 = _mm256_load_ps((
float*)&src0[4]);
/* Square every float, then add adjacent pairs: one value per complex point. */
124 xmm1 = _mm256_mul_ps(xmm1, xmm1);
125 xmm2 = _mm256_mul_ps(xmm2, xmm2);
127 xmm1 = _mm256_hadd_ps(xmm1, xmm2);
128 xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
/* Update the running maximum first; the comparisons below use the updated value. */
130 xmm3 = _mm256_max_ps(xmm1, xmm3);
/* xmm4: lanes where the new value is below the maximum (keep the stored index).
 * xmm5: lanes where the new value equals the maximum (take the new index).
 * Because the maximum was already updated, a value equal to the previous
 * maximum also sets xmm5, so the later index replaces the earlier one. */
132 xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
133 xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
/* Mask-select: new index where xmm5 is set, old index where xmm4 is set. */
135 xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
136 xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
/* The two masks cannot both be set in one lane, so the add merges the halves. */
138 xmm9 = _mm256_add_epi32(xmm11, xmm12);
/* Advance the candidate indices by 8 for the next iteration. */
140 xmm8 = _mm256_add_epi32(xmm8, xmm10);
/* Spill the per-lane maxima and their indices for the scalar scan below. */
143 _mm256_store_ps((
float*)&(holderf.f), xmm3);
144 _mm256_store_si256(&(holderi.int_vec), xmm9);
/* Scalar scan over the 8 lanes. The update of max and the closing braces are
 * not visible in this excerpt. */
146 for (
i = 0;
i < 8;
i++) {
147 if (holderf.f[
i] > max) {
148 index = holderi.i[
i];
/* Scalar tail over the points left after the vector loop, starting at
 * bound * 8. The loop body is not visible in this excerpt. */
153 for (
i = bound * 8;
i < num_points;
i++, src0++) {
168 #include <pmmintrin.h>
169 #include <xmmintrin.h>
/* Partial body of an SSE3 kernel using aligned 128-bit loads; presumably
 * volk_32fc_index_max_16u_a_sse3, whose signature is not visible in this
 * excerpt (TODO confirm). It handles 4 points per vector iteration, then up to
 * two leftover points, then at most one last point, and finally reduces the four
 * lanes with scalar code.
 * NOTE(review): this listing is incomplete. The declarations of xmm4, xmm5,
 * holderf, holderi, i and sq_dist, every advance of src0, the closing braces and
 * several statements between the numbered lines are not visible here. */
/* The result is stored as a 16-bit value, so the point count is capped. */
174 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Each point is counted as 8 bytes. */
175 const uint32_t num_bytes = num_points * 8;
182 __m128 xmm1, xmm2, xmm3;
183 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
/* Zero the two mask unions and the two spill unions before first use. */
185 xmm5.
int_vec = _mm_setzero_si128();
186 xmm4.
int_vec = _mm_setzero_si128();
187 holderf.
int_vec = _mm_setzero_si128();
188 holderi.
int_vec = _mm_setzero_si128();
/* Number of whole 32-byte groups, i.e. 4 points per vector iteration. */
190 int bound = num_bytes >> 5;
/* xmm8: indices of the 4 points handled by the current iteration. */
193 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
/* xmm9: per-lane index of the running maximum, starting at 0. */
194 xmm9 = _mm_setzero_si128();
/* xmm10: amount added to every index after each iteration. */
195 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
/* xmm3: per-lane running maximum, starting at 0.0f. */
196 xmm3 = _mm_setzero_ps();
/* Vector loop. The initialisation of i is not visible in this excerpt. */
198 for (;
i < bound; ++
i) {
/* Aligned loads of 4 floats each: points 0..1 and points 2..3 relative to src0. */
199 xmm1 = _mm_load_ps((
float*)src0);
200 xmm2 = _mm_load_ps((
float*)&src0[2]);
/* Square every float, then add adjacent pairs: one value per complex point. */
204 xmm1 = _mm_mul_ps(xmm1, xmm1);
205 xmm2 = _mm_mul_ps(xmm2, xmm2);
207 xmm1 = _mm_hadd_ps(xmm1, xmm2);
/* Update the running maximum first; the comparisons below use the updated value. */
209 xmm3 = _mm_max_ps(xmm1, xmm3);
/* xmm4: lanes where the new value is below the maximum (keep the stored index).
 * xmm5: lanes where the new value equals the maximum (take the new index). */
211 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
212 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
/* Mask-select: new index where xmm5 is set, old index where xmm4 is set. */
214 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
215 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
/* The two masks cannot both be set in one lane, so the add merges the halves. */
217 xmm9 = _mm_add_epi32(xmm11, xmm12);
/* Advance the candidate indices by 4 for the next iteration. */
219 xmm8 = _mm_add_epi32(xmm8, xmm10);
/* Bit 4 of num_bytes set: a 16-byte group (two points) is left over.
 * The shift binds tighter than the bitwise AND, so this tests (num_bytes >> 4) & 1. */
222 if (num_bytes >> 4 & 1) {
223 xmm2 = _mm_load_ps((
float*)src0);
228 xmm2 = _mm_mul_ps(xmm2, xmm2);
/* Both operands are xmm2, so the two pair sums appear twice in the result.
 * NOTE(review): the original lines between the load and this point are not
 * visible here; confirm how xmm8 is prepared for the duplicated lanes. */
232 xmm1 = _mm_hadd_ps(xmm2, xmm2);
234 xmm3 = _mm_max_ps(xmm1, xmm3);
/* Only two points were consumed, so the index step becomes 2. */
236 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
238 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
239 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
241 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
242 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
244 xmm9 = _mm_add_epi32(xmm11, xmm12);
246 xmm8 = _mm_add_epi32(xmm8, xmm10);
/* Bit 3 of num_bytes set: one last point is left over. */
249 if (num_bytes >> 3 & 1) {
/* Broadcast sq_dist to all four lanes; its computation is not visible here. */
253 xmm2 = _mm_load1_ps(&sq_dist);
/* Only the lowest lane of the running maximum is updated. */
257 xmm3 = _mm_max_ss(xmm3, xmm2);
/* NOTE(review): in this branch the masks are used in the opposite roles from
 * the loops above (xmm4 selects the new index, xmm5 keeps the old one). That is
 * consistent only if xmm1 holds the previous maximum at this point; the
 * assignment to xmm1 is not visible in this excerpt, so confirm. */
259 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
260 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
/* Broadcast the lowest lane of xmm8 to all four lanes. */
262 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
264 xmm11 = _mm_and_si128(xmm8, xmm4.
int_vec);
265 xmm12 = _mm_and_si128(xmm9, xmm5.
int_vec);
267 xmm9 = _mm_add_epi32(xmm11, xmm12);
/* Spill the per-lane maxima and their indices for the scalar reduction below. */
270 _mm_store_ps((
float*)&(holderf.
f), xmm3);
271 _mm_store_si128(&(holderi.
int_vec), xmm9);
/* Unrolled scalar reduction over the four lanes. Start from lane 0, then take
 * lane k only when its value is strictly greater than the best so far, so the
 * lowest-numbered lane wins ties. sq_dist is reused as the best value so far. */
273 target[0] = holderi.
i[0];
274 sq_dist = holderf.
f[0];
275 target[0] = (holderf.
f[1] > sq_dist) ? holderi.
i[1] : target[0];
276 sq_dist = (holderf.
f[1] > sq_dist) ? holderf.
f[1] : sq_dist;
277 target[0] = (holderf.
f[2] > sq_dist) ? holderi.
i[2] : target[0];
278 sq_dist = (holderf.
f[2] > sq_dist) ? holderf.
f[2] : sq_dist;
279 target[0] = (holderf.
f[3] > sq_dist) ? holderi.
i[3] : target[0];
280 sq_dist = (holderf.
f[3] > sq_dist) ? holderf.
f[3] : sq_dist;
285 #ifdef LV_HAVE_GENERIC
/* Partial body of the portable kernel; presumably
 * volk_32fc_index_max_16u_generic, whose signature is not visible in this
 * excerpt (TODO confirm). Only the setup and the loop header are shown. */
/* Cap the point count at USHRT_MAX, as the vector kernels in this file do. */
289 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Each point is counted as 8 bytes. */
291 const uint32_t num_bytes = num_points * 8;
/* The shift binds tighter than the comparison, so this reads
 * i < (num_bytes >> 3), which equals the capped num_points.
 * The initialisation of i and the loop body are not visible in this excerpt. */
299 for (; i<num_bytes>> 3; ++
i) {
315 #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
316 #define INCLUDED_volk_32fc_index_max_16u_u_H
318 #include <inttypes.h>
325 #include <immintrin.h>
/* AVX2 kernel using unaligned 256-bit loads and stores. From the visible lines
 * it tracks, per SIMD lane, the largest pairwise sum of squares seen so far
 * together with the index it came from, then scans the eight lanes with scalar
 * code.
 * NOTE(review): this listing is incomplete. The return type, the braces, the
 * declarations of xmm4, xmm5, holderf, holderi, i, max and index, any advance of
 * src0 inside the vector loop, and the final write to target are not visible
 * here; confirm against the full header. */
328 volk_32fc_index_max_16u_u_avx2(uint16_t* target,
lv_32fc_t* src0, uint32_t num_points)
/* target points to uint16_t, so the point count is capped at USHRT_MAX. */
330 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Each point is counted as 8 bytes. */
331 const uint32_t num_bytes = num_points * 8;
340 __m256 xmm1, xmm2, xmm3;
341 __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
/* Zero the two mask unions and the two spill unions before first use. */
343 xmm5.
int_vec = _mm256_setzero_si256();
344 xmm4.
int_vec = _mm256_setzero_si256();
345 holderf.
int_vec = _mm256_setzero_si256();
346 holderi.
int_vec = _mm256_setzero_si256();
/* Number of whole 64-byte groups, i.e. 8 points per vector iteration. */
348 int bound = num_bytes >> 6;
/* xmm8: indices of the 8 points handled by the current iteration. */
351 xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/* xmm9: per-lane index of the running maximum, starting at 0. */
352 xmm9 = _mm256_setzero_si256();
/* xmm10: amount added to every index after each iteration. */
353 xmm10 = _mm256_set1_epi32(8);
/* xmm3: per-lane running maximum, starting at 0.0f. */
354 xmm3 = _mm256_setzero_ps();
/* _mm256_hadd_ps works within each 128-bit half, so its output interleaves
 * results from the two inputs; this permutation puts them back in point order. */
356 __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
/* Vector loop. The initialisation of i is not visible in this excerpt. */
357 for (;
i < bound; ++
i) {
/* Unaligned loads of 8 floats each: points 0..3 and points 4..7 relative to src0. */
358 xmm1 = _mm256_loadu_ps((
float*)src0);
359 xmm2 = _mm256_loadu_ps((
float*)&src0[4]);
/* Square every float, then add adjacent pairs: one value per complex point. */
363 xmm1 = _mm256_mul_ps(xmm1, xmm1);
364 xmm2 = _mm256_mul_ps(xmm2, xmm2);
366 xmm1 = _mm256_hadd_ps(xmm1, xmm2);
367 xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
/* Update the running maximum first; the comparisons below use the updated value. */
369 xmm3 = _mm256_max_ps(xmm1, xmm3);
/* xmm4: lanes where the new value is below the maximum (keep the stored index).
 * xmm5: lanes where the new value equals the maximum (take the new index).
 * Because the maximum was already updated, a value equal to the previous
 * maximum also sets xmm5, so the later index replaces the earlier one. */
371 xmm4.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
372 xmm5.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
/* Mask-select: new index where xmm5 is set, old index where xmm4 is set. */
374 xmm11 = _mm256_and_si256(xmm8, xmm5.
int_vec);
375 xmm12 = _mm256_and_si256(xmm9, xmm4.
int_vec);
/* The two masks cannot both be set in one lane, so the add merges the halves. */
377 xmm9 = _mm256_add_epi32(xmm11, xmm12);
/* Advance the candidate indices by 8 for the next iteration. */
379 xmm8 = _mm256_add_epi32(xmm8, xmm10);
/* Spill the per-lane maxima and their indices for the scalar scan below. */
382 _mm256_storeu_ps((
float*)&(holderf.
f), xmm3);
383 _mm256_storeu_si256(&(holderi.
int_vec), xmm9);
/* Scalar scan over the 8 lanes. The update of max and the closing braces are
 * not visible in this excerpt. */
385 for (
i = 0;
i < 8;
i++) {
386 if (holderf.
f[
i] > max) {
387 index = holderi.
i[
i];
/* Scalar tail over the points left after the vector loop, starting at
 * bound * 8. The loop body is not visible in this excerpt. */
392 for (
i = bound * 8;
i < num_points;
i++, src0++) {
Definition: volk_common.h:111
float f[4]
Definition: volk_common.h:115
__m128i int_vec
Definition: volk_common.h:123
uint32_t i[4]
Definition: volk_common.h:114
__m128 float_vec
Definition: volk_common.h:119
Definition: volk_common.h:128
float f[8]
Definition: volk_common.h:132
uint32_t i[8]
Definition: volk_common.h:131
__m256 float_vec
Definition: volk_common.h:136
__m256i int_vec
Definition: volk_common.h:137
static void volk_32fc_index_max_16u_a_sse3(uint16_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:172
static void volk_32fc_index_max_16u_generic(uint16_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:287
#define bit128_p(x)
Definition: volk_common.h:142
#define lv_cimag(x)
Definition: volk_complex.h:94
#define lv_creal(x)
Definition: volk_complex.h:92
float complex lv_32fc_t
Definition: volk_complex.h:70
for i
Definition: volk_config_fixed.tmpl.h:25