Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32fc_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
76 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
77 #define INCLUDED_volk_32fc_index_max_16u_a_H
78 
79 #include <inttypes.h>
80 #include <limits.h>
81 #include <stdio.h>
82 #include <volk/volk_common.h>
83 #include <volk/volk_complex.h>
84 
85 #ifdef LV_HAVE_AVX2
86 #include <immintrin.h>
87 
88 static inline void
89 volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
90 {
91  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
92  const uint32_t num_bytes = num_points * 8;
93 
94  union bit256 holderf;
95  union bit256 holderi;
96  float sq_dist = 0.0;
97  float max = 0.0;
98  uint16_t index = 0;
99 
100  union bit256 xmm5, xmm4;
101  __m256 xmm1, xmm2, xmm3;
102  __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
103 
104  xmm5.int_vec = _mm256_setzero_si256();
105  xmm4.int_vec = _mm256_setzero_si256();
106  holderf.int_vec = _mm256_setzero_si256();
107  holderi.int_vec = _mm256_setzero_si256();
108 
109  int bound = num_bytes >> 6;
110  int i = 0;
111 
112  xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
113  xmm9 = _mm256_setzero_si256();
114  xmm10 = _mm256_set1_epi32(8);
115  xmm3 = _mm256_setzero_ps();
116 
117  __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
118  for (; i < bound; ++i) {
119  xmm1 = _mm256_load_ps((float*)src0);
120  xmm2 = _mm256_load_ps((float*)&src0[4]);
121 
122  src0 += 8;
123 
124  xmm1 = _mm256_mul_ps(xmm1, xmm1);
125  xmm2 = _mm256_mul_ps(xmm2, xmm2);
126 
127  xmm1 = _mm256_hadd_ps(xmm1, xmm2);
128  xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
129 
130  xmm3 = _mm256_max_ps(xmm1, xmm3);
131 
132  xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
133  xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
134 
135  xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
136  xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
137 
138  xmm9 = _mm256_add_epi32(xmm11, xmm12);
139 
140  xmm8 = _mm256_add_epi32(xmm8, xmm10);
141  }
142 
143  _mm256_store_ps((float*)&(holderf.f), xmm3);
144  _mm256_store_si256(&(holderi.int_vec), xmm9);
145 
146  for (i = 0; i < 8; i++) {
147  if (holderf.f[i] > max) {
148  index = holderi.i[i];
149  max = holderf.f[i];
150  }
151  }
152 
153  for (i = bound * 8; i < num_points; i++, src0++) {
154  sq_dist =
155  lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
156 
157  if (sq_dist > max) {
158  index = i;
159  max = sq_dist;
160  }
161  }
162  target[0] = index;
163 }
164 
165 #endif /*LV_HAVE_AVX2*/
166 
167 #ifdef LV_HAVE_SSE3
168 #include <pmmintrin.h>
169 #include <xmmintrin.h>
170 
171 static inline void
172 volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
173 {
174  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
175  const uint32_t num_bytes = num_points * 8;
176 
177  union bit128 holderf;
178  union bit128 holderi;
179  float sq_dist = 0.0;
180 
181  union bit128 xmm5, xmm4;
182  __m128 xmm1, xmm2, xmm3;
183  __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
184 
185  xmm5.int_vec = _mm_setzero_si128();
186  xmm4.int_vec = _mm_setzero_si128();
187  holderf.int_vec = _mm_setzero_si128();
188  holderi.int_vec = _mm_setzero_si128();
189 
190  int bound = num_bytes >> 5;
191  int i = 0;
192 
193  xmm8 = _mm_setr_epi32(0, 1, 2, 3);
194  xmm9 = _mm_setzero_si128();
195  xmm10 = _mm_setr_epi32(4, 4, 4, 4);
196  xmm3 = _mm_setzero_ps();
197 
198  for (; i < bound; ++i) {
199  xmm1 = _mm_load_ps((float*)src0);
200  xmm2 = _mm_load_ps((float*)&src0[2]);
201 
202  src0 += 4;
203 
204  xmm1 = _mm_mul_ps(xmm1, xmm1);
205  xmm2 = _mm_mul_ps(xmm2, xmm2);
206 
207  xmm1 = _mm_hadd_ps(xmm1, xmm2);
208 
209  xmm3 = _mm_max_ps(xmm1, xmm3);
210 
211  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
212  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
213 
214  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
215  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
216 
217  xmm9 = _mm_add_epi32(xmm11, xmm12);
218 
219  xmm8 = _mm_add_epi32(xmm8, xmm10);
220  }
221 
222  if (num_bytes >> 4 & 1) {
223  xmm2 = _mm_load_ps((float*)src0);
224 
225  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
226  xmm8 = bit128_p(&xmm1)->int_vec;
227 
228  xmm2 = _mm_mul_ps(xmm2, xmm2);
229 
230  src0 += 2;
231 
232  xmm1 = _mm_hadd_ps(xmm2, xmm2);
233 
234  xmm3 = _mm_max_ps(xmm1, xmm3);
235 
236  xmm10 = _mm_setr_epi32(2, 2, 2, 2);
237 
238  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
239  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
240 
241  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
242  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
243 
244  xmm9 = _mm_add_epi32(xmm11, xmm12);
245 
246  xmm8 = _mm_add_epi32(xmm8, xmm10);
247  }
248 
249  if (num_bytes >> 3 & 1) {
250  sq_dist =
251  lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
252 
253  xmm2 = _mm_load1_ps(&sq_dist);
254 
255  xmm1 = xmm3;
256 
257  xmm3 = _mm_max_ss(xmm3, xmm2);
258 
259  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
260  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
261 
262  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
263 
264  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
265  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
266 
267  xmm9 = _mm_add_epi32(xmm11, xmm12);
268  }
269 
270  _mm_store_ps((float*)&(holderf.f), xmm3);
271  _mm_store_si128(&(holderi.int_vec), xmm9);
272 
273  target[0] = holderi.i[0];
274  sq_dist = holderf.f[0];
275  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
276  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
277  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
278  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
279  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
280  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
281 }
282 
283 #endif /*LV_HAVE_SSE3*/
284 
285 #ifdef LV_HAVE_GENERIC
286 static inline void
287 volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
288 {
289  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
290 
291  const uint32_t num_bytes = num_points * 8;
292 
293  float sq_dist = 0.0;
294  float max = 0.0;
295  uint16_t index = 0;
296 
297  uint32_t i = 0;
298 
299  for (; i<num_bytes>> 3; ++i) {
300  sq_dist =
301  lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
302 
303  if (sq_dist > max) {
304  index = i;
305  max = sq_dist;
306  }
307  }
308  target[0] = index;
309 }
310 
311 #endif /*LV_HAVE_GENERIC*/
312 
313 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
314 
315 #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
316 #define INCLUDED_volk_32fc_index_max_16u_u_H
317 
318 #include <inttypes.h>
319 #include <limits.h>
320 #include <stdio.h>
321 #include <volk/volk_common.h>
322 #include <volk/volk_complex.h>
323 
324 #ifdef LV_HAVE_AVX2
325 #include <immintrin.h>
326 
327 static inline void
328 volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
329 {
330  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
331  const uint32_t num_bytes = num_points * 8;
332 
333  union bit256 holderf;
334  union bit256 holderi;
335  float sq_dist = 0.0;
336  float max = 0.0;
337  uint16_t index = 0;
338 
339  union bit256 xmm5, xmm4;
340  __m256 xmm1, xmm2, xmm3;
341  __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
342 
343  xmm5.int_vec = _mm256_setzero_si256();
344  xmm4.int_vec = _mm256_setzero_si256();
345  holderf.int_vec = _mm256_setzero_si256();
346  holderi.int_vec = _mm256_setzero_si256();
347 
348  int bound = num_bytes >> 6;
349  int i = 0;
350 
351  xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
352  xmm9 = _mm256_setzero_si256();
353  xmm10 = _mm256_set1_epi32(8);
354  xmm3 = _mm256_setzero_ps();
355 
356  __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
357  for (; i < bound; ++i) {
358  xmm1 = _mm256_loadu_ps((float*)src0);
359  xmm2 = _mm256_loadu_ps((float*)&src0[4]);
360 
361  src0 += 8;
362 
363  xmm1 = _mm256_mul_ps(xmm1, xmm1);
364  xmm2 = _mm256_mul_ps(xmm2, xmm2);
365 
366  xmm1 = _mm256_hadd_ps(xmm1, xmm2);
367  xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
368 
369  xmm3 = _mm256_max_ps(xmm1, xmm3);
370 
371  xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
372  xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
373 
374  xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
375  xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
376 
377  xmm9 = _mm256_add_epi32(xmm11, xmm12);
378 
379  xmm8 = _mm256_add_epi32(xmm8, xmm10);
380  }
381 
382  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
383  _mm256_storeu_si256(&(holderi.int_vec), xmm9);
384 
385  for (i = 0; i < 8; i++) {
386  if (holderf.f[i] > max) {
387  index = holderi.i[i];
388  max = holderf.f[i];
389  }
390  }
391 
392  for (i = bound * 8; i < num_points; i++, src0++) {
393  sq_dist =
394  lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
395 
396  if (sq_dist > max) {
397  index = i;
398  max = sq_dist;
399  }
400  }
401  target[0] = index;
402 }
403 
404 #endif /*LV_HAVE_AVX2*/
405 
406 #endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
Definition: volk_common.h:111
float f[4]
Definition: volk_common.h:115
__m128i int_vec
Definition: volk_common.h:123
uint32_t i[4]
Definition: volk_common.h:114
__m128 float_vec
Definition: volk_common.h:119
Definition: volk_common.h:128
float f[8]
Definition: volk_common.h:132
uint32_t i[8]
Definition: volk_common.h:131
__m256 float_vec
Definition: volk_common.h:136
__m256i int_vec
Definition: volk_common.h:137
static void volk_32fc_index_max_16u_a_sse3(uint16_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:172
static void volk_32fc_index_max_16u_generic(uint16_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:287
#define bit128_p(x)
Definition: volk_common.h:142
#define lv_cimag(x)
Definition: volk_complex.h:94
#define lv_creal(x)
Definition: volk_complex.h:92
float complex lv_32fc_t
Definition: volk_complex.h:70
for i
Definition: volk_config_fixed.tmpl.h:25