Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32fc_x2_square_dist_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
78 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
79 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
80 
81 #include <inttypes.h>
82 #include <stdio.h>
83 #include <volk/volk_complex.h>
84 
85 #ifdef LV_HAVE_AVX2
86 #include <immintrin.h>
87 
/*!
 * Calculates the square distance between a single complex input for each
 * point in a vector of complex points: target[i] = |src0[0] - points[i]|^2.
 * Aligned AVX2 implementation: target and points must be 32-byte aligned.
 *
 * \param target     output buffer of num_points floats (squared distances)
 * \param src0       pointer to the single reference complex value
 * \param points     vector of num_points complex values to compare against
 * \param num_points number of entries in points / target
 */
static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8; /* 8 bytes per lv_32fc_t */
    __m128 xmm0, xmm9, xmm10;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    /* Decompose num_points into 8-, 4-, 2- and 1-point chunks:
     * num_points == bound*8 + leftovers0*4 + leftovers1*2 + leftovers2 */
    int bound = num_bytes >> 6;            /* num_points / 8 */
    int leftovers0 = (num_bytes >> 5) & 1; /* (num_points / 4) & 1 */
    int leftovers1 = (num_bytes >> 4) & 1; /* (num_points / 2) & 1 */
    int leftovers2 = (num_bytes >> 3) & 1; /* num_points & 1 */
    int i = 0;

    /* Permutation that reorders the cross-lane hadd result into
     * in-order distances d0..d7 (hadd interleaves 128-bit lanes). */
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    /* NOTE(review): these preloads read points[0..7] and 16 bytes from src0
     * unconditionally — presumably callers guarantee enough readable data
     * even for small num_points; confirm against VOLK alignment contract. */
    xmm2 = _mm256_load_ps((float*)&points[0]);
    xmm0 = _mm_load_ps((float*)src0);
    /* Replicate src0[0] as [re, im, re, im] ... */
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    /* ... then broadcast that pair into both 128-bit lanes of xmm1. */
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
    xmm3 = _mm256_load_ps((float*)&points[4]);

    /* Main loop: 8 complex points per iteration, software-pipelined so the
     * loads for the next iteration overlap the arithmetic of this one. */
    for (; i < bound; ++i) {
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        points += 8;
        xmm6 = _mm256_mul_ps(xmm4, xmm4); /* (re_d^2, im_d^2) pairs */
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm2 = _mm256_load_ps((float*)&points[0]);

        /* hadd sums each (re^2, im^2) pair; permute restores point order. */
        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm3 = _mm256_load_ps((float*)&points[4]);

        _mm256_store_ps(target, xmm4);

        target += 8;
    }

    /* 4-point remainder (runs at most once). */
    for (i = 0; i < leftovers0; ++i) {

        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        /* After the permute, the 4 in-order distances sit in the upper lane. */
        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_store_ps(target, xmm9);

        target += 4;
    }

    /* 2-point remainder (runs at most once), done with 128-bit ops. */
    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9); /* xmm0 still holds [re,im,re,im] */

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        /* High 64 bits hold the two distances. */
        _mm_storeh_pi((__m64*)target, xmm10);

        target += 2;
    }

    /* Final odd point, scalar. Pointers need no further advance: this loop
     * runs at most once. */
    for (i = 0; i < leftovers2; ++i) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}
177 
178 #endif /*LV_HAVE_AVX2*/
179 
180 #ifdef LV_HAVE_SSE3
181 #include <pmmintrin.h>
182 #include <xmmintrin.h>
183 
184 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
185  lv_32fc_t* src0,
186  lv_32fc_t* points,
187  unsigned int num_points)
188 {
189  const unsigned int num_bytes = num_points * 8;
190 
191  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
192 
193  lv_32fc_t diff;
194  float sq_dist;
195  int bound = num_bytes >> 5;
196  int i = 0;
197 
198  xmm1 = _mm_setzero_ps();
199  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
200  xmm2 = _mm_load_ps((float*)&points[0]);
201  xmm1 = _mm_movelh_ps(xmm1, xmm1);
202  xmm3 = _mm_load_ps((float*)&points[2]);
203 
204  for (; i < bound - 1; ++i) {
205  xmm4 = _mm_sub_ps(xmm1, xmm2);
206  xmm5 = _mm_sub_ps(xmm1, xmm3);
207  points += 4;
208  xmm6 = _mm_mul_ps(xmm4, xmm4);
209  xmm7 = _mm_mul_ps(xmm5, xmm5);
210 
211  xmm2 = _mm_load_ps((float*)&points[0]);
212 
213  xmm4 = _mm_hadd_ps(xmm6, xmm7);
214 
215  xmm3 = _mm_load_ps((float*)&points[2]);
216 
217  _mm_store_ps(target, xmm4);
218 
219  target += 4;
220  }
221 
222  xmm4 = _mm_sub_ps(xmm1, xmm2);
223  xmm5 = _mm_sub_ps(xmm1, xmm3);
224 
225  points += 4;
226  xmm6 = _mm_mul_ps(xmm4, xmm4);
227  xmm7 = _mm_mul_ps(xmm5, xmm5);
228 
229  xmm4 = _mm_hadd_ps(xmm6, xmm7);
230 
231  _mm_store_ps(target, xmm4);
232 
233  target += 4;
234 
235  if (num_bytes >> 4 & 1) {
236 
237  xmm2 = _mm_load_ps((float*)&points[0]);
238 
239  xmm4 = _mm_sub_ps(xmm1, xmm2);
240 
241  points += 2;
242 
243  xmm6 = _mm_mul_ps(xmm4, xmm4);
244 
245  xmm4 = _mm_hadd_ps(xmm6, xmm6);
246 
247  _mm_storeh_pi((__m64*)target, xmm4);
248 
249  target += 2;
250  }
251 
252  if (num_bytes >> 3 & 1) {
253 
254  diff = src0[0] - points[0];
255 
256  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
257 
258  target[0] = sq_dist;
259  }
260 }
261 
262 #endif /*LV_HAVE_SSE3*/
263 
264 
265 #ifdef LV_HAVE_NEON
266 #include <arm_neon.h>
/*!
 * Calculates the square distance between a single complex input for each
 * point in a vector of complex points: target[i] = |src0[0] - points[i]|^2.
 * NEON implementation processing four points per iteration.
 *
 * \param target     output buffer of num_points floats (squared distances)
 * \param src0       pointer to the single reference complex value
 * \param points     vector of num_points complex values to compare against
 * \param num_points number of entries in points / target
 */
static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int n;

    /* Splat the reference point's real and imaginary parts across lanes. */
    float32x4x2_t ref;
    ref.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    ref.val[1] = vdupq_n_f32(lv_cimag(src0[0]));

    for (n = 0; n < quarter_points; ++n) {
        /* vld2q de-interleaves 4 complex values into re/im vectors. */
        float32x4x2_t pts = vld2q_f32((float*)points);
        float32x4_t d_re = vsubq_f32(ref.val[0], pts.val[0]);
        float32x4_t d_im = vsubq_f32(ref.val[1], pts.val[1]);
        float32x4_t sq_re = vmulq_f32(d_re, d_re);
        float32x4_t sq_im = vmulq_f32(d_im, d_im);
        vst1q_f32(target, vaddq_f32(sq_re, sq_im));
        points += 4;
        target += 4;
    }

    /* Scalar tail for the remaining 0-3 points. */
    for (n = quarter_points * 4; n < num_points; ++n) {
        const lv_32fc_t d = src0[0] - *points++;
        *target++ = lv_creal(d) * lv_creal(d) + lv_cimag(d) * lv_cimag(d);
    }
}
297 #endif /* LV_HAVE_NEON */
298 
299 
300 #ifdef LV_HAVE_GENERIC
301 static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
302  lv_32fc_t* src0,
303  lv_32fc_t* points,
304  unsigned int num_points)
305 {
306  const unsigned int num_bytes = num_points * 8;
307 
308  lv_32fc_t diff;
309  float sq_dist;
310  unsigned int i = 0;
311 
312  for (; i<num_bytes>> 3; ++i) {
313  diff = src0[0] - points[i];
314 
315  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
316 
317  target[i] = sq_dist;
318  }
319 }
320 
321 #endif /*LV_HAVE_GENERIC*/
322 
323 
324 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
325 
326 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
327 #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
328 
329 #include <inttypes.h>
330 #include <stdio.h>
331 #include <volk/volk_complex.h>
332 
333 #ifdef LV_HAVE_AVX2
334 #include <immintrin.h>
335 
/*!
 * Calculates the square distance between a single complex input for each
 * point in a vector of complex points: target[i] = |src0[0] - points[i]|^2.
 * Unaligned AVX2 implementation (no alignment requirement on the buffers).
 *
 * \param target     output buffer of num_points floats (squared distances)
 * \param src0       pointer to the single reference complex value
 * \param points     vector of num_points complex values to compare against
 * \param num_points number of entries in points / target
 */
static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8; /* 8 bytes per lv_32fc_t */
    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    /* num_points == bound*8 + 4*((num_points/4)&1) + leftovers1 */
    int bound = num_bytes >> 6;              /* num_points / 8 */
    int leftovers1 = (num_bytes >> 3) & 0b11; /* num_points & 3 */
    int i = 0;

    /* Permutation that reorders the cross-lane hadd result into
     * in-order distances d0..d7 (hadd interleaves 128-bit lanes). */
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    /* NOTE(review): these preloads read points[0..7] and 16 bytes from src0
     * unconditionally — presumably callers guarantee enough readable data
     * even for small num_points; confirm against VOLK usage contract. */
    xmm2 = _mm256_loadu_ps((float*)&points[0]);
    xmm0 = _mm_loadu_ps((float*)src0);
    /* Replicate src0[0] as [re, im, re, im] ... */
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    /* ... then broadcast that pair into both 128-bit lanes of xmm1. */
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
    xmm3 = _mm256_loadu_ps((float*)&points[4]);

    /* Main loop: 8 complex points per iteration, software-pipelined so the
     * loads for the next iteration overlap the arithmetic of this one. */
    for (; i < bound; ++i) {
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        points += 8;
        xmm6 = _mm256_mul_ps(xmm4, xmm4); /* (re_d^2, im_d^2) pairs */
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        /* hadd sums each (re^2, im^2) pair; permute restores point order. */
        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm3 = _mm256_loadu_ps((float*)&points[4]);

        _mm256_storeu_ps(target, xmm4);

        target += 8;
    }

    /* 4-point remainder. */
    if (num_bytes >> 5 & 1) {

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        /* After the permute, the 4 in-order distances sit in the upper lane. */
        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);

        target += 4;
    }

    /* Remaining 0-3 points, scalar. */
    for (i = 0; i < leftovers1; ++i) {

        diff = src0[0] - points[0];
        points += 1;

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
        target += 1;
    }
}
409 
410 #endif /*LV_HAVE_AVX2*/
411 
412 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
static void volk_32fc_x2_square_dist_32f_generic(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:301
static void volk_32fc_x2_square_dist_32f_neon(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:267
static void volk_32fc_x2_square_dist_32f_a_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:184
#define lv_cimag(x)
Definition: volk_complex.h:94
#define lv_creal(x)
Definition: volk_complex.h:92
float complex lv_32fc_t
Definition: volk_complex.h:70
for i
Definition: volk_config_fixed.tmpl.h:25