71 #ifndef INCLUDED_volk_32f_x2_max_32f_a_H
72 #define INCLUDED_volk_32f_x2_max_32f_a_H
77 #ifdef LV_HAVE_AVX512F
78 #include <immintrin.h>
/*!
 * \brief Element-wise maximum of two float vectors (AVX-512F, aligned access).
 *
 * Stores the larger of aVector[i] and bVector[i] into cVector[i] for every
 * i in [0, num_points).
 *
 * \param cVector    Output buffer; written with _mm512_store_ps, which
 *                   requires 64-byte alignment.
 * \param aVector    First input buffer; read with _mm512_load_ps (64-byte
 *                   aligned).
 * \param bVector    Second input buffer; read with _mm512_load_ps (64-byte
 *                   aligned).
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_max_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Step to the next group of 16 floats so that the scalar tail below
         * resumes exactly where the vector loop stopped. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements that do not fill a register. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
116 #include <xmmintrin.h>
/*!
 * \brief Element-wise maximum of two float vectors (SSE, aligned access).
 *
 * Stores the larger of aVector[i] and bVector[i] into cVector[i] for every
 * i in [0, num_points).
 *
 * \param cVector    Output buffer; written with _mm_store_ps, which requires
 *                   16-byte alignment.
 * \param aVector    First input buffer; read with _mm_load_ps (16-byte
 *                   aligned).
 * \param bVector    Second input buffer; read with _mm_load_ps (16-byte
 *                   aligned).
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_max_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_max_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step to the next group of 4 floats so that the scalar tail below
         * resumes exactly where the vector loop stopped. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the num_points % 4 elements that do not fill a register. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
154 #include <immintrin.h>
/*!
 * \brief Element-wise maximum of two float vectors (AVX, aligned access).
 *
 * Stores the larger of aVector[i] and bVector[i] into cVector[i] for every
 * i in [0, num_points).
 *
 * \param cVector    Output buffer; written with _mm256_store_ps, which
 *                   requires 32-byte alignment.
 * \param aVector    First input buffer; read with _mm256_load_ps (32-byte
 *                   aligned).
 * \param bVector    Second input buffer; read with _mm256_load_ps (32-byte
 *                   aligned).
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_max_32f_a_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_max_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Step to the next group of 8 floats so that the scalar tail below
         * resumes exactly where the vector loop stopped. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements that do not fill a register. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
192 #include <arm_neon.h>
/*!
 * \brief Element-wise maximum of two float vectors (ARM NEON).
 *
 * Processes four floats per iteration with vmaxq_f32, then handles the
 * remaining num_points % 4 elements with a scalar comparison.
 *
 * \param cVector    Output buffer receiving one result per input pair.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_max_32f_neon(float* cVector,
                                            const float* aVector,
                                            const float* bVector,
                                            unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    float32x4_t a_vec, b_vec, c_vec;
    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aPtr);
        b_vec = vld1q_f32(bPtr);
        c_vec = vmaxq_f32(a_vec, b_vec);
        vst1q_f32(cPtr, c_vec);

        /* Step to the next group of 4 floats so that the scalar tail below
         * resumes exactly where the vector loop stopped. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the leftover elements. */
    for (number = quarter_points * 4; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
225 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Element-wise maximum of two float vectors (portable scalar version).
 *
 * For every i in [0, num_points) writes aVector[i] to cVector[i] when
 * aVector[i] > bVector[i], and bVector[i] otherwise.
 *
 * \param cVector    Output buffer receiving one result per input pair.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_max_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
/*!
 * \brief Externally defined maximum kernel that the wrapper below calls.
 *
 * NOTE(review): the definition is not part of this header; the name suggests
 * it is ORC-generated -- confirm against the build.
 */
extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards every argument, unchanged, to
 *        volk_32f_x2_max_32f_a_orc_impl.
 *
 * \param cVector    Output buffer passed through to the implementation.
 * \param aVector    First input buffer passed through to the implementation.
 * \param bVector    Second input buffer passed through to the implementation.
 * \param num_points Element count passed through to the implementation.
 */
static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
264 #ifndef INCLUDED_volk_32f_x2_max_32f_u_H
265 #define INCLUDED_volk_32f_x2_max_32f_u_H
267 #include <inttypes.h>
270 #ifdef LV_HAVE_AVX512F
271 #include <immintrin.h>
/*!
 * \brief Element-wise maximum of two float vectors (AVX-512F, unaligned access).
 *
 * Stores the larger of aVector[i] and bVector[i] into cVector[i] for every
 * i in [0, num_points). Buffers are accessed with _mm512_loadu_ps and
 * _mm512_storeu_ps, so no particular alignment is required.
 *
 * \param cVector    Output buffer receiving one result per input pair.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_max_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Step to the next group of 16 floats so that the scalar tail below
         * resumes exactly where the vector loop stopped. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements that do not fill a register. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
309 #include <immintrin.h>
/*!
 * \brief Element-wise maximum of two float vectors (AVX, unaligned access).
 *
 * Stores the larger of aVector[i] and bVector[i] into cVector[i] for every
 * i in [0, num_points). Buffers are accessed with _mm256_loadu_ps and
 * _mm256_storeu_ps, so no particular alignment is required.
 *
 * \param cVector    Output buffer receiving one result per input pair.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_max_32f_u_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_max_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Step to the next group of 8 floats so that the scalar tail below
         * resumes exactly where the vector loop stopped. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements that do not fill a register. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
static void volk_32f_x2_max_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:311
static void volk_32f_x2_max_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:156
static void volk_32f_x2_max_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:194
static void volk_32f_x2_max_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:227
static void volk_32f_x2_max_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:118