Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32i_x2_or_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
81 #ifndef INCLUDED_volk_32i_x2_or_32i_a_H
82 #define INCLUDED_volk_32i_x2_or_32i_a_H
83 
84 #include <inttypes.h>
85 #include <stdio.h>
86 
87 #ifdef LV_HAVE_AVX512F
88 #include <immintrin.h>
89 
/*!
 * \brief Element-wise bitwise OR of two int32_t buffers (AVX-512F, aligned).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Whole groups of 16 elements go through 512-bit aligned load/store
 * intrinsics; the Intel intrinsics reference requires 64-byte-aligned
 * addresses for those. Any remaining elements are handled one at a time.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
                                                const int32_t* aVector,
                                                const int32_t* bVector,
                                                unsigned int num_points)
{
    const unsigned int lanes = 16;
    /* Largest multiple of 16 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / lanes) * lanes;
    unsigned int idx;

    for (idx = 0; idx < vector_end; idx += lanes) {
        const __m512i lhs = _mm512_load_si512(aVector + idx);
        const __m512i rhs = _mm512_load_si512(bVector + idx);

        _mm512_store_si512(cVector + idx, _mm512_or_si512(lhs, rhs));
    }

    /* Scalar tail: idx already sits on the first element not covered above. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
122 #endif /* LV_HAVE_AVX512F */
123 
124 #ifdef LV_HAVE_AVX2
125 #include <immintrin.h>
126 
/*!
 * \brief Element-wise bitwise OR of two int32_t buffers (AVX2, aligned).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Whole groups of 8 elements go through 256-bit aligned load/store
 * intrinsics; the Intel intrinsics reference requires 32-byte-aligned
 * addresses for those. Any remaining elements are handled one at a time.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    const unsigned int lanes = 8;
    /* Largest multiple of 8 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / lanes) * lanes;
    unsigned int idx;

    for (idx = 0; idx < vector_end; idx += lanes) {
        const __m256i lhs = _mm256_load_si256((const __m256i*)(aVector + idx));
        const __m256i rhs = _mm256_load_si256((const __m256i*)(bVector + idx));

        _mm256_store_si256((__m256i*)(cVector + idx), _mm256_or_si256(lhs, rhs));
    }

    /* Scalar tail: idx already sits on the first element not covered above. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
160 #endif /* LV_HAVE_AVX2 */
161 
162 
163 #ifdef LV_HAVE_SSE
164 #include <xmmintrin.h>
165 
/*!
 * \brief Element-wise bitwise OR of two int32_t buffers (SSE, aligned).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Whole groups of 4 elements are moved through 128-bit float registers and
 * combined with _mm_or_ps. Only the bit patterns are loaded, OR-ed and
 * stored, so the float typing is just a vehicle for the integer data.
 * _mm_load_ps/_mm_store_ps require 16-byte-aligned addresses per the Intel
 * intrinsics reference. Any remaining elements are handled one at a time.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    const unsigned int lanes = 4;
    /* Largest multiple of 4 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / lanes) * lanes;

    /* Float-typed views of the buffers, as required by the _ps intrinsics. */
    const float* lhs_bits = (const float*)aVector;
    const float* rhs_bits = (const float*)bVector;
    float* out_bits = (float*)cVector;
    unsigned int idx;

    for (idx = 0; idx < vector_end; idx += lanes) {
        const __m128 lhs = _mm_load_ps(lhs_bits + idx);
        const __m128 rhs = _mm_load_ps(rhs_bits + idx);

        _mm_store_ps(out_bits + idx, _mm_or_ps(lhs, rhs));
    }

    /* Scalar tail: idx already sits on the first element not covered above. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
197 #endif /* LV_HAVE_SSE */
198 
199 
200 #ifdef LV_HAVE_NEON
201 #include <arm_neon.h>
202 
/*!
 * \brief Element-wise bitwise OR of two int32_t buffers (NEON).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Whole groups of 4 elements are combined with vorrq_s32; any remaining
 * elements are handled one at a time.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points)
{
    const unsigned int lanes = 4;
    /* Largest multiple of 4 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / lanes) * lanes;
    unsigned int idx;

    for (idx = 0; idx < vector_end; idx += lanes) {
        const int32x4_t lhs = vld1q_s32(aVector + idx);
        const int32x4_t rhs = vld1q_s32(bVector + idx);

        vst1q_s32(cVector + idx, vorrq_s32(lhs, rhs));
    }

    /* Scalar tail: idx already sits on the first element not covered above. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
230 #endif /* LV_HAVE_NEON */
231 
232 
233 #ifdef LV_HAVE_GENERIC
234 
/*!
 * \brief Element-wise bitwise OR of two int32_t buffers (portable C).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Nothing is read or written when num_points is 0.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
249 #endif /* LV_HAVE_GENERIC */
250 
251 
252 #ifdef LV_HAVE_ORC
/* Defined outside this header; only the prototype is visible here. */
extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
                                          const int32_t* aVector,
                                          const int32_t* bVector,
                                          unsigned int num_points);

/*!
 * \brief ORC entry point: forwards to volk_32i_x2_or_32i_a_orc_impl().
 *
 * All four arguments are passed through unchanged, in the same order.
 *
 * NOTE(review): this wrapper carries the `_u_` name while the implementation
 * it calls is named `_a_`; confirm the implementation places no alignment
 * requirement on the buffers.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
265 #endif /* LV_HAVE_ORC */
266 
267 
268 #endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
269 
270 
271 #ifndef INCLUDED_volk_32i_x2_or_32i_u_H
272 #define INCLUDED_volk_32i_x2_or_32i_u_H
273 
274 #include <inttypes.h>
275 #include <stdio.h>
276 
277 #ifdef LV_HAVE_AVX512F
278 #include <immintrin.h>
279 
/*!
 * \brief Element-wise bitwise OR of two int32_t buffers (AVX-512F, unaligned).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Whole groups of 16 elements go through the unaligned 512-bit load/store
 * intrinsics (_mm512_loadu_si512 / _mm512_storeu_si512). Any remaining
 * elements are handled one at a time.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
                                                const int32_t* aVector,
                                                const int32_t* bVector,
                                                unsigned int num_points)
{
    const unsigned int lanes = 16;
    /* Largest multiple of 16 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / lanes) * lanes;
    unsigned int idx;

    for (idx = 0; idx < vector_end; idx += lanes) {
        const __m512i lhs = _mm512_loadu_si512(aVector + idx);
        const __m512i rhs = _mm512_loadu_si512(bVector + idx);

        _mm512_storeu_si512(cVector + idx, _mm512_or_si512(lhs, rhs));
    }

    /* Scalar tail: idx already sits on the first element not covered above. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
312 #endif /* LV_HAVE_AVX512F */
313 
314 #ifdef LV_HAVE_AVX2
315 #include <immintrin.h>
316 
/*!
 * \brief Element-wise bitwise OR of two int32_t buffers (AVX2, unaligned).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Whole groups of 8 elements go through the unaligned 256-bit load/store
 * intrinsics (_mm256_loadu_si256 / _mm256_storeu_si256). Any remaining
 * elements are handled one at a time.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    const unsigned int lanes = 8;
    /* Largest multiple of 8 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / lanes) * lanes;
    unsigned int idx;

    for (idx = 0; idx < vector_end; idx += lanes) {
        const __m256i lhs = _mm256_loadu_si256((const __m256i*)(aVector + idx));
        const __m256i rhs = _mm256_loadu_si256((const __m256i*)(bVector + idx));

        _mm256_storeu_si256((__m256i*)(cVector + idx), _mm256_or_si256(lhs, rhs));
    }

    /* Scalar tail: idx already sits on the first element not covered above. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
350 #endif /* LV_HAVE_AVX2 */
351 
352 
353 #endif /* INCLUDED_volk_32i_x2_or_32i_u_H */
static void volk_32i_x2_or_32i_neon(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:203
static void volk_32i_x2_or_32i_generic(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:235
static void volk_32i_x2_or_32i_a_sse(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:166