66 #ifndef INCLUDED_volk_32u_byteswap_u_H
67 #define INCLUDED_volk_32u_byteswap_u_H
73 #include <immintrin.h>
/**
 * Reverse the byte order of every 32-bit word in a buffer, in place (AVX2,
 * no alignment requirement).
 *
 * @param intsToSwap  Buffer of 32-bit words; overwritten with the swapped words.
 * @param num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 8; /* 32-bit words per 256-bit register */
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = intsToSwap;

    /* Byte-reversal pattern for each 4-byte group. _mm256_shuffle_epi8 works
     * on the two 128-bit lanes independently and only looks at the low four
     * bits of each index, so entries 16..31 select bytes within the upper
     * lane. */
    const uint8_t shuffleVector[32] = { 3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
                                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
                                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };

    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    for (number = 0; number < nSets; number++) {
        /* Load, shuffle and store back to the same location (in place). */
        const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        _mm256_storeu_si256((__m256i*)inputPtr, output);

        /* Advance to the next group of eight words. */
        inputPtr += nPerSet;
    }

    /* Scalar byteswap for the words that do not fill a whole register. */
    for (number = nSets * nPerSet; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
115 #include <emmintrin.h>
/**
 * Reverse the byte order of every 32-bit word in a buffer, in place (SSE2,
 * no alignment requirement).
 *
 * @param intsToSwap  Buffer of 32-bit words; overwritten with the swapped words.
 * @param num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number = 0;

    uint32_t* inputPtr = intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);

    const uint64_t quarterPoints = num_points / 4;
    for (; number < quarterPoints; number++) {
        /* Load four words; they are written back to the same address. */
        input = _mm_loadu_si128((__m128i*)inputPtr);

        /* Move each source byte to its mirrored position within the word. */
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);

        /* byte1/byte4 are already clean after a 24-bit shift; the 8-bit
         * shifts carry neighbouring bytes along and must be masked. */
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        _mm_storeu_si128((__m128i*)inputPtr, output);

        /* Advance to the next group of four words. */
        inputPtr += 4;
    }

    /* Scalar byteswap for the remaining (num_points % 4) words. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
160 #include <arm_neon.h>
/**
 * Reverse the byte order of every 32-bit word in a buffer, in place (NEON,
 * table-lookup based).
 *
 * @param intsToSwap  Buffer of 32-bit words; overwritten with the swapped words.
 * @param num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap,
                                          unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;
    unsigned int number = 0;
    unsigned int n8points = num_points / 8;

    uint8x8x4_t input_table;
    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;

    /* vld4_u8 de-interleaves 32 bytes so that table entries 0..7 hold byte 0
     * of words 0..7, entries 8..15 hold byte 1, 16..23 byte 2 and 24..31
     * byte 3. Each constant below packs eight table indices (lowest byte
     * first) that rebuild two byte-reversed words:
     *   lookup01 = {24, 16, 8, 0, 25, 17, 9, 1}  (74609667900706840)
     *   lookup23 = {26, 18, 10, 2, 27, 19, 11, 3} (219290013576860186)
     *   lookup45 = {28, 20, 12, 4, 29, 21, 13, 5} (363970359253013532)
     *   lookup67 = {30, 22, 14, 6, 31, 23, 15, 7} (508650704929166878)
     */
    int_lookup01 = vcreate_u8(0x0109111900081018ULL);
    int_lookup23 = vcreate_u8(0x030B131B020A121AULL);
    int_lookup45 = vcreate_u8(0x050D151D040C141CULL);
    int_lookup67 = vcreate_u8(0x070F171F060E161EULL);

    for (number = 0; number < n8points; ++number) {
        /* All lookups read the register copy, so storing back in place is
         * safe. */
        input_table = vld4_u8((uint8_t*)inputPtr);
        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);

        /* Advance to the next group of eight words. */
        inputPtr += 8;
    }

    /* Scalar byteswap for the remaining (num_points % 8) words. */
    for (number = n8points * 8; number < num_points; ++number) {
        uint32_t output = *inputPtr;
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
210 #ifdef LV_HAVE_NEONV8
211 #include <arm_neon.h>
/**
 * Reverse the byte order of every 32-bit word in a buffer, in place
 * (ARMv8 NEON, single-register table lookup).
 *
 * @param intsToSwap  Buffer of 32-bit words; overwritten with the swapped words.
 * @param num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap,
                                            unsigned int num_points)
{
    uint32_t* inputPtr = (uint32_t*)intsToSwap;
    const unsigned int n8points = num_points / 8;
    uint8x16_t input;
    /* Byte indices that reverse each 4-byte group of a 16-byte register. */
    uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };

    unsigned int number = 0;
    for (number = 0; number < n8points; ++number) {
        /* NOTE(review): the cross-reference index of this listing mentions
         * __VOLK_PREFETCH; a prefetch hint may belong here -- confirm against
         * upstream. */

        /* First four words of the group of eight. */
        input = vld1q_u8((uint8_t*)inputPtr);
        input = vqtbl1q_u8(input, idx);
        vst1q_u8((uint8_t*)inputPtr, input);
        inputPtr += 4;

        /* Second four words of the group of eight. */
        input = vld1q_u8((uint8_t*)inputPtr);
        input = vqtbl1q_u8(input, idx);
        vst1q_u8((uint8_t*)inputPtr, input);
        inputPtr += 4;
    }

    /* Scalar byteswap for the remaining (num_points % 8) words. */
    for (number = n8points * 8; number < num_points; ++number) {
        uint32_t output = *inputPtr;

        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr++ = output;
    }
}
246 #ifdef LV_HAVE_GENERIC
/**
 * Reverse the byte order of every 32-bit word in a buffer, in place
 * (portable scalar implementation).
 *
 * @param intsToSwap  Buffer of 32-bit words; overwritten with the swapped words.
 * @param num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
                                             unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;

    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output = *inputPtr;
        /* Mirror the four bytes: 0<->3 and 1<->2. */
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
267 #ifndef INCLUDED_volk_32u_byteswap_a_H
268 #define INCLUDED_volk_32u_byteswap_a_H
270 #include <inttypes.h>
275 #include <immintrin.h>
/**
 * Reverse the byte order of every 32-bit word in a buffer, in place (AVX2,
 * aligned variant).
 *
 * The vector loop uses _mm256_load_si256 / _mm256_store_si256 directly on the
 * buffer, so intsToSwap must be 32-byte aligned.
 *
 * @param intsToSwap  32-byte aligned buffer of 32-bit words; overwritten with
 *                    the swapped words.
 * @param num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 8; /* 32-bit words per 256-bit register */
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = intsToSwap;

    /* Byte-reversal pattern for each 4-byte group. _mm256_shuffle_epi8 works
     * on the two 128-bit lanes independently and only looks at the low four
     * bits of each index, so entries 16..31 select bytes within the upper
     * lane. */
    const uint8_t shuffleVector[32] = { 3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
                                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
                                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };

    /* The local table carries no alignment guarantee, hence the unaligned
     * load here even in the aligned kernel. */
    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    for (number = 0; number < nSets; number++) {
        /* Load, shuffle and store back to the same location (in place). */
        const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        _mm256_store_si256((__m256i*)inputPtr, output);

        /* Advance to the next group of eight words. */
        inputPtr += nPerSet;
    }

    /* Scalar byteswap for the words that do not fill a whole register. */
    for (number = nSets * nPerSet; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
317 #include <emmintrin.h>
/**
 * Reverse the byte order of every 32-bit word in a buffer, in place (SSE2,
 * aligned variant).
 *
 * The vector loop uses _mm_load_si128 / _mm_store_si128 directly on the
 * buffer, so intsToSwap must be 16-byte aligned.
 *
 * @param intsToSwap  16-byte aligned buffer of 32-bit words; overwritten with
 *                    the swapped words.
 * @param num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number = 0;

    uint32_t* inputPtr = intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);

    const uint64_t quarterPoints = num_points / 4;
    for (; number < quarterPoints; number++) {
        /* Load four words; they are written back to the same address. */
        input = _mm_load_si128((__m128i*)inputPtr);

        /* Move each source byte to its mirrored position within the word. */
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);

        /* byte1/byte4 are already clean after a 24-bit shift; the 8-bit
         * shifts carry neighbouring bytes along and must be masked. */
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        _mm_store_si128((__m128i*)inputPtr, output);

        /* Advance to the next group of four words. */
        inputPtr += 4;
    }

    /* Scalar byteswap for the remaining (num_points % 4) words. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
362 #ifdef LV_HAVE_GENERIC
/**
 * Reverse the byte order of every 32-bit word in a buffer, in place
 * (portable scalar implementation, "aligned" protokernel name).
 *
 * The loop performs only scalar word accesses through the pointer.
 *
 * @param intsToSwap  Buffer of 32-bit words; overwritten with the swapped words.
 * @param num_points  Number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
                                               unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;

    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output = *inputPtr;
        /* Mirror the four bytes: 0<->3 and 1<->2. */
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:162
static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:248
static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:364
static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:117
static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:320
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62