66 #ifndef INCLUDED_volk_64u_byteswap_u_H
67 #define INCLUDED_volk_64u_byteswap_u_H
73 #include <emmintrin.h>
/*
 * SSE2 in-place byte reversal of 64-bit words using unaligned loads/stores.
 * NOTE(review): the function signature, the declaration of `number`, any
 * pointer advance inside the vector loop and the closing braces are not
 * visible in this excerpt.
 */
// View the 64-bit buffer as a sequence of 32-bit words.
77 uint32_t* inputPtr = (uint32_t*)intsToSwap;
78 __m128i input, byte1, byte2, byte3, byte4, output;
// Per-32-bit-lane masks that keep exactly one byte after the 8-bit shifts.
79 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
80 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
// One 128-bit vector holds two 64-bit words, so the vector loop runs num_points / 2 times.
82 const unsigned int halfPoints = num_points / 2;
83 for (; number < halfPoints; number++) {
// Unaligned 128-bit load of the current data.
85 input = _mm_loadu_si128((__m128i*)inputPtr);
// Shift each 32-bit lane so every source byte lands in its mirrored position.
88 byte1 = _mm_slli_epi32(input, 24);
89 byte2 = _mm_slli_epi32(input, 8);
90 byte3 = _mm_srli_epi32(input, 8);
91 byte4 = _mm_srli_epi32(input, 24);
// The 24-bit shifts leave a single byte each, so they can be OR-ed unmasked;
// the 8-bit shifts are masked down to one byte before being merged.
93 output = _mm_or_si128(byte1, byte4);
94 byte2 = _mm_and_si128(byte2, byte2mask);
95 output = _mm_or_si128(output, byte2);
96 byte3 = _mm_and_si128(byte3, byte3mask);
97 output = _mm_or_si128(output, byte3);
// Exchange the two 32-bit lanes inside each 64-bit half (lanes 0<->1 and 2<->3).
100 output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
// Unaligned store back over the input (in place).
103 _mm_storeu_si128((__m128i*)inputPtr, output);
// Scalar tail: resume at the first word the vector loop did not cover
// (at most one word, since num_points - 2 * halfPoints is 0 or 1).
108 number = halfPoints * 2;
109 for (; number < num_points; number++) {
// Read the two 32-bit halves of the current 64-bit word.
110 uint32_t output1 = *inputPtr;
111 uint32_t output2 = inputPtr[1];
// Reverse the four bytes of each half.
113 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
114 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
116 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
117 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
// Write the halves back in swapped order and advance by one 64-bit word.
119 *inputPtr++ = output2;
120 *inputPtr++ = output1;
126 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar in-place byte reversal of 64-bit words.
 * NOTE(review): only the tail of the parameter list is visible here; the
 * first signature line, the declaration of `point` and the braces are not
 * part of this excerpt.
 */
129 unsigned int num_points)
// View the 64-bit buffer as a sequence of 32-bit words.
131 uint32_t* inputPtr = (uint32_t*)intsToSwap;
// One iteration per 64-bit word.
133 for (point = 0; point < num_points; point++) {
// Read the two 32-bit halves of the current word.
134 uint32_t output1 = *inputPtr;
135 uint32_t output2 = inputPtr[1];
// Reverse the four bytes of each half.
137 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
138 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
140 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
141 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
// Store the halves in swapped order; together this reverses all eight bytes.
143 *inputPtr++ = output2;
144 *inputPtr++ = output1;
150 #include <immintrin.h>
/*
 * AVX2 in-place byte reversal of 64-bit words using aligned 256-bit
 * loads/stores.
 *
 * intsToSwap: buffer of 64-bit words, overwritten with the result.
 * num_points: number of 64-bit words in the buffer.
 *
 * NOTE(review): _mm256_load_si256/_mm256_store_si256 are the aligned
 * variants, so intsToSwap is presumably required to be 32-byte aligned;
 * confirm against callers. The braces and parts of the scalar tail are not
 * visible in this excerpt.
 */
151 static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap,
unsigned int num_points)
153 unsigned int number = 0;
// Each 256-bit vector holds four 64-bit words.
155 const unsigned int nPerSet = 4;
156 const uint64_t nSets = num_points / nPerSet;
// View the 64-bit buffer as a sequence of 32-bit words.
158 uint32_t* inputPtr = (uint32_t*)intsToSwap;
// Shuffle control: every group of eight consecutive byte indices is listed in
// reverse order. NOTE(review): per Intel's documentation _mm256_shuffle_epi8
// selects within each 128-bit lane using the low 4 bits of the index, so the
// entries 16..31 behave like 0..15 in the upper lane; confirm.
160 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
161 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
162 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
// The table is a plain local array with no alignment attribute, hence the unaligned load.
164 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
166 for (; number < nSets; number++) {
// Aligned load, byte shuffle, aligned store back over the input.
169 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
170 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
173 _mm256_store_si256((__m256i*)inputPtr, output);
// Advance by 2 * nPerSet 32-bit words, i.e. nPerSet 64-bit words.
176 inputPtr += 2 * nPerSet;
// Scalar tail for the words left over after the vector loop (fewer than nPerSet).
181 for (number = nSets * nPerSet; number < num_points; ++number) {
182 uint32_t output1 = *inputPtr;
183 uint32_t output2 = inputPtr[1];
// Byte-reversed values of the two 32-bit halves.
// NOTE(review): the assignment targets of these two expressions and the
// write-back to the buffer are not visible in this excerpt.
185 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
186 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
189 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
190 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
200 #include <tmmintrin.h>
/*
 * SSSE3 in-place byte reversal of 64-bit words using aligned 128-bit
 * loads/stores.
 * NOTE(review): only the tail of the parameter list is visible here; the
 * first signature line, the braces and parts of the scalar tail are not part
 * of this excerpt. _mm_load_si128/_mm_store_si128 are the aligned variants,
 * so the buffer is presumably required to be 16-byte aligned; confirm.
 */
202 unsigned int num_points)
204 unsigned int number = 0;
// Each 128-bit vector holds two 64-bit words.
206 const unsigned int nPerSet = 2;
207 const uint64_t nSets = num_points / nPerSet;
// View the 64-bit buffer as a sequence of 32-bit words.
209 uint32_t* inputPtr = (uint32_t*)intsToSwap;
// Shuffle control for _mm_shuffle_epi8: each group of eight byte indices is reversed.
211 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
// The table is a plain local array with no alignment attribute, hence the unaligned load.
213 const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
215 for (; number < nSets; number++) {
// Aligned load, byte shuffle, aligned store back over the input.
218 const __m128i input = _mm_load_si128((__m128i*)inputPtr);
219 const __m128i output = _mm_shuffle_epi8(input, myShuffle);
222 _mm_store_si128((__m128i*)inputPtr, output);
// Advance by 2 * nPerSet 32-bit words, i.e. nPerSet 64-bit words.
225 inputPtr += 2 * nPerSet;
// Scalar tail for the word left over after the vector loop (at most one).
229 for (number = nSets * nPerSet; number < num_points; ++number) {
230 uint32_t output1 = *inputPtr;
231 uint32_t output2 = inputPtr[1];
// Byte-reversed values of the two 32-bit halves.
// NOTE(review): the assignment targets of these two expressions and the
// write-back to the buffer are not visible in this excerpt.
233 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
234 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
237 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
238 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
246 #ifdef LV_HAVE_NEONV8
247 #include <arm_neon.h>
/*
 * ARMv8 NEON in-place byte reversal of 64-bit words.
 *
 * intsToSwap: buffer of 64-bit words, overwritten with the result.
 * num_points: number of 64-bit words in the buffer.
 *
 * NOTE(review): the declaration of `input`, any pointer advance inside the
 * vector loop and the braces are not visible in this excerpt.
 */
249 static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap,
unsigned int num_points)
// View the 64-bit buffer as a sequence of 32-bit words.
251 uint32_t* inputPtr = (uint32_t*)intsToSwap;
// The vector loop runs once per group of four 64-bit words.
252 const unsigned int n4points = num_points / 4;
// Table-lookup indices: each group of eight bytes in reverse order.
254 uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
256 unsigned int number = 0;
257 for (number = 0; number < n4points; ++number) {
// NOTE(review): vld2q_u8 is a de-interleaving structure load (even bytes go
// to val[0], odd bytes to val[1]) and vst2q_u8 re-interleaves on store. With
// that layout, reversing eight-byte groups of val[0]/val[1] does not look
// equivalent to reversing the bytes of each 64-bit word; verify the output
// against the scalar path.
259 input = vld2q_u8((uint8_t*)inputPtr);
260 input.val[0] = vqtbl1q_u8(input.val[0], idx);
261 input.val[1] = vqtbl1q_u8(input.val[1], idx);
262 vst2q_u8((uint8_t*)inputPtr, input);
// Scalar tail for the words left over after the vector loop (fewer than four).
267 for (number = n4points * 4; number < num_points; ++number) {
// Read the two 32-bit halves of the current 64-bit word.
268 uint32_t output1 = *inputPtr;
269 uint32_t output2 = inputPtr[1];
// Reverse the four bytes of each half.
271 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
272 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
273 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
274 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
// Write the halves back in swapped order and advance by one 64-bit word.
276 *inputPtr++ = output2;
277 *inputPtr++ = output1;
282 #include <arm_neon.h>
/*
 * NEON in-place byte reversal of 64-bit words using a 32-byte table lookup.
 * NOTE(review): the function signature, any pointer advance inside the
 * vector loop and the braces are not visible in this excerpt.
 */
// View the 64-bit buffer as a sequence of 32-bit words.
286 uint32_t* inputPtr = (uint32_t*)intsToSwap;
287 unsigned int number = 0;
// Despite the name, this counts groups of four 64-bit words (num_points / 4),
// matching the four 8-byte stores per iteration below.
288 unsigned int n8points = num_points / 4;
// input_table holds the 32 loaded bytes; the lookup/swapped vectors each
// describe or hold one 64-bit output word.
290 uint8x8x4_t input_table;
291 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
292 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
// Precomputed byte indices into input_table, packed eight per 64-bit constant.
// NOTE(review): presumably chosen to undo the de-interleaving done by vld4_u8
// while reversing each 64-bit word; confirm the derivation of these values.
302 int_lookup01 = vcreate_u8(2269495096316185);
303 int_lookup23 = vcreate_u8(146949840772469531);
304 int_lookup45 = vcreate_u8(291630186448622877);
305 int_lookup67 = vcreate_u8(436310532124776223);
307 for (number = 0; number < n8points; ++number) {
// Load 32 bytes as a four-register table, then gather each output word from it.
308 input_table = vld4_u8((uint8_t*)inputPtr);
309 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
310 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
311 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
312 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
// Store the four 8-byte results back over the input, at offsets of 0, 2, 4
// and 6 32-bit words.
313 vst1_u8((uint8_t*)inputPtr, swapped_int01);
314 vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
315 vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
316 vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
// Scalar tail for the words left over after the vector loop (fewer than four).
321 for (number = n8points * 4; number < num_points; ++number) {
// Read the two 32-bit halves of the current 64-bit word.
322 uint32_t output1 = *inputPtr;
323 uint32_t output2 = inputPtr[1];
// Reverse the four bytes of each half.
325 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
326 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
327 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
328 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
// Write the halves back in swapped order and advance by one 64-bit word.
330 *inputPtr++ = output2;
331 *inputPtr++ = output1;
338 #ifndef INCLUDED_volk_64u_byteswap_a_H
339 #define INCLUDED_volk_64u_byteswap_a_H
341 #include <inttypes.h>
346 #include <emmintrin.h>
/*
 * SSE2 in-place byte reversal of 64-bit words using aligned loads/stores.
 * NOTE(review): the function signature, the declaration of `number`, any
 * pointer advance inside the vector loop and the closing braces are not
 * visible in this excerpt. _mm_load_si128/_mm_store_si128 are the aligned
 * variants, so the buffer is presumably required to be 16-byte aligned;
 * confirm against callers.
 */
// View the 64-bit buffer as a sequence of 32-bit words.
350 uint32_t* inputPtr = (uint32_t*)intsToSwap;
351 __m128i input, byte1, byte2, byte3, byte4, output;
// Per-32-bit-lane masks that keep exactly one byte after the 8-bit shifts.
352 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
353 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
// One 128-bit vector holds two 64-bit words, so the vector loop runs num_points / 2 times.
355 const unsigned int halfPoints = num_points / 2;
356 for (; number < halfPoints; number++) {
// Aligned 128-bit load of the current data.
358 input = _mm_load_si128((__m128i*)inputPtr);
// Shift each 32-bit lane so every source byte lands in its mirrored position.
361 byte1 = _mm_slli_epi32(input, 24);
362 byte2 = _mm_slli_epi32(input, 8);
363 byte3 = _mm_srli_epi32(input, 8);
364 byte4 = _mm_srli_epi32(input, 24);
// The 24-bit shifts leave a single byte each, so they can be OR-ed unmasked;
// the 8-bit shifts are masked down to one byte before being merged.
366 output = _mm_or_si128(byte1, byte4);
367 byte2 = _mm_and_si128(byte2, byte2mask);
368 output = _mm_or_si128(output, byte2);
369 byte3 = _mm_and_si128(byte3, byte3mask);
370 output = _mm_or_si128(output, byte3);
// Exchange the two 32-bit lanes inside each 64-bit half (lanes 0<->1 and 2<->3).
373 output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
// Aligned store back over the input (in place).
376 _mm_store_si128((__m128i*)inputPtr, output);
// Scalar tail: resume at the first word the vector loop did not cover
// (at most one word, since num_points - 2 * halfPoints is 0 or 1).
381 number = halfPoints * 2;
382 for (; number < num_points; number++) {
// Read the two 32-bit halves of the current 64-bit word.
383 uint32_t output1 = *inputPtr;
384 uint32_t output2 = inputPtr[1];
// Reverse the four bytes of each half.
386 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
387 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
389 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
390 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
// Write the halves back in swapped order and advance by one 64-bit word.
392 *inputPtr++ = output2;
393 *inputPtr++ = output1;
399 #include <immintrin.h>
/*
 * AVX2 in-place byte reversal of 64-bit words using unaligned 256-bit
 * loads/stores.
 *
 * intsToSwap: buffer of 64-bit words, overwritten with the result.
 * num_points: number of 64-bit words in the buffer.
 *
 * NOTE(review): the braces and parts of the scalar tail are not visible in
 * this excerpt.
 */
400 static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap,
unsigned int num_points)
402 unsigned int number = 0;
// Each 256-bit vector holds four 64-bit words.
404 const unsigned int nPerSet = 4;
405 const uint64_t nSets = num_points / nPerSet;
// View the 64-bit buffer as a sequence of 32-bit words.
407 uint32_t* inputPtr = (uint32_t*)intsToSwap;
// Shuffle control: every group of eight consecutive byte indices is listed in
// reverse order. NOTE(review): per Intel's documentation _mm256_shuffle_epi8
// selects within each 128-bit lane using the low 4 bits of the index, so the
// entries 16..31 behave like 0..15 in the upper lane; confirm.
409 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
410 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
411 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
413 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
415 for (; number < nSets; number++) {
// Unaligned load, byte shuffle, unaligned store back over the input.
417 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
418 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
421 _mm256_storeu_si256((__m256i*)inputPtr, output);
// Advance by 2 * nPerSet 32-bit words, i.e. nPerSet 64-bit words.
424 inputPtr += 2 * nPerSet;
// Scalar tail for the words left over after the vector loop (fewer than nPerSet).
429 for (number = nSets * nPerSet; number < num_points; ++number) {
430 uint32_t output1 = *inputPtr;
431 uint32_t output2 = inputPtr[1];
// Byte-reversed values of the two 32-bit halves.
// NOTE(review): the assignment targets of these two expressions and the
// write-back to the buffer are not visible in this excerpt.
433 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
434 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
437 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
438 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
448 #include <tmmintrin.h>
/*
 * SSSE3 in-place byte reversal of 64-bit words using unaligned 128-bit
 * loads/stores.
 * NOTE(review): only the tail of the parameter list is visible here; the
 * first signature line, the braces and parts of the scalar tail are not part
 * of this excerpt.
 */
450 unsigned int num_points)
452 unsigned int number = 0;
// Each 128-bit vector holds two 64-bit words.
454 const unsigned int nPerSet = 2;
455 const uint64_t nSets = num_points / nPerSet;
// View the 64-bit buffer as a sequence of 32-bit words.
457 uint32_t* inputPtr = (uint32_t*)intsToSwap;
// Shuffle control for _mm_shuffle_epi8: each group of eight byte indices is reversed.
459 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
461 const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
463 for (; number < nSets; number++) {
// Unaligned load, byte shuffle, unaligned store back over the input.
465 const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
466 const __m128i output = _mm_shuffle_epi8(input, myShuffle);
469 _mm_storeu_si128((__m128i*)inputPtr, output);
// Advance by 2 * nPerSet 32-bit words, i.e. nPerSet 64-bit words.
472 inputPtr += 2 * nPerSet;
// Scalar tail for the word left over after the vector loop (at most one).
476 for (number = nSets * nPerSet; number < num_points; ++number) {
477 uint32_t output1 = *inputPtr;
478 uint32_t output2 = inputPtr[1];
// Byte-reversed values of the two 32-bit halves.
// NOTE(review): the assignment targets of these two expressions and the
// write-back to the buffer are not visible in this excerpt.
480 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
481 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
484 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
485 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
492 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar in-place byte reversal of 64-bit words.
 * NOTE(review): only the tail of the parameter list is visible here; the
 * first signature line, the declaration of `point` and the braces are not
 * part of this excerpt.
 */
495 unsigned int num_points)
// View the 64-bit buffer as a sequence of 32-bit words.
497 uint32_t* inputPtr = (uint32_t*)intsToSwap;
// One iteration per 64-bit word.
499 for (point = 0; point < num_points; point++) {
// Read the two 32-bit halves of the current word.
500 uint32_t output1 = *inputPtr;
501 uint32_t output2 = inputPtr[1];
// Reverse the four bytes of each half.
503 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
504 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
506 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
507 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
// Store the halves in swapped order; together this reverses all eight bytes.
509 *inputPtr++ = output2;
510 *inputPtr++ = output1;