23 #ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
24 #define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
31 #include <immintrin.h>
/*
 * AVX2 kernel, aligned variant: multiplies each 8-bit complex point of
 * aVector by the complex conjugate of the matching point of bVector and
 * writes the result as a 16-bit complex point into cVector.
 *
 * cVector    - output buffer; written with an aligned 256-bit store, so the
 *              store address must be 32-byte aligned.
 * num_points - number of complex points to process.
 * aVector / bVector are read with aligned 128-bit loads (16-byte alignment).
 *
 * NOTE(review): the declarations of aVector/bVector and of the working
 * pointers a, b, c are not among the lines visible here - confirm against
 * the full signature.
 */
40 static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(
lv_16sc_t* cVector,
43 unsigned int num_points)
45 unsigned int number = 0;
// Despite the name this is num_points / 8: each vector iteration consumes
// 16 bytes per input, i.e. 8 complex points (see the "* 8" before the tail).
46 const unsigned int quarterPoints = num_points / 8;
48 __m256i x, y, realz, imagz;
// _mm256_set_epi16 lists elements from the highest lane down, so even lanes
// (real parts) get +1 and odd lanes (imaginary parts) get -1.
52 __m256i conjugateSign =
53 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
55 for (; number < quarterPoints; number++) {
// Load 8 complex points from each input and sign-extend int8 -> int16.
57 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
58 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
// madd multiplies adjacent int16 pairs and adds them into one int32:
// ar*br + ai*bi, the real part of a * conj(b).
61 realz = _mm256_madd_epi16(x, y);
// Negate the imaginary lanes of b: (br, bi) -> (br, -bi).
64 y = _mm256_sign_epi16(y, conjugateSign);
// Swap the two int16 lanes of every pair: (br, -bi) -> (-bi, br).
67 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
68 _MM_SHUFFLE(2, 3, 0, 1));
// ar*(-bi) + ai*br, the imaginary part of a * conj(b).
71 imagz = _mm256_madd_epi16(x, y);
// Interleave the 32-bit real/imag results back into (re, im) order and
// narrow them to int16 with signed saturation before the aligned store.
75 _mm256_store_si256((__m256i*)c,
76 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
77 _mm256_unpackhi_epi32(realz, imagz)));
// NOTE(review): no advance of a, b, c is visible in this loop body - confirm
// the pointers are stepped by 8 points per iteration.
// Scalar tail: handles the num_points % 8 points the vector loop skipped.
84 number = quarterPoints * 8;
85 int16_t* c16Ptr = (int16_t*)&cVector[number];
86 int8_t* a8Ptr = (int8_t*)&aVector[number];
87 int8_t* b8Ptr = (int8_t*)&bVector[number];
88 for (; number < num_points; number++) {
// Components are read as interleaved (real, imag) int8 values and widened
// to float; the remainder of the tail arithmetic is not visible here.
89 float aReal = (float)*a8Ptr++;
90 float aImag = (float)*a8Ptr++;
92 float bReal = (float)*b8Ptr++;
93 float bImag = (float)*b8Ptr++;
104 #ifdef LV_HAVE_SSE4_1
105 #include <smmintrin.h>
/*
 * SSE4.1 kernel, aligned variant: multiplies each 8-bit complex point of
 * aVector by the complex conjugate of the matching point of bVector and
 * writes the result as a 16-bit complex point into cVector.
 *
 * cVector    - output buffer; written with an aligned 128-bit store, so the
 *              store address must be 16-byte aligned.
 * num_points - number of complex points to process.
 *
 * NOTE(review): the declarations of aVector/bVector and of the working
 * pointers a, b, c are not among the lines visible here - confirm against
 * the full signature.
 */
114 static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(
lv_16sc_t* cVector,
117 unsigned int num_points)
119 unsigned int number = 0;
// Four complex points (8 input bytes per vector) are handled per iteration.
120 const unsigned int quarterPoints = num_points / 4;
122 __m128i x, y, realz, imagz;
// _mm_set_epi16 lists elements from the highest lane down, so even lanes
// (real parts) get +1 and odd lanes (imaginary parts) get -1.
126 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
128 for (; number < quarterPoints; number++) {
// Load the low 64 bits (4 complex points) and sign-extend int8 -> int16.
130 x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
131 y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
// madd multiplies adjacent int16 pairs and adds them into one int32:
// ar*br + ai*bi, the real part of a * conj(b).
134 realz = _mm_madd_epi16(x, y);
// Negate the imaginary lanes of b: (br, bi) -> (br, -bi).
137 y = _mm_sign_epi16(y, conjugateSign);
// Swap the two int16 lanes of every pair: (br, -bi) -> (-bi, br).
140 y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
141 _MM_SHUFFLE(2, 3, 0, 1));
// ar*(-bi) + ai*br, the imaginary part of a * conj(b).
144 imagz = _mm_madd_epi16(x, y);
// Interleave the 32-bit real/imag results back into (re, im) order and
// narrow them to int16 with signed saturation before the aligned store.
146 _mm_store_si128((__m128i*)c,
147 _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
148 _mm_unpackhi_epi32(realz, imagz)));
// NOTE(review): no advance of a, b, c is visible in this loop body - confirm
// the pointers are stepped by 4 points per iteration.
// Scalar tail: handles the num_points % 4 points the vector loop skipped.
155 number = quarterPoints * 4;
156 int16_t* c16Ptr = (int16_t*)&cVector[number];
157 int8_t* a8Ptr = (int8_t*)&aVector[number];
158 int8_t* b8Ptr = (int8_t*)&bVector[number];
159 for (; number < num_points; number++) {
// Components are read as interleaved (real, imag) int8 values and widened
// to float.
160 float aReal = (float)*a8Ptr++;
161 float aImag = (float)*a8Ptr++;
163 float bReal = (float)*b8Ptr++;
164 float bImag = (float)*b8Ptr++;
// Store the real then imaginary part of temp, each converted to int16_t by
// a plain cast.
// NOTE(review): the definition of temp is not visible here; presumably it is
// a * conj(b) built from the four floats above - confirm. The cast, unlike
// the vector path's pack, is not a saturating conversion.
168 *c16Ptr++ = (int16_t)
lv_creal(temp);
169 *c16Ptr++ = (int16_t)
lv_cimag(temp);
174 #ifdef LV_HAVE_GENERIC
/*
 * Scalar path (under the LV_HAVE_GENERIC guard): walks all num_points complex
 * points one at a time, reading interleaved int8 (real, imag) components from
 * aVector and bVector and writing interleaved int16 (real, imag) components
 * to cVector.
 *
 * NOTE(review): the start of this signature (function name and the pointer
 * parameters) is not among the lines visible here - confirm against the full
 * file.
 */
186 unsigned int num_points)
188 unsigned int number = 0;
// View the complex buffers as flat arrays of their scalar components.
189 int16_t* c16Ptr = (int16_t*)cVector;
190 int8_t* a8Ptr = (int8_t*)aVector;
191 int8_t* b8Ptr = (int8_t*)bVector;
192 for (number = 0; number < num_points; number++) {
// Widen each int8 component to float before the complex arithmetic.
193 float aReal = (float)*a8Ptr++;
194 float aImag = (float)*a8Ptr++;
196 float bReal = (float)*b8Ptr++;
197 float bImag = (float)*b8Ptr++;
// Store the real then imaginary part of temp, each converted to int16_t by
// a plain cast.
// NOTE(review): the definition of temp is not visible here; presumably it is
// a * conj(b) built from the four floats above - confirm.
201 *c16Ptr++ = (int16_t)
lv_creal(temp);
202 *c16Ptr++ = (int16_t)
lv_cimag(temp);
209 #ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
210 #define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
212 #include <inttypes.h>
217 #include <immintrin.h>
/*
 * AVX2 kernel, unaligned variant: multiplies each 8-bit complex point of
 * aVector by the complex conjugate of the matching point of bVector and
 * writes the result as a 16-bit complex point into cVector. Uses unaligned
 * loads and stores, so no buffer alignment is required by the intrinsics.
 *
 * cVector    - output buffer.
 * num_points - number of complex points to process.
 *
 * NOTE(review): the declarations of aVector/bVector and of the working
 * pointers a, b, c are not among the lines visible here - confirm against
 * the full signature.
 */
226 static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(
lv_16sc_t* cVector,
229 unsigned int num_points)
231 unsigned int number = 0;
// Eight complex points (16 input bytes per vector) are handled per iteration.
232 const unsigned int oneEigthPoints = num_points / 8;
234 __m256i x, y, realz, imagz;
// _mm256_set_epi16 lists elements from the highest lane down, so even lanes
// (real parts) get +1 and odd lanes (imaginary parts) get -1.
238 __m256i conjugateSign =
239 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
241 for (; number < oneEigthPoints; number++) {
// Load 8 complex points from each input and sign-extend int8 -> int16.
243 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
244 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
// madd multiplies adjacent int16 pairs and adds them into one int32:
// ar*br + ai*bi, the real part of a * conj(b).
247 realz = _mm256_madd_epi16(x, y);
// Negate the imaginary lanes of b: (br, bi) -> (br, -bi).
250 y = _mm256_sign_epi16(y, conjugateSign);
// Swap the two int16 lanes of every pair: (br, -bi) -> (-bi, br).
253 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
254 _MM_SHUFFLE(2, 3, 0, 1));
// ar*(-bi) + ai*br, the imaginary part of a * conj(b).
257 imagz = _mm256_madd_epi16(x, y);
// Interleave the 32-bit real/imag results back into (re, im) order and
// narrow them to int16 with signed saturation before the unaligned store.
261 _mm256_storeu_si256((__m256i*)c,
262 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
263 _mm256_unpackhi_epi32(realz, imagz)));
// NOTE(review): no advance of a, b, c is visible in this loop body - confirm
// the pointers are stepped by 8 points per iteration.
// Scalar tail: handles the num_points % 8 points the vector loop skipped.
270 number = oneEigthPoints * 8;
271 int16_t* c16Ptr = (int16_t*)&cVector[number];
272 int8_t* a8Ptr = (int8_t*)&aVector[number];
273 int8_t* b8Ptr = (int8_t*)&bVector[number];
274 for (; number < num_points; number++) {
// Components are read as interleaved (real, imag) int8 values and widened
// to float.
275 float aReal = (float)*a8Ptr++;
276 float aImag = (float)*a8Ptr++;
278 float bReal = (float)*b8Ptr++;
279 float bImag = (float)*b8Ptr++;
// Store the real then imaginary part of temp, each converted to int16_t by
// a plain cast.
// NOTE(review): the definition of temp is not visible here; presumably it is
// a * conj(b) built from the four floats above - confirm. The cast, unlike
// the vector path's pack, is not a saturating conversion.
283 *c16Ptr++ = (int16_t)
lv_creal(temp);
284 *c16Ptr++ = (int16_t)
lv_cimag(temp);