#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H

#include <inttypes.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
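
/*
 * Note on the SIMD kernels below: each input sample a = ar + j*ai is
 * multiplied by the scalar s = sr + j*si as
 *
 *     a * s = (ar*sr - ai*si) + j*(ai*sr + ar*si).
 *
 * The x86 kernels broadcast sr into yl and si into yh, form ar*sr, ai*sr, ...
 * and (after a shuffle) ai*si, ar*si, ..., then use addsub/fmaddsub so the
 * even (real) lanes subtract and the odd (imaginary) lanes add in one step.
 */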

static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load as ar,ai,br,bi,...

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si, ...

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // ar*sr-ai*si, ai*sr+ar*si, ...

        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load as ar,ai,br,bi,...

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*sr, ai*sr, br*sr, bi*sr, ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si, ...

        z = _mm256_addsub_ps(tmp1, tmp2); // ar*sr-ai*si, ai*sr+ar*si, ...

        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)a); // Load as ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*sr, ai*sr, br*sr, bi*sr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si

        z = _mm_addsub_ps(tmp1, tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si

        _mm_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t scalar,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;

    // unwrap loop
    while (number >= 8) {
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        number -= 8;
    }

    // clean up any remaining
    while (number-- > 0) {
        *cPtr++ = *aPtr++ * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */

#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H

#include <inttypes.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // Load as ar,ai,br,bi,...

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si, ...

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // ar*sr-ai*si, ai*sr+ar*si, ...

        _mm256_store_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // Load as ar,ai,br,bi,...

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*sr, ai*sr, br*sr, bi*sr, ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si, ...

        z = _mm256_addsub_ps(tmp1, tmp2); // ar*sr-ai*si, ai*sr+ar*si, ...

        _mm256_store_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)a); // Load as ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*sr, ai*sr, br*sr, bi*sr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si

        z = _mm_addsub_ps(tmp1, tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si

        _mm_store_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, scalar_val;
    float32x4x2_t tmp_imag;

    // Broadcast the real and imaginary parts of the scalar
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)aPtr); // de-interleave into real/imag lanes

        // real*real and imag*real products
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);

        // add/subtract the cross terms to finish the complex multiply
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);

        vst2q_f32((float*)cPtr, tmp_imag); // re-interleave and store
        aPtr += 4;
        cPtr += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = *aPtr++ * scalar;
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;

    // unwrap loop
    while (number >= 8) {
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        number -= 8;
    }

    // clean up any remaining
    while (number-- > 0) {
        *cPtr++ = *aPtr++ * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
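
/*
 * Minimal usage sketch (illustrative, not part of the kernel): the runtime
 * dispatcher volk_32fc_s32fc_multiply_32fc generated for this kernel picks
 * the best implementation above. Buffer size and scalar value below are
 * arbitrary example choices.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   lv_32fc_t* in  = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 *   lv_32fc_t scale = lv_cmake(0.5f, -1.0f);
 *
 *   // ... fill in[] ...
 *   volk_32fc_s32fc_multiply_32fc(out, in, scale, N);
 *
 *   volk_free(in);
 *   volk_free(out);
 */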