55 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
56 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
62 #include <immintrin.h>
67 unsigned int num_points)
/*
 * 256-bit SIMD kernel body: multiplies the elements read through aPtr by the
 * floats read through bPtr and writes the products through cPtr, eight
 * bPtr floats per vector iteration, then finishes the remainder one element
 * at a time.
 *
 * NOTE(review): the head of the signature, the declarations of aPtr/cPtr,
 * the pointer advances inside the loop and the closing braces are not
 * visible in this excerpt - confirm against the full file.
 */
69 unsigned int number = 0;
/* Number of full 8-element vector iterations. */
70 const unsigned int eighthPoints = num_points / 8;
74 const float* bPtr = bVector;
76 __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
/*
 * _mm256_set_epi32 lists elements from highest to lowest, so the mask is
 * {0,0,1,1} for the low 128-bit lane and {2,2,3,3} for the high lane.
 */
78 __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
80 for (; number < eighthPoints; number++) {
/*
 * Aligned 256-bit loads (the address must be 32-byte aligned), 8 floats each.
 * NOTE(review): both loads read from aPtr; no advance between them is
 * visible here - presumably it sits on a line missing from this excerpt.
 */
82 aVal1 = _mm256_load_ps((
float*)aPtr);
85 aVal2 = _mm256_load_ps((
float*)aPtr);
/* Eight multiplier floats b0..b7. */
88 bVal = _mm256_load_ps(bPtr);
/* Broadcast the low half (b0..b3) / high half (b4..b7) into both lanes. */
91 bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00);
92 bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11);
/*
 * Duplicate every float in place:
 *   bVal1 = b0 b0 b1 b1 b2 b2 b3 b3
 *   bVal2 = b4 b4 b5 b5 b6 b6 b7 b7
 * so each multiplier lines up with two adjacent floats of the a-side data.
 */
94 bVal1 = _mm256_permutevar_ps(bVal1, permute_mask);
95 bVal2 = _mm256_permutevar_ps(bVal2, permute_mask);
/* Element-wise products. */
97 cVal1 = _mm256_mul_ps(aVal1, bVal1);
98 cVal2 = _mm256_mul_ps(aVal2, bVal2);
/*
 * Aligned 256-bit stores through cPtr.
 * NOTE(review): the value argument of each store and any cPtr advance are
 * not visible in this excerpt.
 */
100 _mm256_store_ps((
float*)cPtr,
104 _mm256_store_ps((
float*)cPtr,
/* Resume at the first element the vector loop did not cover. */
109 number = eighthPoints * 8;
110 for (; number < num_points; ++number) {
/* Scalar tail: one product per iteration, all three pointers advance. */
111 *cPtr++ = (*aPtr++) * (*bPtr++);
118 #include <xmmintrin.h>
122 const float* bVector,
123 unsigned int num_points)
/*
 * 128-bit SIMD kernel body: multiplies the elements read through aPtr by the
 * floats read through bPtr and writes the products through cPtr, four bPtr
 * floats per vector iteration, followed by a scalar loop for the remainder.
 *
 * NOTE(review): the head of the signature, the declarations of aPtr/cPtr,
 * the pointer advances inside the loop and the closing braces are not
 * visible in this excerpt - confirm against the full file.
 */
125 unsigned int number = 0;
/* Number of full 4-element vector iterations. */
126 const unsigned int quarterPoints = num_points / 4;
130 const float* bPtr = bVector;
132 __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
133 for (; number < quarterPoints; number++) {
/*
 * Aligned 128-bit loads (the address must be 16-byte aligned), 4 floats each.
 * NOTE(review): both loads read from aPtr; no advance between them is
 * visible here - presumably it sits on a line missing from this excerpt.
 */
135 aVal1 = _mm_load_ps((
const float*)aPtr);
138 aVal2 = _mm_load_ps((
const float*)aPtr);
/* Four multiplier floats b0..b3. */
141 bVal = _mm_load_ps(bPtr);
/*
 * Duplicate each multiplier so it lines up with two adjacent floats of the
 * a-side data:
 *   bVal1 = b0 b0 b1 b1
 *   bVal2 = b2 b2 b3 b3
 */
144 bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
145 bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
/* First half of the products; cVal is reused for the second half below. */
147 cVal = _mm_mul_ps(aVal1, bVal1);
149 _mm_store_ps((
float*)cPtr, cVal);
/* Second half of the products. */
152 cVal = _mm_mul_ps(aVal2, bVal2);
/* NOTE(review): no cPtr advance between the two stores is visible here. */
154 _mm_store_ps((
float*)cPtr, cVal);
/* Resume at the first element the vector loop did not cover. */
159 number = quarterPoints * 4;
160 for (; number < num_points; number++) {
/*
 * Scalar tail: one product per iteration.
 * NOTE(review): bPtr is read but not advanced on this line; whether a
 * following statement advances it is not visible in this excerpt - confirm.
 */
161 *cPtr++ = (*aPtr++) * (*bPtr);
168 #ifdef LV_HAVE_GENERIC
172 const float* bVector,
173 unsigned int num_points)
/*
 * Portable scalar kernel body: for each of num_points elements, multiplies
 * the element read through aPtr by the float read through bPtr and writes
 * the product through cPtr.
 *
 * NOTE(review): the head of the signature and the declarations of aPtr/cPtr
 * are not visible in this excerpt.
 */
177 const float* bPtr = bVector;
178 unsigned int number = 0;
180 for (number = 0; number < num_points; number++) {
/* One product per iteration; all three pointers advance by one element. */
181 *cPtr++ = (*aPtr++) * (*bPtr++);
188 #include <arm_neon.h>
192 const float* bVector,
193 unsigned int num_points)
/*
 * NEON kernel body: processes four bPtr floats per vector iteration and
 * finishes the remainder with a scalar loop.
 *
 * NOTE(review): the head of the signature, the declarations of aPtr/cPtr,
 * the pointer advances inside the vector loop and the closing braces are
 * not visible in this excerpt - confirm against the full file.
 */
197 const float* bPtr = bVector;
198 unsigned int number = 0;
/* Number of full 4-element vector iterations. */
199 unsigned int quarter_points = num_points / 4;
201 float32x4x2_t inputVector, outputVector;
202 float32x4_t tapsVector;
203 for (number = 0; number < quarter_points; number++) {
/*
 * De-interleaving load of 8 floats: even-indexed floats land in val[0],
 * odd-indexed floats in val[1].
 */
204 inputVector = vld2q_f32((
float*)aPtr);
/* Four multiplier floats. */
205 tapsVector = vld1q_f32(bPtr);
/* Both de-interleaved halves are scaled by the same four multipliers. */
207 outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
208 outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
/* Re-interleave val[0]/val[1] while storing 8 floats through cPtr. */
210 vst2q_f32((
float*)cPtr, outputVector);
/* Scalar tail for the elements the vector loop did not cover. */
216 for (number = quarter_points * 4; number < num_points; number++) {
217 *cPtr++ = (*aPtr++) * (*bPtr++);
/*
 * Declaration of an implementation defined outside this file; it returns
 * nothing and writes its results through cVector.
 *
 * NOTE(review): only three parameters are visible here, while the call in
 * this file passes four (cVector, aVector, bVector, num_points) - a
 * parameter line appears to be missing from this excerpt; confirm.
 */
225 extern void volk_32fc_32f_multiply_32fc_a_orc_impl(
lv_32fc_t* cVector,
227 const float* bVector,
228 unsigned int num_points);
/*
 * Thin wrapper: forwards its arguments unchanged to
 * volk_32fc_32f_multiply_32fc_a_orc_impl and returns nothing.
 *
 * NOTE(review): this wrapper is named "_u_orc" while the callee is
 * "_a_orc_impl" - confirm the naming is intentional. The aVector parameter
 * used in the call and the function's braces are not visible in this
 * excerpt.
 */
230 static inline void volk_32fc_32f_multiply_32fc_u_orc(
lv_32fc_t* cVector,
232 const float* bVector,
233 unsigned int num_points)
235 volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);