Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
86 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
87 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
88 
89 #include <float.h>
90 #include <inttypes.h>
91 #include <stdio.h>
92 #include <volk/volk_complex.h>
93 
94 
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable kernel: cVector[i] = aVector[i] + conj(bVector[i]) * scalar.
 *
 * \param cVector    Output buffer; num_points elements are written.
 * \param aVector    First input buffer (the addend).
 * \param bVector    Second input buffer; each element is conjugated and then
 *                   multiplied by \p scalar.
 * \param scalar     Complex factor applied to every conjugated bVector element.
 * \param num_points Number of complex elements to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // Manually unrolled main loop: eight elements per iteration.
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        number -= 8;
    }

    // Handle the remaining (fewer than eight) elements one at a time.
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
128 
129 
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief AVX kernel (unaligned loads/stores):
 *        cVector[i] = aVector[i] + conj(bVector[i]) * scalar.
 *
 * Four complex elements are processed per SIMD iteration; the remaining
 * (num_points % 4) elements are handled by a scalar tail loop.
 *
 * \param cVector    Output buffer; num_points elements are written.
 * \param aVector    First input buffer (the addend).
 * \param bVector    Second input buffer; each element is conjugated and then
 *                   multiplied by \p scalar.
 * \param scalar     Complex factor applied to every conjugated bVector element.
 * \param num_points Number of complex elements to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    // Number of elements left over after the 4-wide SIMD loop (0..3).
    const unsigned int leftover = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((const float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((const float*)b);
        y = _mm256_loadu_ps((const float*)a);
        // z = scalar * conj(b); without this step z would be read uninitialized.
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the elements that did not fill a whole SIMD register.
    for (i = num_points - leftover; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
173 
174 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief SSE3 kernel (unaligned loads/stores):
 *        cVector[i] = aVector[i] + conj(bVector[i]) * scalar.
 *
 * Two complex elements are processed per SIMD iteration; an odd final element
 * is handled with scalar arithmetic.
 *
 * \param cVector    Output buffer; num_points elements are written.
 * \param aVector    First input buffer (the addend).
 * \param bVector    Second input buffer; each element is conjugated and then
 *                   multiplied by \p scalar.
 * \param scalar     Complex factor applied to every conjugated bVector element.
 * \param num_points Number of complex elements to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((const float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((const float*)b);
        y = _mm_loadu_ps((const float*)a);
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // One element remains when num_points is odd.
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
216 
217 
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief AVX kernel (aligned loads/stores):
 *        cVector[i] = aVector[i] + conj(bVector[i]) * scalar.
 *
 * Four complex elements are processed per SIMD iteration; the remaining
 * (num_points % 4) elements are handled by a scalar tail loop. The SIMD loop
 * accesses aVector, bVector and cVector with the aligned _mm256_load_ps /
 * _mm256_store_ps intrinsics, so all three buffers must be 32-byte aligned.
 *
 * \param cVector    Output buffer; num_points elements are written.
 * \param aVector    First input buffer (the addend).
 * \param bVector    Second input buffer; each element is conjugated and then
 *                   multiplied by \p scalar.
 * \param scalar     Complex factor applied to every conjugated bVector element.
 * \param num_points Number of complex elements to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    // Number of elements left over after the 4-wide SIMD loop (0..3).
    const unsigned int leftover = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector (local array, so an unaligned load is used)
    s = _mm256_loadu_ps((const float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((const float*)b);
        y = _mm256_load_ps((const float*)a);
        // z = scalar * conj(b); without this step z would be read uninitialized.
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the elements that did not fill a whole SIMD register.
    for (i = num_points - leftover; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
261 
262 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief SSE3 kernel (aligned loads/stores):
 *        cVector[i] = aVector[i] + conj(bVector[i]) * scalar.
 *
 * Two complex elements are processed per SIMD iteration; an odd final element
 * is handled with scalar arithmetic. The SIMD loop accesses aVector, bVector
 * and cVector with the aligned _mm_load_ps / _mm_store_ps intrinsics, so all
 * three buffers must be 16-byte aligned.
 *
 * \param cVector    Output buffer; num_points elements are written.
 * \param aVector    First input buffer (the addend).
 * \param bVector    Second input buffer; each element is conjugated and then
 *                   multiplied by \p scalar.
 * \param scalar     Complex factor applied to every conjugated bVector element.
 * \param num_points Number of complex elements to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector (local array, so an unaligned load is used)
    s = _mm_loadu_ps((const float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((const float*)b);
        y = _mm_load_ps((const float*)a);
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // One element remains when num_points is odd.
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
304 
305 
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON kernel: cVector[i] = aVector[i] + conj(bVector[i]) * scalar.
 *
 * Four complex elements are processed per SIMD iteration using de-interleaved
 * loads (val[0] holds real parts, val[1] holds imaginary parts); the remaining
 * (num_points % 4) elements are handled by a scalar tail loop.
 *
 * \param cVector    Output buffer; num_points elements are written.
 * \param aVector    First input buffer (the addend).
 * \param bVector    Second input buffer; each element is conjugated and then
 *                   multiplied by \p scalar.
 * \param scalar     Complex factor applied to every conjugated bVector element.
 * \param num_points Number of complex elements to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    const lv_32fc_t scalar,
                                                    unsigned int num_points)
{
    const lv_32fc_t* bPtr = bVector;
    const lv_32fc_t* aPtr = aVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number;
    const unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, b_val, c_val, scalar_val;
    float32x4x2_t tmp_val;

    // Broadcast the scalar's real part into val[0] and imaginary part into val[1].
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((const float*)aPtr);
        b_val = vld2q_f32((const float*)bPtr);
        // Conjugate b by negating its imaginary lane.
        b_val.val[1] = vnegq_f32(b_val.val[1]);
        __VOLK_PREFETCH(aPtr + 8);
        __VOLK_PREFETCH(bPtr + 8);

        // Complex product conj(b) * scalar:
        //   re = b.re * s.re - b.im * s.im
        //   im = b.im * s.re + b.re * s.im
        // (b.im here is the already-negated imaginary part).
        tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
        tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);

        tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
        tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);

        c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
        c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);

        vst2q_f32((float*)cPtr, c_val);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // Scalar tail for the elements that did not fill a whole SIMD iteration.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_NEON */
356 
357 #endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H */
volk_sse3_intrinsics.h
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:223
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:180
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:98
lv_conj
#define lv_conj(x)
Definition: volk_complex.h:96
i
for i
Definition: volk_config_fixed.tmpl.h:25
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:135
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:70
volk_complex.h
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:268
_mm256_complexconjugatemul_ps
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:51
volk_avx_intrinsics.h
_mm_complexconjugatemul_ps
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:44
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:310