Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_64f_add_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
73 #ifndef INCLUDED_volk_32f_64f_add_64f_H
74 #define INCLUDED_volk_32f_64f_add_64f_H
75 
76 #include <inttypes.h>
77 
78 #ifdef LV_HAVE_GENERIC
79 
/*
 * Portable reference kernel.
 *
 * For every index i in [0, num_points) the float aVector[i] is widened to
 * double, added to bVector[i], and the sum is written to cVector[i].
 *
 * cVector    - output buffer, num_points doubles
 * aVector    - first input, num_points floats
 * bVector    - second input, num_points doubles
 * num_points - number of elements to process; 0 leaves cVector untouched
 */
static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        const double widened = (double)aVector[idx];
        cVector[idx] = widened + bVector[idx];
    }
}
94 
95 #endif /* LV_HAVE_GENERIC */
96 
97 #ifdef LV_HAVE_NEONV8
98 #include <arm_neon.h>
99 
/*
 * NEON kernel: cVector[i] = (double)aVector[i] + bVector[i] for every i in
 * [0, num_points).
 *
 * The vector loop handles two elements per pass (a float32x2_t widened to a
 * float64x2_t); when num_points is odd the last element is processed by the
 * scalar loop at the end.
 *
 * cVector    - output buffer, num_points doubles
 * aVector    - first input, num_points floats
 * bVector    - second input, num_points doubles
 * num_points - number of elements to process
 */
static inline void volk_32f_64f_add_64f_neon(double* cVector,
                                             const float* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    const unsigned int pair_count = num_points / 2;
    const float* in_a = aVector;
    const double* in_b = bVector;
    double* out = cVector;
    unsigned int i;

    for (i = 0; i < pair_count; ++i) {
        /* Fetch two floats from A and two doubles from B. */
        const float32x2_t a_f32 = vld1_f32(in_a);
        const float64x2_t b_f64 = vld1q_f64(in_b);

        /* Pass the addresses of the next pair to the project prefetch macro. */
        __VOLK_PREFETCH(in_a + 2);
        __VOLK_PREFETCH(in_b + 2);

        /* Widen A to double precision, add B, and write both sums out. */
        vst1q_f64(out, vaddq_f64(vcvt_f64_f32(a_f32), b_f64));

        /* Two elements were consumed from each stream. */
        in_a += 2;
        in_b += 2;
        out += 2;
    }

    /* Scalar tail: picks up the one leftover element when num_points is odd. */
    for (i = pair_count * 2; i < num_points; ++i) {
        *out++ = ((double)(*in_a++)) + (*in_b++);
    }
}
138 
139 #endif /* LV_HAVE_NEONV8 */
140 
141 #ifdef LV_HAVE_AVX
142 
143 #include <immintrin.h>
144 #include <xmmintrin.h>
145 
146 static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
147  const float* aVector,
148  const double* bVector,
149  unsigned int num_points)
150 {
151  unsigned int number = 0;
152  const unsigned int eighth_points = num_points / 8;
153 
154  double* cPtr = cVector;
155  const float* aPtr = aVector;
156  const double* bPtr = bVector;
157 
158  __m256 aVal;
159  __m128 aVal1, aVal2;
160  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
161  for (; number < eighth_points; number++) {
162 
163  aVal = _mm256_loadu_ps(aPtr);
164  bVal1 = _mm256_loadu_pd(bPtr);
165  bVal2 = _mm256_loadu_pd(bPtr + 4);
166 
167  aVal1 = _mm256_extractf128_ps(aVal, 0);
168  aVal2 = _mm256_extractf128_ps(aVal, 1);
169 
170  aDbl1 = _mm256_cvtps_pd(aVal1);
171  aDbl2 = _mm256_cvtps_pd(aVal2);
172 
173  cVal1 = _mm256_add_pd(aDbl1, bVal1);
174  cVal2 = _mm256_add_pd(aDbl2, bVal2);
175 
176  _mm256_storeu_pd(cPtr,
177  cVal1); // Store the results back into the C container
178  _mm256_storeu_pd(cPtr + 4,
179  cVal2); // Store the results back into the C container
180 
181  aPtr += 8;
182  bPtr += 8;
183  cPtr += 8;
184  }
185 
186  number = eighth_points * 8;
187  for (; number < num_points; number++) {
188  *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
189  }
190 }
191 
192 #endif /* LV_HAVE_AVX */
193 
194 #ifdef LV_HAVE_AVX
195 
196 #include <immintrin.h>
197 #include <xmmintrin.h>
198 
199 static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
200  const float* aVector,
201  const double* bVector,
202  unsigned int num_points)
203 {
204  unsigned int number = 0;
205  const unsigned int eighth_points = num_points / 8;
206 
207  double* cPtr = cVector;
208  const float* aPtr = aVector;
209  const double* bPtr = bVector;
210 
211  __m256 aVal;
212  __m128 aVal1, aVal2;
213  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
214  for (; number < eighth_points; number++) {
215 
216  aVal = _mm256_load_ps(aPtr);
217  bVal1 = _mm256_load_pd(bPtr);
218  bVal2 = _mm256_load_pd(bPtr + 4);
219 
220  aVal1 = _mm256_extractf128_ps(aVal, 0);
221  aVal2 = _mm256_extractf128_ps(aVal, 1);
222 
223  aDbl1 = _mm256_cvtps_pd(aVal1);
224  aDbl2 = _mm256_cvtps_pd(aVal2);
225 
226  cVal1 = _mm256_add_pd(aDbl1, bVal1);
227  cVal2 = _mm256_add_pd(aDbl2, bVal2);
228 
229  _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
230  _mm256_store_pd(cPtr + 4,
231  cVal2); // Store the results back into the C container
232 
233  aPtr += 8;
234  bPtr += 8;
235  cPtr += 8;
236  }
237 
238  number = eighth_points * 8;
239  for (; number < num_points; number++) {
240  *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
241  }
242 }
243 
244 #endif /* LV_HAVE_AVX */
245 
246 #endif /* INCLUDED_volk_32f_64f_add_64f_H */
volk_32f_64f_add_64f_generic
static void volk_32f_64f_add_64f_generic(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:80
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
volk_32f_64f_add_64f_a_avx
static void volk_32f_64f_add_64f_a_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:199
volk_32f_64f_add_64f_u_avx
static void volk_32f_64f_add_64f_u_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:146