Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_exp_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /* SIMD (SSE4) implementation of exp
24  Inspired by Intel Approximate Math library, and based on the
25  corresponding algorithms of the cephes math library
26 */
27 
28 /* Copyright (C) 2007 Julien Pommier
29 
30  This software is provided 'as-is', without any express or implied
31  warranty. In no event will the authors be held liable for any damages
32  arising from the use of this software.
33 
34  Permission is granted to anyone to use this software for any purpose,
35  including commercial applications, and to alter it and redistribute it
36  freely, subject to the following restrictions:
37 
38  1. The origin of this software must not be misrepresented; you must not
39  claim that you wrote the original software. If you use this software
40  in a product, an acknowledgment in the product documentation would be
41  appreciated but is not required.
42  2. Altered source versions must be plainly marked as such, and must not be
43  misrepresented as being the original software.
44  3. This notice may not be removed or altered from any source distribution.
45 
46  (this is the zlib license)
47 */
48 
95 #include <inttypes.h>
96 #include <math.h>
97 #include <stdio.h>
98 
99 #ifndef INCLUDED_volk_32f_exp_32f_a_H
100 #define INCLUDED_volk_32f_exp_32f_a_H
101 
102 #ifdef LV_HAVE_SSE2
103 #include <emmintrin.h>
104 
105 static inline void
106 volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
107 {
108  float* bPtr = bVector;
109  const float* aPtr = aVector;
110 
111  unsigned int number = 0;
112  unsigned int quarterPoints = num_points / 4;
113 
114  // Declare variables and constants
115  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
116  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
117  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
118  __m128i emm0, pi32_0x7f;
119 
120  one = _mm_set1_ps(1.0);
121  exp_hi = _mm_set1_ps(88.3762626647949);
122  exp_lo = _mm_set1_ps(-88.3762626647949);
123  log2EF = _mm_set1_ps(1.44269504088896341);
124  half = _mm_set1_ps(0.5);
125  exp_C1 = _mm_set1_ps(0.693359375);
126  exp_C2 = _mm_set1_ps(-2.12194440e-4);
127  pi32_0x7f = _mm_set1_epi32(0x7f);
128 
129  exp_p0 = _mm_set1_ps(1.9875691500e-4);
130  exp_p1 = _mm_set1_ps(1.3981999507e-3);
131  exp_p2 = _mm_set1_ps(8.3334519073e-3);
132  exp_p3 = _mm_set1_ps(4.1665795894e-2);
133  exp_p4 = _mm_set1_ps(1.6666665459e-1);
134  exp_p5 = _mm_set1_ps(5.0000001201e-1);
135 
136  for (; number < quarterPoints; number++) {
137  aVal = _mm_load_ps(aPtr);
138  tmp = _mm_setzero_ps();
139 
140  aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
141 
142  /* express exp(x) as exp(g + n*log(2)) */
143  fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
144 
145  emm0 = _mm_cvttps_epi32(fx);
146  tmp = _mm_cvtepi32_ps(emm0);
147 
148  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
149  fx = _mm_sub_ps(tmp, mask);
150 
151  tmp = _mm_mul_ps(fx, exp_C1);
152  z = _mm_mul_ps(fx, exp_C2);
153  aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
154  z = _mm_mul_ps(aVal, aVal);
155 
156  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
157  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
158  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
159  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
160  y = _mm_add_ps(y, one);
161 
162  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
163 
164  pow2n = _mm_castsi128_ps(emm0);
165  bVal = _mm_mul_ps(y, pow2n);
166 
167  _mm_store_ps(bPtr, bVal);
168  aPtr += 4;
169  bPtr += 4;
170  }
171 
172  number = quarterPoints * 4;
173  for (; number < num_points; number++) {
174  *bPtr++ = expf(*aPtr++);
175  }
176 }
177 
178 #endif /* LV_HAVE_SSE2 for aligned */
179 
180 
181 #ifdef LV_HAVE_GENERIC
182 
183 static inline void
184 volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points)
185 {
186  float* bPtr = bVector;
187  const float* aPtr = aVector;
188  unsigned int number = 0;
189 
190  for (number = 0; number < num_points; number++) {
191  *bPtr++ = expf(*aPtr++);
192  }
193 }
194 
195 #endif /* LV_HAVE_GENERIC */
196 
197 #endif /* INCLUDED_volk_32f_exp_32f_a_H */
198 
199 #ifndef INCLUDED_volk_32f_exp_32f_u_H
200 #define INCLUDED_volk_32f_exp_32f_u_H
201 
202 #ifdef LV_HAVE_SSE2
203 #include <emmintrin.h>
204 
205 static inline void
206 volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
207 {
208  float* bPtr = bVector;
209  const float* aPtr = aVector;
210 
211  unsigned int number = 0;
212  unsigned int quarterPoints = num_points / 4;
213 
214  // Declare variables and constants
215  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
216  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
217  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
218  __m128i emm0, pi32_0x7f;
219 
220  one = _mm_set1_ps(1.0);
221  exp_hi = _mm_set1_ps(88.3762626647949);
222  exp_lo = _mm_set1_ps(-88.3762626647949);
223  log2EF = _mm_set1_ps(1.44269504088896341);
224  half = _mm_set1_ps(0.5);
225  exp_C1 = _mm_set1_ps(0.693359375);
226  exp_C2 = _mm_set1_ps(-2.12194440e-4);
227  pi32_0x7f = _mm_set1_epi32(0x7f);
228 
229  exp_p0 = _mm_set1_ps(1.9875691500e-4);
230  exp_p1 = _mm_set1_ps(1.3981999507e-3);
231  exp_p2 = _mm_set1_ps(8.3334519073e-3);
232  exp_p3 = _mm_set1_ps(4.1665795894e-2);
233  exp_p4 = _mm_set1_ps(1.6666665459e-1);
234  exp_p5 = _mm_set1_ps(5.0000001201e-1);
235 
236 
237  for (; number < quarterPoints; number++) {
238  aVal = _mm_loadu_ps(aPtr);
239  tmp = _mm_setzero_ps();
240 
241  aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
242 
243  /* express exp(x) as exp(g + n*log(2)) */
244  fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
245 
246  emm0 = _mm_cvttps_epi32(fx);
247  tmp = _mm_cvtepi32_ps(emm0);
248 
249  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
250  fx = _mm_sub_ps(tmp, mask);
251 
252  tmp = _mm_mul_ps(fx, exp_C1);
253  z = _mm_mul_ps(fx, exp_C2);
254  aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
255  z = _mm_mul_ps(aVal, aVal);
256 
257  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
258  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
259  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
260  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
261  y = _mm_add_ps(y, one);
262 
263  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
264 
265  pow2n = _mm_castsi128_ps(emm0);
266  bVal = _mm_mul_ps(y, pow2n);
267 
268  _mm_storeu_ps(bPtr, bVal);
269  aPtr += 4;
270  bPtr += 4;
271  }
272 
273  number = quarterPoints * 4;
274  for (; number < num_points; number++) {
275  *bPtr++ = expf(*aPtr++);
276  }
277 }
278 
279 #endif /* LV_HAVE_SSE2 for unaligned */
280 
281 
282 #ifdef LV_HAVE_GENERIC
283 
284 static inline void
285 volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
286 {
287  float* bPtr = bVector;
288  const float* aPtr = aVector;
289  unsigned int number = 0;
290 
291  for (number = 0; number < num_points; number++) {
292  *bPtr++ = expf(*aPtr++);
293  }
294 }
295 
296 #endif /* LV_HAVE_GENERIC */
297 
298 #endif /* INCLUDED_volk_32f_exp_32f_u_H */
volk_32f_exp_32f_u_generic
static void volk_32f_exp_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:285
volk_32f_exp_32f_a_generic
static void volk_32f_exp_32f_a_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:184
volk_32f_exp_32f_a_sse2
static void volk_32f_exp_32f_a_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:106
volk_32f_exp_32f_u_sse2
static void volk_32f_exp_32f_u_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:206