Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_s32f_multiply_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
69 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
70 #define INCLUDED_volk_32f_s32f_multiply_32f_u_H
71 
72 #include <inttypes.h>
73 #include <stdio.h>
74 
75 #ifdef LV_HAVE_SSE
76 #include <xmmintrin.h>
77 
/*!
 * \brief Multiplies each element of a float vector by a scalar (SSE, unaligned).
 *
 * Computes cVector[i] = aVector[i] * scalar for every i in [0, num_points).
 * Whole groups of four floats go through SSE registers using unaligned
 * loads and stores; the remaining 0-3 elements are handled one at a time.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Broadcast the scalar into all four lanes once, outside the loop.
    const __m128 bVal = _mm_set_ps1(scalar);

    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);
        const __m128 cVal = _mm_mul_ps(aVal, bVal);

        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    // Scalar tail: elements that do not fill a complete 4-float vector.
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
107 #endif /* LV_HAVE_SSE */
108 
109 #ifdef LV_HAVE_AVX
110 #include <immintrin.h>
111 
112 static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
113  const float* aVector,
114  const float scalar,
115  unsigned int num_points)
116 {
117  unsigned int number = 0;
118  const unsigned int eighthPoints = num_points / 8;
119 
120  float* cPtr = cVector;
121  const float* aPtr = aVector;
122 
123  __m256 aVal, bVal, cVal;
124  bVal = _mm256_set1_ps(scalar);
125  for (; number < eighthPoints; number++) {
126 
127  aVal = _mm256_loadu_ps(aPtr);
128 
129  cVal = _mm256_mul_ps(aVal, bVal);
130 
131  _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
132 
133  aPtr += 8;
134  cPtr += 8;
135  }
136 
137  number = eighthPoints * 8;
138  for (; number < num_points; number++) {
139  *cPtr++ = (*aPtr++) * scalar;
140  }
141 }
142 #endif /* LV_HAVE_AVX */
143 
144 #ifdef LV_HAVE_GENERIC
145 
/*!
 * \brief Multiplies each element of a float vector by a scalar (portable C).
 *
 * Computes cVector[i] = aVector[i] * scalar for every i in [0, num_points),
 * in increasing index order. Does nothing when num_points is 0.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
                                                      const float* aVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        cVector[number] = aVector[number] * scalar;
    }
}
160 #endif /* LV_HAVE_GENERIC */
161 
162 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
163 
164 
165 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
166 #define INCLUDED_volk_32f_s32f_multiply_32f_a_H
167 
168 #include <inttypes.h>
169 #include <stdio.h>
170 
171 #ifdef LV_HAVE_SSE
172 #include <xmmintrin.h>
173 
/*!
 * \brief Multiplies each element of a float vector by a scalar (SSE, aligned).
 *
 * Computes cVector[i] = aVector[i] * scalar for every i in [0, num_points).
 * Whole groups of four floats go through SSE registers using
 * _mm_load_ps / _mm_store_ps, so both buffers must be 16-byte aligned.
 * The remaining 0-3 elements are handled one at a time.
 *
 * \param cVector    Output buffer (16-byte aligned); num_points floats are written.
 * \param aVector    Input buffer (16-byte aligned); num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Broadcast the scalar into all four lanes once, outside the loop.
    const __m128 bVal = _mm_set_ps1(scalar);

    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 cVal = _mm_mul_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    // Scalar tail: elements that do not fill a complete 4-float vector.
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
203 #endif /* LV_HAVE_SSE */
204 
205 #ifdef LV_HAVE_AVX
206 #include <immintrin.h>
207 
208 static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
209  const float* aVector,
210  const float scalar,
211  unsigned int num_points)
212 {
213  unsigned int number = 0;
214  const unsigned int eighthPoints = num_points / 8;
215 
216  float* cPtr = cVector;
217  const float* aPtr = aVector;
218 
219  __m256 aVal, bVal, cVal;
220  bVal = _mm256_set1_ps(scalar);
221  for (; number < eighthPoints; number++) {
222  aVal = _mm256_load_ps(aPtr);
223 
224  cVal = _mm256_mul_ps(aVal, bVal);
225 
226  _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
227 
228  aPtr += 8;
229  cPtr += 8;
230  }
231 
232  number = eighthPoints * 8;
233  for (; number < num_points; number++) {
234  *cPtr++ = (*aPtr++) * scalar;
235  }
236 }
237 #endif /* LV_HAVE_AVX */
238 
239 #ifdef LV_HAVE_NEON
240 #include <arm_neon.h>
241 
/*!
 * \brief Multiplies each element of a float vector by a scalar (ARM NEON).
 *
 * Computes cVector[i] = aVector[i] * scalar for every i in [0, num_points).
 * Whole groups of four floats are processed with vld1q_f32 / vmulq_n_f32 /
 * vst1q_f32; the remaining 0-3 elements are handled one at a time.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    const float* inputPtr = aVector;
    float* outputPtr = cVector;

    for (number = 0; number < quarterPoints; number++) {
        const float32x4_t aVal = vld1q_f32(inputPtr);       // Load into NEON regs
        const float32x4_t cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
        vst1q_f32(outputPtr, cVal); // Store results back to output
        inputPtr += 4;
        outputPtr += 4;
    }

    // Scalar tail: elements that do not fill a complete 4-float vector.
    for (number = quarterPoints * 4; number < num_points; number++) {
        *outputPtr++ = (*inputPtr++) * scalar;
    }
}
265 #endif /* LV_HAVE_NEON */
266 
267 
268 #ifdef LV_HAVE_GENERIC
269 
/*!
 * \brief Multiplies each element of a float vector by a scalar (portable C).
 *
 * Computes cVector[i] = aVector[i] * scalar for every i in [0, num_points),
 * in increasing index order. The loop uses plain scalar accesses, so the
 * code itself imposes no alignment requirement on either buffer.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
                                                        const float* aVector,
                                                        const float scalar,
                                                        unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        cVector[number] = aVector[number] * scalar;
    }
}
284 #endif /* LV_HAVE_GENERIC */
285 
286 
287 #ifdef LV_HAVE_ORC
288 
/*
 * Declared here, defined in another translation unit.
 * NOTE(review): presumably generated from an ORC program - confirm against
 * the build system.
 */
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
                                                  const float* src,
                                                  const float scalar,
                                                  unsigned int num_points);

/*!
 * \brief Multiplies a float vector by a scalar by delegating to the ORC kernel.
 *
 * Forwards all four arguments unchanged to
 * volk_32f_s32f_multiply_32f_a_orc_impl(); no work is done in this wrapper.
 *
 * \param cVector    Output buffer, passed through as dst.
 * \param aVector    Input buffer, passed through as src.
 * \param scalar     Factor passed through to the implementation.
 * \param num_points Element count passed through to the implementation.
 */
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}
301 
302 #endif /* LV_HAVE_ORC */
303 
304 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
static void volk_32f_s32f_multiply_32f_a_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:208
static void volk_32f_s32f_multiply_32f_a_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:270
static void volk_32f_s32f_multiply_32f_u_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:78
static void volk_32f_s32f_multiply_32f_u_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:112
static void volk_32f_s32f_multiply_32f_a_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:174
static void volk_32f_s32f_multiply_32f_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:146
static void volk_32f_s32f_multiply_32f_u_neon(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:242