commit-gnuradio
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commit-gnuradio] [gnuradio] 02/02: volk: adding arm_neon.h headers to remaining neon proto-kernels.


From: git
Subject: [Commit-gnuradio] [gnuradio] 02/02: volk: adding arm_neon.h headers to remaining neon proto-kernels.
Date: Wed, 30 Jul 2014 23:17:15 +0000 (UTC)

This is an automated email from the git hooks/post-receive script.

trondeau pushed a commit to branch master
in repository gnuradio.

commit 265e729189ad55edf4f5447cb42589e980b92efa
Author: Tom Rondeau <address@hidden>
Date:   Wed Jul 30 18:14:12 2014 -0400

    volk: adding arm_neon.h headers to remaining neon proto-kernels.
---
 volk/kernels/volk/volk_32f_sqrt_32f.h           |  2 ++
 volk/kernels/volk/volk_32f_x2_dot_prod_32f.h    |  2 ++
 volk/kernels/volk/volk_32f_x2_min_32f.h         |  2 ++
 volk/kernels/volk/volk_32f_x2_multiply_32f.h    |  4 +++-
 volk/kernels/volk/volk_32f_x2_subtract_32f.h    |  2 ++
 volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h |  1 +
 volk/kernels/volk/volk_32fc_magnitude_32f.h     | 16 +++++++++-------
 volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h  | 19 ++++++++++---------
 volk/kernels/volk/volk_32fc_x2_multiply_32fc.h  | 12 +++++++-----
 volk/kernels/volk/volk_8i_convert_16i.h         |  4 +++-
 10 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/volk/kernels/volk/volk_32f_sqrt_32f.h b/volk/kernels/volk/volk_32f_sqrt_32f.h
index 2523abf..f8f8cbd 100644
--- a/volk/kernels/volk/volk_32f_sqrt_32f.h
+++ b/volk/kernels/volk/volk_32f_sqrt_32f.h
@@ -41,6 +41,8 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, 
const float* aVector,
 #endif /* LV_HAVE_SSE */
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
 /*!
   \brief Sqrts the two input vectors and store their results in the third 
vector
   \param cVector The vector where the results will be stored
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
index e8fa8b5..ac6f569 100644
--- a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -578,6 +578,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_avx( float* 
result, const  float*
 #endif /*LV_HAVE_AVX*/
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
 static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const 
float * input, const float * taps, unsigned int num_points) {
 
     unsigned int quarter_points = num_points / 16;
diff --git a/volk/kernels/volk/volk_32f_x2_min_32f.h b/volk/kernels/volk/volk_32f_x2_min_32f.h
index eef5e5d..f7598d6 100644
--- a/volk/kernels/volk/volk_32f_x2_min_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_min_32f.h
@@ -46,6 +46,8 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, 
const float* aVecto
 #endif /* LV_HAVE_SSE */
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
 /*!
   \brief Selects minimum value from each entry between bVector and aVector and 
store their results in the cVector
   \param cVector The vector where the results will be stored
diff --git a/volk/kernels/volk/volk_32f_x2_multiply_32f.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
index 8bbd81c..00b3185 100644
--- a/volk/kernels/volk/volk_32f_x2_multiply_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
@@ -189,6 +189,8 @@ static inline void volk_32f_x2_multiply_32f_a_avx(float* 
cVector, const float* a
 #endif /* LV_HAVE_AVX */
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
 /*!
   \brief Multiplys the two input vectors and store their results in the third 
vector
   \param cVector The vector where the results will be stored
@@ -212,7 +214,7 @@ static inline void volk_32f_x2_multiply_32f_neon(float* 
cVector, const float* aV
     for(number=quarter_points*4; number < num_points; ++number) {
         *cVector++ = *aVector++ * *bVector++;
     }
-} 
+}
 #endif /* LV_HAVE_NEON */
 
 #ifdef LV_HAVE_GENERIC
diff --git a/volk/kernels/volk/volk_32f_x2_subtract_32f.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
index 6831d89..c725ef8 100644
--- a/volk/kernels/volk/volk_32f_x2_subtract_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
@@ -64,6 +64,8 @@ static inline void volk_32f_x2_subtract_32f_generic(float* 
cVector, const float*
 #endif /* LV_HAVE_GENERIC */
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
 /*!
   \brief Subtracts bVector form aVector and store their results in the cVector
   \param cVector The vector where the results will be stored
diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
index d566231..0d3c216 100644
--- a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -294,6 +294,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* 
target, float* src0,
 #endif // LV_HAVE_AVX
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
 
 static inline void volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict 
target, float* __restrict src0, float* __restrict center_point_array, float* 
__restrict cutoff, unsigned int num_points) {
 
diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h
index cf3e849..b6da7f3 100644
--- a/volk/kernels/volk/volk_32fc_magnitude_32f.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h
@@ -234,6 +234,8 @@ static inline void volk_32fc_magnitude_32f_a_generic(float* 
magnitudeVector, con
 #endif /* LV_HAVE_GENERIC */
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
   /*!
     \brief Calculates the magnitude of the complexVector and stores the 
results in the magnitudeVector
     \param complexVector The vector containing the complex input values
@@ -255,7 +257,7 @@ static inline void volk_32fc_magnitude_32f_neon(float* 
magnitudeVector, const lv
     magnitude_vec = vrsqrteq_f32(magnitude_vec);
     magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt
         vst1q_f32(magnitudeVectorPtr, magnitude_vec);
-        
+
         complexVectorPtr += 8;
         magnitudeVectorPtr += 4;
     }
@@ -271,8 +273,8 @@ static inline void volk_32fc_magnitude_32f_neon(float* 
magnitudeVector, const lv
 #ifdef LV_HAVE_NEON
   /*!
     \brief Calculates the magnitude of the complexVector and stores the 
results in the magnitudeVector
-    
-    This is an approximation from "Streamlining Digital Signal Processing" by 
+
+    This is an approximation from "Streamlining Digital Signal Processing" by
     Richard Lyons. Apparently max error is about 1% and mean error is about 
0.6%.
     The basic idea is to do a weighted sum of the abs. value of imag and real 
parts
     where weight A is always assigned to max(imag, real) and B is always 
min(imag,real).
@@ -291,7 +293,7 @@ static inline void 
volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVect
 
     const float threshold = 0.4142135;
 
-    float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; 
+    float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
     a_high = vdupq_n_f32( 0.84 );
     b_high = vdupq_n_f32( 0.561);
     a_low  = vdupq_n_f32( 0.99 );
@@ -304,7 +306,7 @@ static inline void 
volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVect
     float32x4_t real_abs, imag_abs;
     for(number = 0; number < quarter_points; number++){
         complex_vec = vld2q_f32(complexVectorPtr);
-        
+
         real_abs = vabsq_f32(complex_vec.val[0]);
         imag_abs = vabsq_f32(complex_vec.val[1]);
 
@@ -318,14 +320,14 @@ static inline void 
volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVect
         // and 0s or 1s with coefficients from previous effective branch
         a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, 
(int32x4_t)a_high), vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
         b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, 
(int32x4_t)b_high), vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
-        
+
         // coefficients chosen, do the weighted sum
         min_vec = vmulq_f32(min_vec, b_vec);
         max_vec = vmulq_f32(max_vec, a_vec);
 
         magnitude_vec = vaddq_f32(min_vec, max_vec);
         vst1q_f32(magnitudeVectorPtr, magnitude_vec);
-        
+
         complexVectorPtr += 8;
         magnitudeVectorPtr += 4;
     }
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 5301c35..430b747 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -761,12 +761,13 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
 #endif /*LV_HAVE_SSE4_1*/
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
 
 static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const 
lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
 
     unsigned int quarter_points = num_points / 4;
     unsigned int number;
-    
+
     lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
     lv_32fc_t* b_ptr = (lv_32fc_t*) input;
     // for 2-lane vectors, 1st lane holds the real part,
@@ -775,7 +776,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
     float32x4x2_t tmp_real, tmp_imag;
     accumulator.val[0] = vdupq_n_f32(0);
     accumulator.val[1] = vdupq_n_f32(0);
-    
+
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
@@ -796,7 +797,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
 
         c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
         c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
-        
+
         accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
         accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);
 
@@ -821,7 +822,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
 
     unsigned int quarter_points = num_points / 4;
     unsigned int number;
-    
+
     lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
     lv_32fc_t* b_ptr = (lv_32fc_t*) input;
     // for 2-lane vectors, 1st lane holds the real part,
@@ -830,7 +831,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
     float32x4x2_t tmp_imag;
     accumulator.val[0] = vdupq_n_f32(0);
     accumulator.val[1] = vdupq_n_f32(0);
-    
+
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
@@ -869,7 +870,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
 
     unsigned int quarter_points = num_points / 4;
     unsigned int number;
-    
+
     lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
     lv_32fc_t* b_ptr = (lv_32fc_t*) input;
     // for 2-lane vectors, 1st lane holds the real part,
@@ -879,7 +880,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
     accumulator1.val[1] = vdupq_n_f32(0);
     accumulator2.val[0] = vdupq_n_f32(0);
     accumulator2.val[1] = vdupq_n_f32(0);
-    
+
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
@@ -915,7 +916,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
 
     unsigned int quarter_points = num_points / 8;
     unsigned int number;
-    
+
     lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
     lv_32fc_t* b_ptr = (lv_32fc_t*) input;
     // for 2-lane vectors, 1st lane holds the real part,
@@ -930,7 +931,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
     accumulator2.val[1] = vdupq_n_f32(0);
     accumulator2.val[2] = vdupq_n_f32(0);
     accumulator2.val[3] = vdupq_n_f32(0);
-    
+
     // 8 input regs, 8 accumulators -> 16/16 neon regs are used
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
index 8d2d48b..7c723bc 100644
--- a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -150,6 +150,8 @@ static inline void 
volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons
 #endif /* LV_HAVE_GENERIC */
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
   /*!
     \brief Multiplies the two input complex vectors and stores their results 
in the third vector
     \param cVector The vector where the results will be stored
@@ -174,15 +176,15 @@ static inline void 
volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_
 
         // multiply the real*real and imag*imag to get real result
         // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
-        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); 
+        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
         // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
-        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); 
+        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
 
         // Multiply cross terms to get the imaginary result
         // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
-        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); 
+        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
         // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
-        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); 
+        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
 
         // store the results
         c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
@@ -225,7 +227,7 @@ static inline void 
volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
         __builtin_prefetch(b_ptr+4);
 
         // do the first multiply
-        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); 
+        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
         tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
 
         // use multiply accumulate/subtract to get result
diff --git a/volk/kernels/volk/volk_8i_convert_16i.h b/volk/kernels/volk/volk_8i_convert_16i.h
index 3b89a3f..9776dfd 100644
--- a/volk/kernels/volk/volk_8i_convert_16i.h
+++ b/volk/kernels/volk/volk_8i_convert_16i.h
@@ -139,6 +139,8 @@ static inline void volk_8i_convert_16i_a_generic(int16_t* 
outputVector, const in
 #endif /* LV_HAVE_GENERIC */
 
 #ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
   /*!
     \brief Converts the input 8 bit integer data into 16 bit integer data
     \param inputVector The 8 bit input data buffer
@@ -155,7 +157,7 @@ static inline void volk_8i_convert_16i_neon(int16_t* 
outputVector, const int8_t*
     int8x8_t input_vec ;
     int16x8_t converted_vec;
 
-    // NEON doesn't have a concept of 8 bit registers, so we are really 
+    // NEON doesn't have a concept of 8 bit registers, so we are really
     // dealing with the low half of 16-bit registers. Since this requires
     // a move instruction we likely do better with ASM here.
     for(number = 0; number < eighth_points; ++number) {



reply via email to

[Prev in Thread] Current Thread [Next in Thread]