diff options
author | Chris Robinson <chris.kcat@gmail.com> | 2023-10-06 18:56:47 -0700 |
---|---|---|
committer | Chris Robinson <chris.kcat@gmail.com> | 2023-10-06 18:56:47 -0700 |
commit | 1614fccd9fd893e104dcca2c92b83b2a7bfaa0c7 (patch) | |
tree | f71fb574d72630315cdbb1056ae742ae219ab8b5 /common/phase_shifter.h | |
parent | 3ccde151c609ba8fed59f07277ca5c719b2b92fc (diff) |
Improve NEON shuffling
Diffstat (limited to 'common/phase_shifter.h')
-rw-r--r-- | common/phase_shifter.h | 24 |
1 file changed, 3 insertions, 21 deletions
```diff
diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index 6b0ad512..b9c889c2 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -75,25 +75,6 @@ struct PhaseShifterT {
 
 private:
 #if defined(HAVE_NEON)
-    /* There doesn't seem to be NEON intrinsics to do this kind of stipple
-     * shuffling, so there's two custom methods for it.
-     */
-    static auto shuffle_2020(float32x4_t a, float32x4_t b)
-    {
-        float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
-        ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
-        ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
-        ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
-        return ret;
-    }
-    static auto shuffle_3131(float32x4_t a, float32x4_t b)
-    {
-        float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
-        ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
-        ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
-        ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
-        return ret;
-    }
     static auto unpacklo(float32x4_t a, float32x4_t b)
     {
         float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
@@ -174,9 +155,10 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT
                 const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
                 const float32x4_t s0{vld1q_f32(&src[j*2])};
                 const float32x4_t s1{vld1q_f32(&src[j*2 + 4])};
+                const float32x4x2_t values{vuzpq_f32(s0, s1)};
 
-                r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs);
-                r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs);
+                r04 = vmlaq_f32(r04, values.val[0], coeffs);
+                r14 = vmlaq_f32(r14, values.val[1], coeffs);
             }
             src += 2;
 
```