From da845ddd9c35a1e1fcff03ea342636ae4bb8018b Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Mon, 6 Feb 2023 17:46:32 -0800 Subject: Use an interpolated FIR filter for cubic resampling Similar to how the bsinc filters work, but optimized for 4-point filtering. At least the SSE version is notably faster than calculating the coefficients in real time. --- core/mixer/mixer_neon.cpp | 52 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) (limited to 'core/mixer/mixer_neon.cpp') diff --git a/core/mixer/mixer_neon.cpp b/core/mixer/mixer_neon.cpp index 5a91da60..e636ac74 100644 --- a/core/mixer/mixer_neon.cpp +++ b/core/mixer/mixer_neon.cpp @@ -7,11 +7,13 @@ #include "alnumeric.h" #include "core/bsinc_defs.h" +#include "core/cubic_defs.h" #include "defs.h" #include "hrtfbase.h" struct NEONTag; struct LerpTag; +struct CubicTag; struct BSincTag; struct FastBSincTag; @@ -22,6 +24,14 @@ struct FastBSincTag; namespace { +constexpr uint BSincPhaseBitDiff{MixerFracBits - BSincPhaseBits}; +constexpr uint BSincPhaseDiffOne{1 << BSincPhaseBitDiff}; +constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u}; + +constexpr uint CubicPhaseBitDiff{MixerFracBits - CubicPhaseBits}; +constexpr uint CubicPhaseDiffOne{1 << CubicPhaseBitDiff}; +constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u}; + inline float32x4_t set_f4(float l0, float l1, float l2, float l3) { float32x4_t ret{vmovq_n_f32(l0)}; @@ -31,9 +41,6 @@ inline float32x4_t set_f4(float l0, float l1, float l2, float l3) return ret; } -constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits}; -constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff}; - inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs, const float left, const float right) { @@ -183,6 +190,37 @@ float *Resample_(const InterpState*, float *RESTRICT src, uint return dst.data(); } +template<> +float *Resample_(const InterpState *state, float *RESTRICT src, uint frac, + uint increment, const al::span dst) +{ + const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(state->cubic.filter); + + src -= 1; + for(float &out_sample : dst) + { + const uint pi{frac >> CubicPhaseBitDiff}; + const float pf{static_cast(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)}; + const float32x4_t pf4{vdupq_n_f32(pf)}; + + /* Apply the phase interpolated filter. */ + + /* f = fil + pf*phd */ + const float32x4_t f4 = vmlaq_f32(vld1q_f32(filter[pi].mCoeffs), pf4, + vld1q_f32(filter[pi].mDeltas)); + /* r = f*src */ + float32x4_t r4{vmulq_f32(f4, vld1q_f32(src))}; + + r4 = vaddq_f32(r4, vrev64q_f32(r4)); + out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0); + + frac += increment; + src += frac>>MixerFracBits; + frac &= MixerFracMask; + } + return dst.data(); +} + template<> float *Resample_(const InterpState *state, float *RESTRICT src, uint frac, uint increment, const al::span dst) @@ -196,8 +234,8 @@ float *Resample_(const InterpState *state, float *RESTRICT src for(float &out_sample : dst) { // Calculate the phase index and factor. - const uint pi{frac >> FracPhaseBitDiff}; - const float pf{static_cast(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; + const uint pi{frac >> BSincPhaseBitDiff}; + const float pf{static_cast(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)}; // Apply the scale and phase interpolated filter. float32x4_t r4{vdupq_n_f32(0.0f)}; @@ -242,8 +280,8 @@ float *Resample_(const InterpState *state, float *RESTRICT for(float &out_sample : dst) { // Calculate the phase index and factor. - const uint pi{frac >> FracPhaseBitDiff}; - const float pf{static_cast(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; + const uint pi{frac >> BSincPhaseBitDiff}; + const float pf{static_cast(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)}; // Apply the phase interpolated filter. float32x4_t r4{vdupq_n_f32(0.0f)}; -- cgit v1.2.3