From da845ddd9c35a1e1fcff03ea342636ae4bb8018b Mon Sep 17 00:00:00 2001
From: Chris Robinson
Date: Mon, 6 Feb 2023 17:46:32 -0800
Subject: Use an interpolated FIR filter for cubic resampling

Similar to how the bsinc filters work, but optimized for 4-point filtering. At
least the SSE version is notably faster than calculating the coefficients in
real time.
---
Reviewer notes (not part of the original commit; ignored by git-am):

  Resample_<CubicTag,SSETag> produces one output sample per element of `dst`:
    * `src` is moved back by one sample up front, so each output is computed
      from the four consecutive floats starting one sample before the current
      position (read with an unaligned load).
    * `frac` is split into a table index `pi` (the bits above
      CubicPhaseBitDiff) and a factor `pf` in [0, 1) built from the remaining
      low bits.
    * The four taps are `filter[pi].mCoeffs + pf*filter[pi].mDeltas`. Both
      arrays are read with aligned loads, so they must be 16-byte aligned.
    * The four products are summed horizontally (add the lane-reversed vector,
      then add the upper pair onto the lower pair) and lane 0 is stored.
    * `frac` then advances by `increment`; its whole part moves `src` forward
      and only the fractional bits are kept.
  The function returns `dst.data()`.

 core/mixer/mixer_sse.cpp | 51 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 6 deletions(-)

(limited to 'core/mixer/mixer_sse.cpp')

diff --git a/core/mixer/mixer_sse.cpp b/core/mixer/mixer_sse.cpp
index 1b0d1386..4a31a0f1 100644
--- a/core/mixer/mixer_sse.cpp
+++ b/core/mixer/mixer_sse.cpp
@@ -7,10 +7,12 @@
 
 #include "alnumeric.h"
 #include "core/bsinc_defs.h"
+#include "core/cubic_defs.h"
 #include "defs.h"
 #include "hrtfbase.h"
 
 struct SSETag;
+struct CubicTag;
 struct BSincTag;
 struct FastBSincTag;
 
@@ -21,8 +23,13 @@ struct FastBSincTag;
 
 namespace {
 
-constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits};
-constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff};
+constexpr uint BSincPhaseBitDiff{MixerFracBits - BSincPhaseBits};
+constexpr uint BSincPhaseDiffOne{1 << BSincPhaseBitDiff};
+constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
+
+constexpr uint CubicPhaseBitDiff{MixerFracBits - CubicPhaseBits};
+constexpr uint CubicPhaseDiffOne{1 << CubicPhaseBitDiff};
+constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
 
 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
 
@@ -146,6 +153,38 @@ force_inline void MixLine(const al::span<const float> InSamples, float *RESTRICT
 
 } // namespace
 
+template<>
+float *Resample_<CubicTag,SSETag>(const InterpState *state, float *RESTRICT src, uint frac,
+    uint increment, const al::span<float> dst)
+{
+    const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(state->cubic.filter);
+
+    src -= 1;
+    for(float &out_sample : dst)
+    {
+        const uint pi{frac >> CubicPhaseBitDiff};
+        const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
+        const __m128 pf4{_mm_set1_ps(pf)};
+
+        /* Apply the phase interpolated filter. */
+
+        /* f = fil + pf*phd */
+        const __m128 f4 = MLA4(_mm_load_ps(filter[pi].mCoeffs), pf4,
+            _mm_load_ps(filter[pi].mDeltas));
+        /* r = f*src */
+        __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(src))};
+
+        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
+        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+        out_sample = _mm_cvtss_f32(r4);
+
+        frac += increment;
+        src += frac>>MixerFracBits;
+        frac &= MixerFracMask;
+    }
+    return dst.data();
+}
+
 template<>
 float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src, uint frac,
     uint increment, const al::span<float> dst)
@@ -159,8 +198,8 @@ float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src,
     for(float &out_sample : dst)
     {
         // Calculate the phase index and factor.
-        const uint pi{frac >> FracPhaseBitDiff};
-        const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};
+        const uint pi{frac >> BSincPhaseBitDiff};
+        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
 
         // Apply the scale and phase interpolated filter.
         __m128 r4{_mm_setzero_ps()};
@@ -206,8 +245,8 @@ float *Resample_<FastBSincTag,SSETag>(const InterpState *state, float *RESTRICT
     for(float &out_sample : dst)
     {
         // Calculate the phase index and factor.
-        const uint pi{frac >> FracPhaseBitDiff};
-        const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};
+        const uint pi{frac >> BSincPhaseBitDiff};
+        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
 
         // Apply the phase interpolated filter.
         __m128 r4{_mm_setzero_ps()};
-- 
cgit v1.2.3