aboutsummaryrefslogtreecommitdiffstats
path: root/core/mixer/mixer_sse.cpp
diff options
context:
space:
mode:
author: Chris Robinson <[email protected]> 2023-02-06 17:46:32 -0800
committer: Chris Robinson <[email protected]> 2023-02-06 17:46:32 -0800
commit: da845ddd9c35a1e1fcff03ea342636ae4bb8018b (patch)
tree: 8fe219826036410c655b7c732e94f04b442c3261 /core/mixer/mixer_sse.cpp
parent: 0de7ea42fa197833bff70b4c370ed29f9859889d (diff)
Use an interpolated FIR filter for cubic resampling
Similar to how the bsinc filters work, but optimized for 4-point filtering. At least the SSE version is notably faster than calculating the coefficients in real time.
Diffstat (limited to 'core/mixer/mixer_sse.cpp')
-rw-r--r-- core/mixer/mixer_sse.cpp 51
1 files changed, 45 insertions, 6 deletions
diff --git a/core/mixer/mixer_sse.cpp b/core/mixer/mixer_sse.cpp
index 1b0d1386..4a31a0f1 100644
--- a/core/mixer/mixer_sse.cpp
+++ b/core/mixer/mixer_sse.cpp
@@ -7,10 +7,12 @@
#include "alnumeric.h"
#include "core/bsinc_defs.h"
+#include "core/cubic_defs.h"
#include "defs.h"
#include "hrtfbase.h"
struct SSETag;
+struct CubicTag;
struct BSincTag;
struct FastBSincTag;
@@ -21,8 +23,13 @@ struct FastBSincTag;
namespace {
-constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits};
-constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff};
+constexpr uint BSincPhaseBitDiff{MixerFracBits - BSincPhaseBits};
+constexpr uint BSincPhaseDiffOne{1 << BSincPhaseBitDiff};
+constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
+
+constexpr uint CubicPhaseBitDiff{MixerFracBits - CubicPhaseBits};
+constexpr uint CubicPhaseDiffOne{1 << CubicPhaseBitDiff};
+constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
@@ -147,6 +154,38 @@ force_inline void MixLine(const al::span<const float> InSamples, float *RESTRICT
} // namespace
template<>
+float *Resample_<CubicTag,SSETag>(const InterpState *state, float *RESTRICT src, uint frac,
+ uint increment, const al::span<float> dst)
+{
+ const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(state->cubic.filter);
+ /* Writes one value per dst element: a 4-tap dot product of src with an interpolated filter. */
+ src -= 1; /* Each 4-sample window starts one sample before the caller's src. */
+ for(float &out_sample : dst)
+ {
+ const uint pi{frac >> CubicPhaseBitDiff};
+ const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
+ const __m128 pf4{_mm_set1_ps(pf)};
+ /* pi indexes the filter table; pf is the remaining low bits of frac scaled to [0,1). */
+ /* Apply the phase interpolated filter. */
+
+ /* f = fil + pf*phd, i.e. mCoeffs + pf*mDeltas of table entry pi */
+ const __m128 f4 = MLA4(_mm_load_ps(filter[pi].mCoeffs), pf4,
+ _mm_load_ps(filter[pi].mDeltas));
+ /* r = f*src (unaligned load of the four samples starting at src) */
+ __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(src))};
+ /* Horizontal sum of r4's four lanes; the total ends up in lane 0. */
+ r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
+ r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+ out_sample = _mm_cvtss_f32(r4);
+ /* Step the position: frac's bits above MixerFracBits advance src; frac is then masked. */
+ frac += increment;
+ src += frac>>MixerFracBits;
+ frac &= MixerFracMask;
+ }
+ return dst.data(); /* Pointer to the start of the filled output span. */
+}
+
+template<>
float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src, uint frac,
uint increment, const al::span<float> dst)
{
@@ -159,8 +198,8 @@ float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src,
for(float &out_sample : dst)
{
// Calculate the phase index and factor.
- const uint pi{frac >> FracPhaseBitDiff};
- const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};
+ const uint pi{frac >> BSincPhaseBitDiff};
+ const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
// Apply the scale and phase interpolated filter.
__m128 r4{_mm_setzero_ps()};
@@ -206,8 +245,8 @@ float *Resample_<FastBSincTag,SSETag>(const InterpState *state, float *RESTRICT
for(float &out_sample : dst)
{
// Calculate the phase index and factor.
- const uint pi{frac >> FracPhaseBitDiff};
- const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};
+ const uint pi{frac >> BSincPhaseBitDiff};
+ const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
// Apply the phase interpolated filter.
__m128 r4{_mm_setzero_ps()};