From 3ec03cadd2b5059e54e5e9b8f4d506b4c6ce727d Mon Sep 17 00:00:00 2001
From: Chris Robinson <chris.kcat@gmail.com>
Date: Fri, 5 May 2023 06:46:00 -0700
Subject: Use deduction guides instead of helper functions for spans

---
 common/phase_shifter.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'common/phase_shifter.h')
diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index 0d4166bc..061e9176 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -53,12 +53,12 @@ struct PhaseShifterT {
         std::fill_n(fftBuffer.get(), fft_size, complex_d{});
         fftBuffer[half_size] = 1.0;
 
-        forward_fft(al::as_span(fftBuffer.get(), fft_size));
+        forward_fft(al::span{fftBuffer.get(), fft_size});
         for(size_t i{0};i < half_size+1;++i)
             fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()};
         for(size_t i{half_size+1};i < fft_size;++i)
             fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
-        inverse_fft(al::as_span(fftBuffer.get(), fft_size));
+        inverse_fft(al::span{fftBuffer.get(), fft_size});
 
         auto fftiter = fftBuffer.get() + half_size + (FilterSize/2 - 1);
         for(float &coeff : mCoeffs)
-- 
cgit v1.2.3


From 23cc00ea16bdfbb06ae49cde0e05db6ec4a07100 Mon Sep 17 00:00:00 2001
From: Chris Robinson <chris.kcat@gmail.com>
Date: Tue, 3 Oct 2023 22:36:01 -0700
Subject: Clear the 0 and half-frequency bins for the phase shift filter

This doesn't change the filter response, but is more correct since a real
signal won't have an imaginary value on them (it can only have a magnitude with
a phase of 0 or pi).
---
 common/phase_shifter.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'common/phase_shifter.h')

diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index 061e9176..6b0ad512 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -9,6 +9,7 @@
 
 #include <array>
 #include <stddef.h>
+#include <type_traits>
 
 #include "alcomplex.h"
 #include "alspan.h"
@@ -54,13 +55,15 @@ struct PhaseShifterT {
         fftBuffer[half_size] = 1.0;
 
         forward_fft(al::span{fftBuffer.get(), fft_size});
-        for(size_t i{0};i < half_size+1;++i)
+        fftBuffer[0] *= std::numeric_limits<double>::epsilon();
+        for(size_t i{1};i < half_size;++i)
             fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()};
+        fftBuffer[half_size] *= std::numeric_limits<double>::epsilon();
         for(size_t i{half_size+1};i < fft_size;++i)
             fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
         inverse_fft(al::span{fftBuffer.get(), fft_size});
 
-        auto fftiter = fftBuffer.get() + half_size + (FilterSize/2 - 1);
+        auto fftiter = fftBuffer.get() + fft_size - 1;
         for(float &coeff : mCoeffs)
         {
             coeff = static_cast<float>(fftiter->real() / double{fft_size});
-- 
cgit v1.2.3


From 1614fccd9fd893e104dcca2c92b83b2a7bfaa0c7 Mon Sep 17 00:00:00 2001
From: Chris Robinson <chris.kcat@gmail.com>
Date: Fri, 6 Oct 2023 18:56:47 -0700
Subject: Improve NEON shuffling

---
 common/phase_shifter.h | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

(limited to 'common/phase_shifter.h')

diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index 6b0ad512..b9c889c2 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -75,25 +75,6 @@ struct PhaseShifterT {
 
 private:
 #if defined(HAVE_NEON)
-    /* There doesn't seem to be NEON intrinsics to do this kind of stipple
-     * shuffling, so there's two custom methods for it.
-     */
-    static auto shuffle_2020(float32x4_t a, float32x4_t b)
-    {
-        float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
-        ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
-        ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
-        ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
-        return ret;
-    }
-    static auto shuffle_3131(float32x4_t a, float32x4_t b)
-    {
-        float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
-        ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
-        ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
-        ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
-        return ret;
-    }
     static auto unpacklo(float32x4_t a, float32x4_t b)
     {
         float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
@@ -174,9 +155,10 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT
                 const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
                 const float32x4_t s0{vld1q_f32(&src[j*2])};
                 const float32x4_t s1{vld1q_f32(&src[j*2 + 4])};
+                const float32x4x2_t values{vuzpq_f32(s0, s1)};
 
-                r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs);
-                r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs);
+                r04 = vmlaq_f32(r04, values.val[0], coeffs);
+                r14 = vmlaq_f32(r14, values.val[1], coeffs);
             }
             src += 2;
 
-- 
cgit v1.2.3


From d157cf4154dd7a078ac534cd6bbd0f762c23eaaf Mon Sep 17 00:00:00 2001
From: Chris Robinson <chris.kcat@gmail.com>
Date: Sat, 14 Oct 2023 11:15:32 -0700
Subject: Use a split filter for the FIR-based UHJ encoders

This applies the all-pass filter in two steps, first as a relatively short
time-domain FIR filter, then as a series of frequency domain convolutions
(using complex multiplies). Time-domain convolution scales poorly, so larger
convolutions benefit from being done in the frequency domain (though the first
part is still done in the time domain, to avoid longer delays).
---
 common/phase_shifter.h |   4 +
 core/uhjfilter.cpp     | 196 +++++++++++++++++++++++++++++++++++++++++++++++--
 core/uhjfilter.h       |  17 +++--
 3 files changed, 204 insertions(+), 13 deletions(-)

(limited to 'common/phase_shifter.h')

diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index b9c889c2..e1a83dab 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -15,6 +15,8 @@
 #include "alspan.h"
 
 
+struct NoInit { };
+
 /* Implements a wide-band +90 degree phase-shift. Note that this should be
  * given one sample less of a delay (FilterSize/2 - 1) compared to the direct
  * signal delay (FilterSize/2) to properly align.
@@ -71,6 +73,8 @@ struct PhaseShifterT {
         }
     }
 
+    PhaseShifterT(NoInit) { }
+
     void process(al::span<float> dst, const float *RESTRICT src) const;
 
 private:
diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp
index 4e4e99a5..ee5d4b86 100644
--- a/core/uhjfilter.cpp
+++ b/core/uhjfilter.cpp
@@ -9,6 +9,7 @@
 #include "alcomplex.h"
 #include "alnumeric.h"
 #include "opthelpers.h"
+#include "pffft.h"
 #include "phase_shifter.h"
 
 
@@ -18,6 +19,120 @@ UhjQualityType UhjEncodeQuality{UhjQualityType::Default};
 
 namespace {
 
+struct PFFFTSetupDeleter {
+    void operator()(PFFFT_Setup *ptr) { pffft_destroy_setup(ptr); }
+};
+using PFFFTSetupPtr = std::unique_ptr<PFFFT_Setup,PFFFTSetupDeleter>;
+
+/* Convolution is implemented using a segmented overlap-add method. The filter
+ * response is broken up into multiple segments of 128 samples, and each
+ * segment has an FFT applied with a 256-sample buffer (the latter half left
+ * silent) to get its frequency-domain response.
+ *
+ * Input samples are similarly broken up into 128-sample segments, with an FFT
+ * applied with a 256-sample buffer to each new incoming segment to get its
+ * frequency-domain response. A history of FFT'd input segments is maintained,
+ * equal to the number of filter response segments.
+ *
+ * To apply the convolution, each filter response segment is convolved with its
+ * paired input segment (using complex multiplies, far cheaper than time-domain
+ * FIRs), accumulating into an FFT buffer. The input history is then shifted to
+ * align with later filter response segments for the next input segment.
+ *
+ * An inverse FFT is then applied to the accumulated FFT buffer to get a 256-
+ * sample time-domain response for output, which is split in two halves. The
+ * first half is the 128-sample output, and the second half is a 128-sample
+ * (really, 127) delayed extension, which gets added to the output next time.
+ * Convolving two time-domain responses of length N results in a time-domain
+ * signal of length N*2 - 1, and this holds true regardless of the convolution
+ * being applied in the frequency domain, so these "overflow" samples need to
+ * be accounted for.
+ *
+ * To avoid a delay with gathering enough input samples to apply an FFT with,
+ * the first segment is applied directly in the time-domain as the samples come
+ * in. Once enough have been retrieved, the FFT is applied on the input and
+ * it's paired with the remaining (FFT'd) filter segments for processing.
+ */
+template<size_t N>
+struct SplitFilter {
+    static constexpr size_t sFftLength{256};
+    static constexpr size_t sSampleLength{sFftLength / 2};
+    static constexpr size_t sNumSegments{N/sSampleLength - 1};
+    static_assert(N >= sFftLength);
+    static_assert((N % sSampleLength) == 0);
+
+    PhaseShifterT<sSampleLength> mPShift{NoInit{}};
+    PFFFTSetupPtr mFft;
+    alignas(16) std::array<float,sFftLength*sNumSegments> mFilterData;
+
+    SplitFilter()
+    {
+        mFft = PFFFTSetupPtr{pffft_new_setup(sFftLength, PFFFT_REAL)};
+
+        using complex_d = std::complex<double>;
+        constexpr size_t fft_size{N};
+        constexpr size_t half_size{fft_size / 2};
+
+        /* To set up the filter, we need to generate the desired response.
+         * Start with a pure delay that passes all frequencies through.
+         */
+        auto fftBuffer = std::make_unique<complex_d[]>(fft_size);
+        std::fill_n(fftBuffer.get(), fft_size, complex_d{});
+        fftBuffer[half_size] = 1.0;
+
+        /* Convert to the frequency domain, shift the phase of each bin by +90
+         * degrees, then convert back to the time domain.
+         */
+        forward_fft(al::span{fftBuffer.get(), fft_size});
+        fftBuffer[0] *= std::numeric_limits<double>::epsilon();
+        for(size_t i{1};i < half_size;++i)
+            fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()};
+        fftBuffer[half_size] *= std::numeric_limits<double>::epsilon();
+        for(size_t i{half_size+1};i < fft_size;++i)
+            fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
+        inverse_fft(al::span{fftBuffer.get(), fft_size});
+
+        /* Store the first segment of the buffer to apply as a time-domain
+         * filter (backwards for more efficient processing).
+         */
+        auto fftiter = fftBuffer.get() + sSampleLength - 1;
+        for(float &coeff : mPShift.mCoeffs)
+        {
+            coeff = static_cast<float>(fftiter->real() / double{fft_size});
+            fftiter -= 2;
+        }
+
+        /* The remaining segments of the filter are converted back to the
+         * frequency domain, each on their own (0 stuffed).
+         */
+        auto fftTmp = std::make_unique<float[]>(sFftLength);
+        float *filter{mFilterData.data()};
+        for(size_t s{0};s < sNumSegments;++s)
+        {
+            for(size_t i{0};i < sSampleLength;++i)
+                fftBuffer[i] = fftBuffer[sSampleLength*(s+1) + i].real() / double{fft_size};
+            std::fill_n(fftBuffer.get()+sSampleLength, sSampleLength, complex_d{});
+            forward_fft(al::span{fftBuffer.get(), sFftLength});
+
+            /* Convert to zdomain data for PFFFT, scaled by the FFT length so
+             * the iFFT result will be normalized.
+             */
+            for(size_t i{0};i < sSampleLength;++i)
+            {
+                fftTmp[i*2 + 0] = static_cast<float>(fftBuffer[i].real()) / float{sFftLength};
+                fftTmp[i*2 + 1] = static_cast<float>((i == 0) ? fftBuffer[sSampleLength].real()
+                    : fftBuffer[i].imag()) / float{sFftLength};
+            }
+            pffft_zreorder(mFft.get(), fftTmp.get(), filter, PFFFT_BACKWARD);
+            filter += sFftLength;
+        }
+    }
+};
+
+template<size_t N>
+const SplitFilter<N> gSplitFilter;
+
+
 const PhaseShifterT<UhjLength256> PShiftLq{};
 const PhaseShifterT<UhjLength512> PShiftHq{};
 
@@ -87,7 +202,9 @@ template<size_t N>
 void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
     const al::span<const float*const,3> InSamples, const size_t SamplesToDo)
 {
-    const auto &PShift = GetPhaseShifter<N>::Get();
+    static constexpr auto &Filter = gSplitFilter<N>;
+    static_assert(sFftLength == Filter.sFftLength);
+    static_assert(sNumSegments == Filter.sNumSegments);
 
     ASSUME(SamplesToDo > 0);
 
@@ -104,10 +221,78 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
         mS[i] = 0.9396926f*mW[i] + 0.1855740f*mX[i];
 
     /* Precompute j(-0.3420201*W + 0.5098604*X) and store in mD. */
-    std::transform(winput, winput+SamplesToDo, xinput, mWX.begin() + sWXInOffset,
-        [](const float w, const float x) noexcept -> float
-        { return -0.3420201f*w + 0.5098604f*x; });
-    PShift.process({mD.data(), SamplesToDo}, mWX.data());
+    for(size_t base{0};base < SamplesToDo;)
+    {
+        const size_t todo{minz(Filter.sSampleLength-mFifoPos, SamplesToDo-base)};
+
+        /* Transform the non-delayed input and store in the later half of the
+         * filter input.
+         */
+        std::transform(winput+base, winput+base+todo, xinput+base,
+            mWXIn.begin()+Filter.sSampleLength + mFifoPos,
+            [](const float w, const float x) noexcept -> float
+            { return -0.3420201f*w + 0.5098604f*x; });
+
+        /* Apply the first segment of the filter in the time domain. */
+        auto buf_iter = mD.begin() + base;
+        Filter.mPShift.process({buf_iter, todo}, mWXIn.begin()+1 + mFifoPos);
+
+        /* Add the other segments of the filter that were previously processed
+         * by the FFT.
+         */
+        auto fifo_iter = mWXOut.begin() + mFifoPos;
+        std::transform(fifo_iter, fifo_iter+todo, buf_iter, buf_iter, std::plus<>{});
+
+        mFifoPos += todo;
+        base += todo;
+
+        /* Check whether the input buffer is filled with new samples. */
+        if(mFifoPos < Filter.sSampleLength) break;
+        mFifoPos = 0;
+
+        /* Move the newest input to the front for the next iteration's history. */
+        std::copy(mWXIn.cbegin()+Filter.sSampleLength, mWXIn.cend(), mWXIn.begin());
+        std::fill(mWXIn.begin()+Filter.sSampleLength, mWXIn.end(), 0.0f);
+
+        /* Overwrite the stale/oldest FFT'd segment with the newest input. */
+        const size_t curseg{mCurrentSegment};
+        pffft_transform(Filter.mFft.get(), mWXIn.data(),
+            mWXHistory.data() + curseg*Filter.sFftLength, mWorkData.data(), PFFFT_FORWARD);
+
+        /* Convolve each input segment with its IR filter counterpart (aligned
+         * in time).
+         */
+        mFftBuffer.fill(0.0f);
+        const float *filter{Filter.mFilterData.data()};
+        const float *input{mWXHistory.data() + curseg*Filter.sFftLength};
+        for(size_t s{curseg};s < sNumSegments;++s)
+        {
+            pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
+            input += Filter.sFftLength;
+            filter += Filter.sFftLength;
+        }
+        input = mWXHistory.data();
+        for(size_t s{0};s < curseg;++s)
+        {
+            pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
+            input += Filter.sFftLength;
+            filter += Filter.sFftLength;
+        }
+
+        /* Convert back to samples, writing to the output and storing the extra
+         * for next time.
+         */
+        pffft_transform(Filter.mFft.get(), mFftBuffer.data(), mFftBuffer.data(),
+            mWorkData.data(), PFFFT_BACKWARD);
+
+        for(size_t i{0};i < Filter.sSampleLength;++i)
+            mWXOut[i] = mFftBuffer[i] + mWXOut[Filter.sSampleLength+i];
+        for(size_t i{0};i < Filter.sSampleLength;++i)
+            mWXOut[Filter.sSampleLength+i] = mFftBuffer[Filter.sSampleLength+i];
+
+        /* Shift the input history. */
+        mCurrentSegment = curseg ? (curseg-1) : (sNumSegments-1);
+    }
 
     /* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
     for(size_t i{0};i < SamplesToDo;++i)
@@ -117,7 +302,6 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
     std::copy(mW.cbegin()+SamplesToDo, mW.cbegin()+SamplesToDo+sFilterDelay, mW.begin());
     std::copy(mX.cbegin()+SamplesToDo, mX.cbegin()+SamplesToDo+sFilterDelay, mX.begin());
     std::copy(mY.cbegin()+SamplesToDo, mY.cbegin()+SamplesToDo+sFilterDelay, mY.begin());
-    std::copy(mWX.cbegin()+SamplesToDo, mWX.cbegin()+SamplesToDo+sWXInOffset, mWX.begin());
 
     /* Apply a delay to the existing output to align with the input delay. */
     auto *delayBuffer = mDirectDelay.data();
diff --git a/core/uhjfilter.h b/core/uhjfilter.h
index df308094..b8b1e96c 100644
--- a/core/uhjfilter.h
+++ b/core/uhjfilter.h
@@ -51,6 +51,9 @@ struct UhjEncoderBase {
 template<size_t N>
 struct UhjEncoder final : public UhjEncoderBase {
     static constexpr size_t sFilterDelay{N/2};
+    static constexpr size_t sFftLength{256};
+    static constexpr size_t sSegmentSize{sFftLength/2};
+    static constexpr size_t sNumSegments{N/sSegmentSize - 1};
 
     /* Delays and processing storage for the input signal. */
     alignas(16) std::array<float,BufferLineSize+sFilterDelay> mW{};
@@ -60,11 +63,13 @@ struct UhjEncoder final : public UhjEncoderBase {
     alignas(16) std::array<float,BufferLineSize> mS{};
     alignas(16) std::array<float,BufferLineSize> mD{};
 
-    /* History and temp storage for the FIR filter. New samples should be
-     * written to index sFilterDelay*2 - 1.
-     */
-    static constexpr size_t sWXInOffset{sFilterDelay*2 - 1};
-    alignas(16) std::array<float,BufferLineSize + sFilterDelay*2> mWX{};
+    /* History and temp storage for the convolution filter. */
+    size_t mFifoPos{}, mCurrentSegment{};
+    alignas(16) std::array<float,sFftLength> mWXIn{};
+    alignas(16) std::array<float,sFftLength> mWXOut{};
+    alignas(16) std::array<float,sFftLength> mFftBuffer{};
+    alignas(16) std::array<float,sFftLength> mWorkData{};
+    alignas(16) std::array<float,sFftLength*sNumSegments> mWXHistory{};
 
     alignas(16) std::array<std::array<float,sFilterDelay>,2> mDirectDelay{};
 
@@ -77,8 +82,6 @@ struct UhjEncoder final : public UhjEncoderBase {
      */
     void encode(float *LeftOut, float *RightOut, const al::span<const float*const,3> InSamples,
         const size_t SamplesToDo) override;
-
-    DEF_NEWDEL(UhjEncoder)
 };
 
 struct UhjEncoderIIR final : public UhjEncoderBase {
-- 
cgit v1.2.3