Use a split filter for the FIR-based UHJ encoders

This applies the all-pass filter in two steps, first as a relatively short time-domain FIR filter, then as a series of frequency domain convolutions (using complex multiplies). Time-domain convolution scales poorly, so larger convolutions benefit from being done in the frequency domain (though the first part is still done in the time domain, to avoid longer delays).
author: Chris Robinson <[email protected]> 2023-10-14 11:15:32 -0700
committer: Chris Robinson <[email protected]> 2023-10-14 11:44:32 -0700
commit: d157cf4154dd7a078ac534cd6bbd0f762c23eaaf (patch)
tree: 2be748440b9663fe4565c0de740e14ffe893e731
parent: d3d23579add16fbe3c21f2b436457f398f0f5254 (diff)
3 files changed, 204 insertions, 13 deletions
diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index b9c889c2..e1a83dab 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -15,6 +15,8 @@
 #include "alspan.h"
 
 
+struct NoInit { };
+
 /* Implements a wide-band +90 degree phase-shift. Note that this should be
  * given one sample less of a delay (FilterSize/2 - 1) compared to the direct
  * signal delay (FilterSize/2) to properly align.
@@ -71,6 +73,8 @@ struct PhaseShifterT {
         }
     }
 
+    PhaseShifterT(NoInit) { }
+
     void process(al::span<float> dst, const float *RESTRICT src) const;
 
 private:
diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp
index 4e4e99a5..ee5d4b86 100644
--- a/core/uhjfilter.cpp
+++ b/core/uhjfilter.cpp
@@ -9,6 +9,7 @@
 #include "alcomplex.h"
 #include "alnumeric.h"
 #include "opthelpers.h"
+#include "pffft.h"
 #include "phase_shifter.h"
 
 
@@ -18,6 +19,120 @@ UhjQualityType UhjEncodeQuality{UhjQualityType::Default};
 
 namespace {
 
+struct PFFFTSetupDeleter {
+    void operator()(PFFFT_Setup *ptr) { pffft_destroy_setup(ptr); }
+};
+using PFFFTSetupPtr = std::unique_ptr<PFFFT_Setup,PFFFTSetupDeleter>;
+
+/* Convolution is implemented using a segmented overlap-add method. The filter
+ * response is broken up into multiple segments of 128 samples, and each
+ * segment has an FFT applied with a 256-sample buffer (the latter half left
+ * silent) to get its frequency-domain response.
+ *
+ * Input samples are similarly broken up into 128-sample segments, with an FFT
+ * applied with a 256-sample buffer to each new incoming segment to get its
+ * frequency-domain response. A history of FFT'd input segments is maintained,
+ * equal to the number of filter response segments.
+ *
+ * To apply the convolution, each filter response segment is convolved with its
+ * paired input segment (using complex multiplies, far cheaper than time-domain
+ * FIRs), accumulating into an FFT buffer. The input history is then shifted to
+ * align with later filter response segments for the next input segment.
+ *
+ * An inverse FFT is then applied to the accumulated FFT buffer to get a 256-
+ * sample time-domain response for output, which is split in two halves. The
+ * first half is the 128-sample output, and the second half is a 128-sample
+ * (really, 127) delayed extension, which gets added to the output next time.
+ * Convolving two time-domain responses of length N results in a time-domain
+ * signal of length N*2 - 1, and this holds true regardless of the convolution
+ * being applied in the frequency domain, so these "overflow" samples need to
+ * be accounted for.
+ *
+ * To avoid a delay with gathering enough input samples to apply an FFT with,
+ * the first segment is applied directly in the time-domain as the samples come
+ * in. Once enough have been retrieved, the FFT is applied on the input and
+ * it's paired with the remaining (FFT'd) filter segments for processing.
+ */
+template<size_t N>
+struct SplitFilter {
+    static constexpr size_t sFftLength{256};
+    static constexpr size_t sSampleLength{sFftLength / 2};
+    static constexpr size_t sNumSegments{N/sSampleLength - 1};
+    static_assert(N >= sFftLength);
+    static_assert((N % sSampleLength) == 0);
+
+    PhaseShifterT<sSampleLength> mPShift{NoInit{}};
+    PFFFTSetupPtr mFft;
+    alignas(16) std::array<float,sFftLength*sNumSegments> mFilterData;
+
+    SplitFilter()
+    {
+        mFft = PFFFTSetupPtr{pffft_new_setup(sFftLength, PFFFT_REAL)};
+
+        using complex_d = std::complex<double>;
+        constexpr size_t fft_size{N};
+        constexpr size_t half_size{fft_size / 2};
+
+        /* To set up the filter, we need to generate the desired response.
+         * Start with a pure delay that passes all frequencies through.
+         */
+        auto fftBuffer = std::make_unique<complex_d[]>(fft_size);
+        std::fill_n(fftBuffer.get(), fft_size, complex_d{});
+        fftBuffer[half_size] = 1.0;
+
+        /* Convert to the frequency domain, shift the phase of each bin by +90
+         * degrees, then convert back to the time domain.
+         */
+        forward_fft(al::span{fftBuffer.get(), fft_size});
+        fftBuffer[0] *= std::numeric_limits<double>::epsilon();
+        for(size_t i{1};i < half_size;++i)
+            fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()};
+        fftBuffer[half_size] *= std::numeric_limits<double>::epsilon();
+        for(size_t i{half_size+1};i < fft_size;++i)
+            fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
+        inverse_fft(al::span{fftBuffer.get(), fft_size});
+
+        /* Store the first segment of the buffer to apply as a time-domain
+         * filter (backwards for more efficient processing).
+         */
+        auto fftiter = fftBuffer.get() + sSampleLength - 1;
+        for(float &coeff : mPShift.mCoeffs)
+        {
+            coeff = static_cast<float>(fftiter->real() / double{fft_size});
+            fftiter -= 2;
+        }
+
+        /* The remaining segments of the filter are converted back to the
+         * frequency domain, each on their own (0 stuffed).
+         */
+        auto fftTmp = std::make_unique<float[]>(sFftLength);
+        float *filter{mFilterData.data()};
+        for(size_t s{0};s < sNumSegments;++s)
+        {
+            for(size_t i{0};i < sSampleLength;++i)
+                fftBuffer[i] = fftBuffer[sSampleLength*(s+1) + i].real() / double{fft_size};
+            std::fill_n(fftBuffer.get()+sSampleLength, sSampleLength, complex_d{});
+            forward_fft(al::span{fftBuffer.get(), sFftLength});
+
+            /* Convert to zdomain data for PFFFT, scaled by the FFT length so
+             * the iFFT result will be normalized.
+             */
+            for(size_t i{0};i < sSampleLength;++i)
+            {
+                fftTmp[i*2 + 0] = static_cast<float>(fftBuffer[i].real()) / float{sFftLength};
+                fftTmp[i*2 + 1] = static_cast<float>((i == 0) ? fftBuffer[sSampleLength].real()
+                    : fftBuffer[i].imag()) / float{sFftLength};
+            }
+            pffft_zreorder(mFft.get(), fftTmp.get(), filter, PFFFT_BACKWARD);
+            filter += sFftLength;
+        }
+    }
+};
+
+template<size_t N>
+const SplitFilter<N> gSplitFilter;
+
+
 const PhaseShifterT<UhjLength256> PShiftLq{};
 const PhaseShifterT<UhjLength512> PShiftHq{};
 
@@ -87,7 +202,9 @@ template<size_t N>
 void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
     const al::span<const float*const,3> InSamples, const size_t SamplesToDo)
 {
-    const auto &PShift = GetPhaseShifter<N>::Get();
+    static constexpr auto &Filter = gSplitFilter<N>;
+    static_assert(sFftLength == Filter.sFftLength);
+    static_assert(sNumSegments == Filter.sNumSegments);
 
     ASSUME(SamplesToDo > 0);
 
@@ -104,10 +221,78 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
         mS[i] = 0.9396926f*mW[i] + 0.1855740f*mX[i];
 
     /* Precompute j(-0.3420201*W + 0.5098604*X) and store in mD. */
-    std::transform(winput, winput+SamplesToDo, xinput, mWX.begin() + sWXInOffset,
-        [](const float w, const float x) noexcept -> float
-        { return -0.3420201f*w + 0.5098604f*x; });
-    PShift.process({mD.data(), SamplesToDo}, mWX.data());
+    for(size_t base{0};base < SamplesToDo;)
+    {
+        const size_t todo{minz(Filter.sSampleLength-mFifoPos, SamplesToDo-base)};
+
+        /* Transform the non-delayed input and store in the later half of the
+         * filter input.
+         */
+        std::transform(winput+base, winput+base+todo, xinput+base,
+            mWXIn.begin()+Filter.sSampleLength + mFifoPos,
+            [](const float w, const float x) noexcept -> float
+            { return -0.3420201f*w + 0.5098604f*x; });
+
+        /* Apply the first segment of the filter in the time domain. */
+        auto buf_iter = mD.begin() + base;
+        Filter.mPShift.process({buf_iter, todo}, mWXIn.begin()+1 + mFifoPos);
+
+        /* Add the other segments of the filter that were previously processed
+         * by the FFT.
+         */
+        auto fifo_iter = mWXOut.begin() + mFifoPos;
+        std::transform(fifo_iter, fifo_iter+todo, buf_iter, buf_iter, std::plus<>{});
+
+        mFifoPos += todo;
+        base += todo;
+
+        /* Check whether the input buffer is filled with new samples. */
+        if(mFifoPos < Filter.sSampleLength) break;
+        mFifoPos = 0;
+
+        /* Move the newest input to the front for the next iteration's history. */
+        std::copy(mWXIn.cbegin()+Filter.sSampleLength, mWXIn.cend(), mWXIn.begin());
+        std::fill(mWXIn.begin()+Filter.sSampleLength, mWXIn.end(), 0.0f);
+
+        /* Overwrite the stale/oldest FFT'd segment with the newest input. */
+        const size_t curseg{mCurrentSegment};
+        pffft_transform(Filter.mFft.get(), mWXIn.data(),
+            mWXHistory.data() + curseg*Filter.sFftLength, mWorkData.data(), PFFFT_FORWARD);
+
+        /* Convolve each input segment with its IR filter counterpart (aligned
+         * in time).
+         */
+        mFftBuffer.fill(0.0f);
+        const float *filter{Filter.mFilterData.data()};
+        const float *input{mWXHistory.data() + curseg*Filter.sFftLength};
+        for(size_t s{curseg};s < sNumSegments;++s)
+        {
+            pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
+            input += Filter.sFftLength;
+            filter += Filter.sFftLength;
+        }
+        input = mWXHistory.data();
+        for(size_t s{0};s < curseg;++s)
+        {
+            pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
+            input += Filter.sFftLength;
+            filter += Filter.sFftLength;
+        }
+
+        /* Convert back to samples, writing to the output and storing the extra
+         * for next time.
+         */
+        pffft_transform(Filter.mFft.get(), mFftBuffer.data(), mFftBuffer.data(),
+            mWorkData.data(), PFFFT_BACKWARD);
+
+        for(size_t i{0};i < Filter.sSampleLength;++i)
+            mWXOut[i] = mFftBuffer[i] + mWXOut[Filter.sSampleLength+i];
+        for(size_t i{0};i < Filter.sSampleLength;++i)
+            mWXOut[Filter.sSampleLength+i] = mFftBuffer[Filter.sSampleLength+i];
+
+        /* Shift the input history. */
+        mCurrentSegment = curseg ? (curseg-1) : (sNumSegments-1);
+    }
 
     /* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
     for(size_t i{0};i < SamplesToDo;++i)
@@ -117,7 +302,6 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
     std::copy(mW.cbegin()+SamplesToDo, mW.cbegin()+SamplesToDo+sFilterDelay, mW.begin());
     std::copy(mX.cbegin()+SamplesToDo, mX.cbegin()+SamplesToDo+sFilterDelay, mX.begin());
     std::copy(mY.cbegin()+SamplesToDo, mY.cbegin()+SamplesToDo+sFilterDelay, mY.begin());
-    std::copy(mWX.cbegin()+SamplesToDo, mWX.cbegin()+SamplesToDo+sWXInOffset, mWX.begin());
 
     /* Apply a delay to the existing output to align with the input delay. */
     auto *delayBuffer = mDirectDelay.data();
diff --git a/core/uhjfilter.h b/core/uhjfilter.h
index df308094..b8b1e96c 100644
--- a/core/uhjfilter.h
+++ b/core/uhjfilter.h
@@ -51,6 +51,9 @@ struct UhjEncoderBase {
 template<size_t N>
 struct UhjEncoder final : public UhjEncoderBase {
     static constexpr size_t sFilterDelay{N/2};
+    static constexpr size_t sFftLength{256};
+    static constexpr size_t sSegmentSize{sFftLength/2};
+    static constexpr size_t sNumSegments{N/sSegmentSize - 1};
 
     /* Delays and processing storage for the input signal. */
     alignas(16) std::array<float,BufferLineSize+sFilterDelay> mW{};
@@ -60,11 +63,13 @@ struct UhjEncoder final : public UhjEncoderBase {
     alignas(16) std::array<float,BufferLineSize> mS{};
     alignas(16) std::array<float,BufferLineSize> mD{};
 
-    /* History and temp storage for the FIR filter. New samples should be
-     * written to index sFilterDelay*2 - 1.
-     */
-    static constexpr size_t sWXInOffset{sFilterDelay*2 - 1};
-    alignas(16) std::array<float,BufferLineSize + sFilterDelay*2> mWX{};
+    /* History and temp storage for the convolution filter. */
+    size_t mFifoPos{}, mCurrentSegment{};
+    alignas(16) std::array<float,sFftLength> mWXIn{};
+    alignas(16) std::array<float,sFftLength> mWXOut{};
+    alignas(16) std::array<float,sFftLength> mFftBuffer{};
+    alignas(16) std::array<float,sFftLength> mWorkData{};
+    alignas(16) std::array<float,sFftLength*sNumSegments> mWXHistory{};
 
     alignas(16) std::array<std::array<float,sFilterDelay>,2> mDirectDelay{};
 
@@ -77,8 +82,6 @@ struct UhjEncoder final : public UhjEncoderBase {
      */
     void encode(float *LeftOut, float *RightOut, const al::span<const float*const,3> InSamples,
         const size_t SamplesToDo) override;
-
-    DEF_NEWDEL(UhjEncoder)
 };
 
 struct UhjEncoderIIR final : public UhjEncoderBase {
author	Chris Robinson <[email protected]>	2023-10-14 11:15:32 -0700
committer	Chris Robinson <[email protected]>	2023-10-14 11:44:32 -0700
commit	d157cf4154dd7a078ac534cd6bbd0f762c23eaaf (patch)
tree	2be748440b9663fe4565c0de740e14ffe893e731
parent	d3d23579add16fbe3c21f2b436457f398f0f5254 (diff)