2 files changed, 33 insertions, 57 deletions
diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp
index dfd11b37..28999e09 100644
--- a/core/uhjfilter.cpp
+++ b/core/uhjfilter.cpp
@@ -48,25 +48,19 @@ using PFFFTSetupPtr = std::unique_ptr<PFFFT_Setup,PFFFTSetupDeleter>;
  * signal of length N*2 - 1, and this holds true regardless of the convolution
  * being applied in the frequency domain, so these "overflow" samples need to
  * be accounted for.
- *
- * To avoid a delay with gathering enough input samples to apply an FFT with,
- * the first segment is applied directly in the time-domain as the samples come
- * in. Once enough have been retrieved, the FFT is applied on the input and
- * it's paired with the remaining (FFT'd) filter segments for processing.
  */
 template<size_t N>
-struct SplitFilter {
+struct SegmentedFilter {
     static constexpr size_t sFftLength{256};
     static constexpr size_t sSampleLength{sFftLength / 2};
-    static constexpr size_t sNumSegments{N/sSampleLength - 1};
+    static constexpr size_t sNumSegments{N/sSampleLength};
     static_assert(N >= sFftLength);
     static_assert((N % sSampleLength) == 0);
 
-    PhaseShifterT<sSampleLength> mPShift{NoInit{}};
     PFFFTSetupPtr mFft;
     alignas(16) std::array<float,sFftLength*sNumSegments> mFilterData;
 
-    SplitFilter()
+    SegmentedFilter()
     {
         mFft = PFFFTSetupPtr{pffft_new_setup(sFftLength, PFFFT_REAL)};
 
@@ -97,36 +91,27 @@ struct SplitFilter {
             fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
         inverse_fft(al::span{fftBuffer.get(), fft_size});
 
-        /* Store the first segment of the filter to apply in the time-domain
-         * (backwards for more efficient processing).
-         */
-        auto fftiter = fftBuffer.get() + sSampleLength - 1;
-        for(float &coeff : mPShift.mCoeffs)
-        {
-            coeff = static_cast<float>(fftiter->real() / double{fft_size});
-            fftiter -= 2;
-        }
-
-        /* The remaining segments of the filter are converted back to the
-         * frequency domain, each on their own (0 stuffed).
+        /* The segments of the filter are converted back to the frequency
+         * domain, each on their own (0 stuffed).
          */
+        auto fftBuffer2 = std::make_unique<complex_d[]>(sFftLength);
         auto fftTmp = al::vector<float,16>(sFftLength);
         float *filter{mFilterData.data()};
         for(size_t s{0};s < sNumSegments;++s)
         {
             for(size_t i{0};i < sSampleLength;++i)
-                fftBuffer[i] = fftBuffer[sSampleLength*(s+1) + i].real() / double{fft_size};
-            std::fill_n(fftBuffer.get()+sSampleLength, sSampleLength, complex_d{});
-            forward_fft(al::span{fftBuffer.get(), sFftLength});
+                fftBuffer2[i] = fftBuffer[sSampleLength*s + i].real() / double{fft_size};
+            std::fill_n(fftBuffer2.get()+sSampleLength, sSampleLength, complex_d{});
+            forward_fft(al::span{fftBuffer2.get(), sFftLength});
 
             /* Convert to zdomain data for PFFFT, scaled by the FFT length so
              * the iFFT result will be normalized.
              */
             for(size_t i{0};i < sSampleLength;++i)
             {
-                fftTmp[i*2 + 0] = static_cast<float>(fftBuffer[i].real()) / float{sFftLength};
-                fftTmp[i*2 + 1] = static_cast<float>((i == 0) ? fftBuffer[sSampleLength].real()
-                    : fftBuffer[i].imag()) / float{sFftLength};
+                fftTmp[i*2 + 0] = static_cast<float>(fftBuffer2[i].real()) / float{sFftLength};
+                fftTmp[i*2 + 1] = static_cast<float>((i == 0) ? fftBuffer2[sSampleLength].real()
+                    : fftBuffer2[i].imag()) / float{sFftLength};
             }
             pffft_zreorder(mFft.get(), fftTmp.data(), filter, PFFFT_BACKWARD);
             filter += sFftLength;
@@ -135,7 +120,7 @@ struct SplitFilter {
 };
 
 template<size_t N>
-const SplitFilter<N> gSplitFilter;
+const SegmentedFilter<N> gSegmentedFilter;
 
 template<size_t N>
 const PhaseShifterT<N> PShifter;
@@ -212,7 +197,7 @@ template<size_t N>
 void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
     const al::span<const float*const,3> InSamples, const size_t SamplesToDo)
 {
-    static constexpr auto &Filter = gSplitFilter<N>;
+    static constexpr auto &Filter = gSegmentedFilter<N>;
     static_assert(sFftLength == Filter.sFftLength);
     static_assert(sSegmentSize == Filter.sSampleLength);
     static_assert(sNumSegments == Filter.sNumSegments);
@@ -237,24 +222,16 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
     {
         const size_t todo{minz(sSegmentSize-mFifoPos, SamplesToDo-base)};
 
-        /* Transform the non-delayed input and store in the later half of the
+        /* Copy out the samples that were previously processed by the FFT. */
+        std::copy_n(mWXInOut.begin()+mFifoPos, todo, mD.begin()+base);
+
+        /* Transform the non-delayed input and store in the front half of the
          * filter input.
          */
-        std::transform(winput+base, winput+base+todo, xinput+base,
-            mWXIn.begin()+sSegmentSize + mFifoPos,
+        std::transform(winput+base, winput+base+todo, xinput+base, mWXInOut.begin()+mFifoPos,
             [](const float w, const float x) noexcept -> float
             { return -0.3420201f*w + 0.5098604f*x; });
 
-        /* Apply the first segment of the filter in the time domain. */
-        auto buf_iter = mD.begin() + base;
-        Filter.mPShift.process({buf_iter, todo}, mWXIn.data()+1 + mFifoPos);
-
-        /* Add the other segments of the filter that were previously processed
-         * by the FFT.
-         */
-        auto fifo_iter = mWXOut.begin() + mFifoPos;
-        std::transform(fifo_iter, fifo_iter+todo, buf_iter, buf_iter, std::plus<>{});
-
         mFifoPos += todo;
         base += todo;
 
@@ -262,20 +239,20 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
         if(mFifoPos < sSegmentSize) break;
         mFifoPos = 0;
 
-        /* Move the newest input to the front for the next iteration's history. */
-        std::copy(mWXIn.cbegin()+sSegmentSize, mWXIn.cend(), mWXIn.begin());
-        std::fill(mWXIn.begin()+sSegmentSize, mWXIn.end(), 0.0f);
+        /* Copy the new input to the next history segment, clearing the back
+         * half of the segment, and convert to the frequency domain.
+         */
+        float *input{mWXHistory.data() + curseg*sFftLength};
+        std::copy_n(mWXInOut.begin(), sSegmentSize, input);
+        std::fill_n(input+sSegmentSize, sSegmentSize, 0.0f);
 
-        /* Overwrite the stale/oldest FFT'd segment with the newest input. */
-        pffft_transform(Filter.mFft.get(), mWXIn.data(), mWXHistory.data() + curseg*sFftLength,
-            mWorkData.data(), PFFFT_FORWARD);
+        pffft_transform(Filter.mFft.get(), input, input, mWorkData.data(), PFFFT_FORWARD);
 
         /* Convolve each input segment with its IR filter counterpart (aligned
-         * in time).
+         * in time, from newest to oldest).
          */
         mFftBuffer.fill(0.0f);
         const float *filter{Filter.mFilterData.data()};
-        const float *input{mWXHistory.data() + curseg*sFftLength};
         for(size_t s{curseg};s < sNumSegments;++s)
         {
             pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
@@ -297,9 +274,9 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
             mWorkData.data(), PFFFT_BACKWARD);
 
         for(size_t i{0};i < sSegmentSize;++i)
-            mWXOut[i] = mFftBuffer[i] + mWXOut[sSegmentSize+i];
+            mWXInOut[i] = mFftBuffer[i] + mWXInOut[sSegmentSize+i];
         for(size_t i{0};i < sSegmentSize;++i)
-            mWXOut[sSegmentSize+i] = mFftBuffer[sSegmentSize+i];
+            mWXInOut[sSegmentSize+i] = mFftBuffer[sSegmentSize+i];
 
         /* Shift the input history. */
         curseg = curseg ? (curseg-1) : (sNumSegments-1);
@@ -324,7 +301,7 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
 
         float *inout{al::assume_aligned<16>(buffer)};
         auto inout_end = inout + SamplesToDo;
-        if(SamplesToDo >= sFilterDelay) LIKELY
+        if(SamplesToDo >= sFilterDelay)
         {
             auto delay_end = std::rotate(inout, inout_end - sFilterDelay, inout_end);
             std::swap_ranges(inout, delay_end, distbuf);
diff --git a/core/uhjfilter.h b/core/uhjfilter.h
index f886edbe..348dc7e1 100644
--- a/core/uhjfilter.h
+++ b/core/uhjfilter.h
@@ -51,10 +51,10 @@ struct UhjEncoderBase {
 
 template<size_t N>
 struct UhjEncoder final : public UhjEncoderBase {
-    static constexpr size_t sFilterDelay{N/2};
     static constexpr size_t sFftLength{256};
     static constexpr size_t sSegmentSize{sFftLength/2};
-    static constexpr size_t sNumSegments{N/sSegmentSize - 1};
+    static constexpr size_t sNumSegments{N/sSegmentSize};
+    static constexpr size_t sFilterDelay{N/2 + sSegmentSize};
 
     /* Delays and processing storage for the input signal. */
     alignas(16) std::array<float,BufferLineSize+sFilterDelay> mW{};
@@ -66,8 +66,7 @@ struct UhjEncoder final : public UhjEncoderBase {
 
     /* History and temp storage for the convolution filter. */
     size_t mFifoPos{}, mCurrentSegment{};
-    alignas(16) std::array<float,sFftLength> mWXIn{};
-    alignas(16) std::array<float,sFftLength> mWXOut{};
+    alignas(16) std::array<float,sFftLength> mWXInOut{};
     alignas(16) std::array<float,sFftLength> mFftBuffer{};
     alignas(16) std::array<float,sFftLength> mWorkData{};
     alignas(16) std::array<float,sFftLength*sNumSegments> mWXHistory{};