aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Robinson <[email protected]>2023-10-14 11:15:32 -0700
committerChris Robinson <[email protected]>2023-10-14 11:44:32 -0700
commitd157cf4154dd7a078ac534cd6bbd0f762c23eaaf (patch)
tree2be748440b9663fe4565c0de740e14ffe893e731
parentd3d23579add16fbe3c21f2b436457f398f0f5254 (diff)
Use a split filter for the FIR-based UHJ encoders
This applies the all-pass filter in two steps, first as a relatively short time-domain FIR filter, then as a series of frequency domain convolutions (using complex multiplies). Time-domain convolution scales poorly, so larger convolutions benefit from being done in the frequency domain (though the first part is still done in the time domain, to avoid longer delays).
-rw-r--r--common/phase_shifter.h4
-rw-r--r--core/uhjfilter.cpp196
-rw-r--r--core/uhjfilter.h17
3 files changed, 204 insertions, 13 deletions
diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index b9c889c2..e1a83dab 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -15,6 +15,8 @@
#include "alspan.h"
+struct NoInit { };
+
/* Implements a wide-band +90 degree phase-shift. Note that this should be
* given one sample less of a delay (FilterSize/2 - 1) compared to the direct
* signal delay (FilterSize/2) to properly align.
@@ -71,6 +73,8 @@ struct PhaseShifterT {
}
}
+ PhaseShifterT(NoInit) { }
+
void process(al::span<float> dst, const float *RESTRICT src) const;
private:
diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp
index 4e4e99a5..ee5d4b86 100644
--- a/core/uhjfilter.cpp
+++ b/core/uhjfilter.cpp
@@ -9,6 +9,7 @@
#include "alcomplex.h"
#include "alnumeric.h"
#include "opthelpers.h"
+#include "pffft.h"
#include "phase_shifter.h"
@@ -18,6 +19,120 @@ UhjQualityType UhjEncodeQuality{UhjQualityType::Default};
namespace {
+struct PFFFTSetupDeleter {
+ void operator()(PFFFT_Setup *ptr) { pffft_destroy_setup(ptr); }
+};
+using PFFFTSetupPtr = std::unique_ptr<PFFFT_Setup,PFFFTSetupDeleter>;
+
+/* Convolution is implemented using a segmented overlap-add method. The filter
+ * response is broken up into multiple segments of 128 samples, and each
+ * segment has an FFT applied with a 256-sample buffer (the latter half left
+ * silent) to get its frequency-domain response.
+ *
+ * Input samples are similarly broken up into 128-sample segments, with an FFT
+ * applied with a 256-sample buffer to each new incoming segment to get its
+ * frequency-domain response. A history of FFT'd input segments is maintained,
+ * equal to the number of filter response segments.
+ *
+ * To apply the convolution, each filter response segment is convolved with its
+ * paired input segment (using complex multiplies, far cheaper than time-domain
+ * FIRs), accumulating into an FFT buffer. The input history is then shifted to
+ * align with later filter response segments for the next input segment.
+ *
+ * An inverse FFT is then applied to the accumulated FFT buffer to get a 256-
+ * sample time-domain response for output, which is split in two halves. The
+ * first half is the 128-sample output, and the second half is a 128-sample
+ * (really, 127) delayed extension, which gets added to the output next time.
+ * Convolving two time-domain responses of length N results in a time-domain
+ * signal of length N*2 - 1, and this holds true regardless of the convolution
+ * being applied in the frequency domain, so these "overflow" samples need to
+ * be accounted for.
+ *
+ * To avoid a delay with gathering enough input samples to apply an FFT with,
+ * the first segment is applied directly in the time-domain as the samples come
+ * in. Once enough have been retrieved, the FFT is applied on the input and
+ * it's paired with the remaining (FFT'd) filter segments for processing.
+ */
+template<size_t N>
+struct SplitFilter {
+ static constexpr size_t sFftLength{256};
+ static constexpr size_t sSampleLength{sFftLength / 2};
+ static constexpr size_t sNumSegments{N/sSampleLength - 1};
+ static_assert(N >= sFftLength);
+ static_assert((N % sSampleLength) == 0);
+
+ PhaseShifterT<sSampleLength> mPShift{NoInit{}};
+ PFFFTSetupPtr mFft;
+ alignas(16) std::array<float,sFftLength*sNumSegments> mFilterData;
+
+ SplitFilter()
+ {
+ mFft = PFFFTSetupPtr{pffft_new_setup(sFftLength, PFFFT_REAL)};
+
+ using complex_d = std::complex<double>;
+ constexpr size_t fft_size{N};
+ constexpr size_t half_size{fft_size / 2};
+
+ /* To set up the filter, we need to generate the desired response.
+ * Start with a pure delay that passes all frequencies through.
+ */
+ auto fftBuffer = std::make_unique<complex_d[]>(fft_size);
+ std::fill_n(fftBuffer.get(), fft_size, complex_d{});
+ fftBuffer[half_size] = 1.0;
+
+ /* Convert to the frequency domain, shift the phase of each bin by +90
+ * degrees, then convert back to the time domain.
+ */
+ forward_fft(al::span{fftBuffer.get(), fft_size});
+ fftBuffer[0] *= std::numeric_limits<double>::epsilon();
+ for(size_t i{1};i < half_size;++i)
+ fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()};
+ fftBuffer[half_size] *= std::numeric_limits<double>::epsilon();
+ for(size_t i{half_size+1};i < fft_size;++i)
+ fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
+ inverse_fft(al::span{fftBuffer.get(), fft_size});
+
+ /* Store the first segment of the buffer to apply as a time-domain
+ * filter (backwards for more efficient processing).
+ */
+ auto fftiter = fftBuffer.get() + sSampleLength - 1;
+ for(float &coeff : mPShift.mCoeffs)
+ {
+ coeff = static_cast<float>(fftiter->real() / double{fft_size});
+ fftiter -= 2;
+ }
+
+ /* The remaining segments of the filter are converted back to the
+ * frequency domain, each on their own (0 stuffed).
+ */
+ auto fftTmp = std::make_unique<float[]>(sFftLength);
+ float *filter{mFilterData.data()};
+ for(size_t s{0};s < sNumSegments;++s)
+ {
+ for(size_t i{0};i < sSampleLength;++i)
+ fftBuffer[i] = fftBuffer[sSampleLength*(s+1) + i].real() / double{fft_size};
+ std::fill_n(fftBuffer.get()+sSampleLength, sSampleLength, complex_d{});
+ forward_fft(al::span{fftBuffer.get(), sFftLength});
+
+ /* Convert to zdomain data for PFFFT, scaled by the FFT length so
+ * the iFFT result will be normalized.
+ */
+ for(size_t i{0};i < sSampleLength;++i)
+ {
+ fftTmp[i*2 + 0] = static_cast<float>(fftBuffer[i].real()) / float{sFftLength};
+ fftTmp[i*2 + 1] = static_cast<float>((i == 0) ? fftBuffer[sSampleLength].real()
+ : fftBuffer[i].imag()) / float{sFftLength};
+ }
+ pffft_zreorder(mFft.get(), fftTmp.get(), filter, PFFFT_BACKWARD);
+ filter += sFftLength;
+ }
+ }
+};
+
+template<size_t N>
+const SplitFilter<N> gSplitFilter;
+
+
const PhaseShifterT<UhjLength256> PShiftLq{};
const PhaseShifterT<UhjLength512> PShiftHq{};
@@ -87,7 +202,9 @@ template<size_t N>
void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
const al::span<const float*const,3> InSamples, const size_t SamplesToDo)
{
- const auto &PShift = GetPhaseShifter<N>::Get();
+ static constexpr auto &Filter = gSplitFilter<N>;
+ static_assert(sFftLength == Filter.sFftLength);
+ static_assert(sNumSegments == Filter.sNumSegments);
ASSUME(SamplesToDo > 0);
@@ -104,10 +221,78 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
mS[i] = 0.9396926f*mW[i] + 0.1855740f*mX[i];
/* Precompute j(-0.3420201*W + 0.5098604*X) and store in mD. */
- std::transform(winput, winput+SamplesToDo, xinput, mWX.begin() + sWXInOffset,
- [](const float w, const float x) noexcept -> float
- { return -0.3420201f*w + 0.5098604f*x; });
- PShift.process({mD.data(), SamplesToDo}, mWX.data());
+ for(size_t base{0};base < SamplesToDo;)
+ {
+ const size_t todo{minz(Filter.sSampleLength-mFifoPos, SamplesToDo-base)};
+
+ /* Transform the non-delayed input and store in the later half of the
+ * filter input.
+ */
+ std::transform(winput+base, winput+base+todo, xinput+base,
+ mWXIn.begin()+Filter.sSampleLength + mFifoPos,
+ [](const float w, const float x) noexcept -> float
+ { return -0.3420201f*w + 0.5098604f*x; });
+
+ /* Apply the first segment of the filter in the time domain. */
+ auto buf_iter = mD.begin() + base;
+ Filter.mPShift.process({buf_iter, todo}, mWXIn.begin()+1 + mFifoPos);
+
+ /* Add the other segments of the filter that were previously processed
+ * by the FFT.
+ */
+ auto fifo_iter = mWXOut.begin() + mFifoPos;
+ std::transform(fifo_iter, fifo_iter+todo, buf_iter, buf_iter, std::plus<>{});
+
+ mFifoPos += todo;
+ base += todo;
+
+ /* Check whether the input buffer is filled with new samples. */
+ if(mFifoPos < Filter.sSampleLength) break;
+ mFifoPos = 0;
+
+ /* Move the newest input to the front for the next iteration's history. */
+ std::copy(mWXIn.cbegin()+Filter.sSampleLength, mWXIn.cend(), mWXIn.begin());
+ std::fill(mWXIn.begin()+Filter.sSampleLength, mWXIn.end(), 0.0f);
+
+ /* Overwrite the stale/oldest FFT'd segment with the newest input. */
+ const size_t curseg{mCurrentSegment};
+ pffft_transform(Filter.mFft.get(), mWXIn.data(),
+ mWXHistory.data() + curseg*Filter.sFftLength, mWorkData.data(), PFFFT_FORWARD);
+
+ /* Convolve each input segment with its IR filter counterpart (aligned
+ * in time).
+ */
+ mFftBuffer.fill(0.0f);
+ const float *filter{Filter.mFilterData.data()};
+ const float *input{mWXHistory.data() + curseg*Filter.sFftLength};
+ for(size_t s{curseg};s < sNumSegments;++s)
+ {
+ pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
+ input += Filter.sFftLength;
+ filter += Filter.sFftLength;
+ }
+ input = mWXHistory.data();
+ for(size_t s{0};s < curseg;++s)
+ {
+ pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
+ input += Filter.sFftLength;
+ filter += Filter.sFftLength;
+ }
+
+ /* Convert back to samples, writing to the output and storing the extra
+ * for next time.
+ */
+ pffft_transform(Filter.mFft.get(), mFftBuffer.data(), mFftBuffer.data(),
+ mWorkData.data(), PFFFT_BACKWARD);
+
+ for(size_t i{0};i < Filter.sSampleLength;++i)
+ mWXOut[i] = mFftBuffer[i] + mWXOut[Filter.sSampleLength+i];
+ for(size_t i{0};i < Filter.sSampleLength;++i)
+ mWXOut[Filter.sSampleLength+i] = mFftBuffer[Filter.sSampleLength+i];
+
+ /* Shift the input history. */
+ mCurrentSegment = curseg ? (curseg-1) : (sNumSegments-1);
+ }
/* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
for(size_t i{0};i < SamplesToDo;++i)
@@ -117,7 +302,6 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
std::copy(mW.cbegin()+SamplesToDo, mW.cbegin()+SamplesToDo+sFilterDelay, mW.begin());
std::copy(mX.cbegin()+SamplesToDo, mX.cbegin()+SamplesToDo+sFilterDelay, mX.begin());
std::copy(mY.cbegin()+SamplesToDo, mY.cbegin()+SamplesToDo+sFilterDelay, mY.begin());
- std::copy(mWX.cbegin()+SamplesToDo, mWX.cbegin()+SamplesToDo+sWXInOffset, mWX.begin());
/* Apply a delay to the existing output to align with the input delay. */
auto *delayBuffer = mDirectDelay.data();
diff --git a/core/uhjfilter.h b/core/uhjfilter.h
index df308094..b8b1e96c 100644
--- a/core/uhjfilter.h
+++ b/core/uhjfilter.h
@@ -51,6 +51,9 @@ struct UhjEncoderBase {
template<size_t N>
struct UhjEncoder final : public UhjEncoderBase {
static constexpr size_t sFilterDelay{N/2};
+ static constexpr size_t sFftLength{256};
+ static constexpr size_t sSegmentSize{sFftLength/2};
+ static constexpr size_t sNumSegments{N/sSegmentSize - 1};
/* Delays and processing storage for the input signal. */
alignas(16) std::array<float,BufferLineSize+sFilterDelay> mW{};
@@ -60,11 +63,13 @@ struct UhjEncoder final : public UhjEncoderBase {
alignas(16) std::array<float,BufferLineSize> mS{};
alignas(16) std::array<float,BufferLineSize> mD{};
- /* History and temp storage for the FIR filter. New samples should be
- * written to index sFilterDelay*2 - 1.
- */
- static constexpr size_t sWXInOffset{sFilterDelay*2 - 1};
- alignas(16) std::array<float,BufferLineSize + sFilterDelay*2> mWX{};
+ /* History and temp storage for the convolution filter. */
+ size_t mFifoPos{}, mCurrentSegment{};
+ alignas(16) std::array<float,sFftLength> mWXIn{};
+ alignas(16) std::array<float,sFftLength> mWXOut{};
+ alignas(16) std::array<float,sFftLength> mFftBuffer{};
+ alignas(16) std::array<float,sFftLength> mWorkData{};
+ alignas(16) std::array<float,sFftLength*sNumSegments> mWXHistory{};
alignas(16) std::array<std::array<float,sFilterDelay>,2> mDirectDelay{};
@@ -77,8 +82,6 @@ struct UhjEncoder final : public UhjEncoderBase {
*/
void encode(float *LeftOut, float *RightOut, const al::span<const float*const,3> InSamples,
const size_t SamplesToDo) override;
-
- DEF_NEWDEL(UhjEncoder)
};
struct UhjEncoderIIR final : public UhjEncoderBase {