From 3ec03cadd2b5059e54e5e9b8f4d506b4c6ce727d Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Fri, 5 May 2023 06:46:00 -0700 Subject: Use deduction guides instead of helper functions for spans --- common/phase_shifter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'common/phase_shifter.h') diff --git a/common/phase_shifter.h b/common/phase_shifter.h index 0d4166bc..061e9176 100644 --- a/common/phase_shifter.h +++ b/common/phase_shifter.h @@ -53,12 +53,12 @@ struct PhaseShifterT { std::fill_n(fftBuffer.get(), fft_size, complex_d{}); fftBuffer[half_size] = 1.0; - forward_fft(al::as_span(fftBuffer.get(), fft_size)); + forward_fft(al::span{fftBuffer.get(), fft_size}); for(size_t i{0};i < half_size+1;++i) fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()}; for(size_t i{half_size+1};i < fft_size;++i) fftBuffer[i] = std::conj(fftBuffer[fft_size - i]); - inverse_fft(al::as_span(fftBuffer.get(), fft_size)); + inverse_fft(al::span{fftBuffer.get(), fft_size}); auto fftiter = fftBuffer.get() + half_size + (FilterSize/2 - 1); for(float &coeff : mCoeffs) -- cgit v1.2.3 From 23cc00ea16bdfbb06ae49cde0e05db6ec4a07100 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Tue, 3 Oct 2023 22:36:01 -0700 Subject: Clear the 0 and half-frequency bins for the phase shift filter This doesn't change the filter response, but is more correct since a real signal won't have an imaginary value on them (it can only have a magnitude with a phase of 0 or pi). --- common/phase_shifter.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'common/phase_shifter.h') diff --git a/common/phase_shifter.h b/common/phase_shifter.h index 061e9176..6b0ad512 100644 --- a/common/phase_shifter.h +++ b/common/phase_shifter.h @@ -9,6 +9,7 @@ #include #include +#include #include "alcomplex.h" #include "alspan.h" @@ -54,13 +55,15 @@ struct PhaseShifterT { fftBuffer[half_size] = 1.0; forward_fft(al::span{fftBuffer.get(), fft_size}); - for(size_t i{0};i < half_size+1;++i) + fftBuffer[0] *= std::numeric_limits::epsilon(); + for(size_t i{1};i < half_size;++i) fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()}; + fftBuffer[half_size] *= std::numeric_limits::epsilon(); for(size_t i{half_size+1};i < fft_size;++i) fftBuffer[i] = std::conj(fftBuffer[fft_size - i]); inverse_fft(al::span{fftBuffer.get(), fft_size}); - auto fftiter = fftBuffer.get() + half_size + (FilterSize/2 - 1); + auto fftiter = fftBuffer.get() + fft_size - 1; for(float &coeff : mCoeffs) { coeff = static_cast(fftiter->real() / double{fft_size}); -- cgit v1.2.3 From 1614fccd9fd893e104dcca2c92b83b2a7bfaa0c7 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Fri, 6 Oct 2023 18:56:47 -0700 Subject: Improve NEON shuffling --- common/phase_shifter.h | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) (limited to 'common/phase_shifter.h') diff --git a/common/phase_shifter.h b/common/phase_shifter.h index 6b0ad512..b9c889c2 100644 --- a/common/phase_shifter.h +++ b/common/phase_shifter.h @@ -75,25 +75,6 @@ struct PhaseShifterT { private: #if defined(HAVE_NEON) - /* There doesn't seem to be NEON intrinsics to do this kind of stipple - * shuffling, so there's two custom methods for it. - */ - static auto shuffle_2020(float32x4_t a, float32x4_t b) - { - float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))}; - ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3); - return ret; - } - static auto shuffle_3131(float32x4_t a, float32x4_t b) - { - float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))}; - ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3); - return ret; - } static auto unpacklo(float32x4_t a, float32x4_t b) { float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))}; @@ -174,9 +155,10 @@ inline void PhaseShifterT::process(al::span dst, const float *RESTRICT const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])}; const float32x4_t s0{vld1q_f32(&src[j*2])}; const float32x4_t s1{vld1q_f32(&src[j*2 + 4])}; + const float32x4x2_t values{vuzpq_f32(s0, s1)}; - r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs); - r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs); + r04 = vmlaq_f32(r04, values.val[0], coeffs); + r14 = vmlaq_f32(r14, values.val[1], coeffs); } src += 2; -- cgit v1.2.3 From d157cf4154dd7a078ac534cd6bbd0f762c23eaaf Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Sat, 14 Oct 2023 11:15:32 -0700 Subject: Use a split filter for the FIR-based UHJ encoders This applies the all-pass filter in two steps, first as a relatively short time-domain FIR filter, then as a series of frequency domain convolutions (using complex multiplies). Time-domain convolution scales poorly, so larger convolutions benefit from being done in the frequency domain (though the first part is still done in the time domain, to avoid longer delays). --- common/phase_shifter.h | 4 + core/uhjfilter.cpp | 196 +++++++++++++++++++++++++++++++++++++++++++++++-- core/uhjfilter.h | 17 +++-- 3 files changed, 204 insertions(+), 13 deletions(-) (limited to 'common/phase_shifter.h') diff --git a/common/phase_shifter.h b/common/phase_shifter.h index b9c889c2..e1a83dab 100644 --- a/common/phase_shifter.h +++ b/common/phase_shifter.h @@ -15,6 +15,8 @@ #include "alspan.h" +struct NoInit { }; + /* Implements a wide-band +90 degree phase-shift. Note that this should be * given one sample less of a delay (FilterSize/2 - 1) compared to the direct * signal delay (FilterSize/2) to properly align. @@ -71,6 +73,8 @@ struct PhaseShifterT { } } + PhaseShifterT(NoInit) { } + void process(al::span dst, const float *RESTRICT src) const; private: diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp index 4e4e99a5..ee5d4b86 100644 --- a/core/uhjfilter.cpp +++ b/core/uhjfilter.cpp @@ -9,6 +9,7 @@ #include "alcomplex.h" #include "alnumeric.h" #include "opthelpers.h" +#include "pffft.h" #include "phase_shifter.h" @@ -18,6 +19,120 @@ UhjQualityType UhjEncodeQuality{UhjQualityType::Default}; namespace { +struct PFFFTSetupDeleter { + void operator()(PFFFT_Setup *ptr) { pffft_destroy_setup(ptr); } +}; +using PFFFTSetupPtr = std::unique_ptr; + +/* Convolution is implemented using a segmented overlap-add method. The filter + * response is broken up into multiple segments of 128 samples, and each + * segment has an FFT applied with a 256-sample buffer (the latter half left + * silent) to get its frequency-domain response. + * + * Input samples are similarly broken up into 128-sample segments, with an FFT + * applied with a 256-sample buffer to each new incoming segment to get its + * frequency-domain response. A history of FFT'd input segments is maintained, + * equal to the number of filter response segments. + * + * To apply the convolution, each filter response segment is convolved with its + * paired input segment (using complex multiplies, far cheaper than time-domain + * FIRs), accumulating into an FFT buffer. The input history is then shifted to + * align with later filter response segments for the next input segment. + * + * An inverse FFT is then applied to the accumulated FFT buffer to get a 256- + * sample time-domain response for output, which is split in two halves. The + * first half is the 128-sample output, and the second half is a 128-sample + * (really, 127) delayed extension, which gets added to the output next time. + * Convolving two time-domain responses of length N results in a time-domain + * signal of length N*2 - 1, and this holds true regardless of the convolution + * being applied in the frequency domain, so these "overflow" samples need to + * be accounted for. + * + * To avoid a delay with gathering enough input samples to apply an FFT with, + * the first segment is applied directly in the time-domain as the samples come + * in. Once enough have been retrieved, the FFT is applied on the input and + * it's paired with the remaining (FFT'd) filter segments for processing. + */ +template +struct SplitFilter { + static constexpr size_t sFftLength{256}; + static constexpr size_t sSampleLength{sFftLength / 2}; + static constexpr size_t sNumSegments{N/sSampleLength - 1}; + static_assert(N >= sFftLength); + static_assert((N % sSampleLength) == 0); + + PhaseShifterT mPShift{NoInit{}}; + PFFFTSetupPtr mFft; + alignas(16) std::array mFilterData; + + SplitFilter() + { + mFft = PFFFTSetupPtr{pffft_new_setup(sFftLength, PFFFT_REAL)}; + + using complex_d = std::complex; + constexpr size_t fft_size{N}; + constexpr size_t half_size{fft_size / 2}; + + /* To set up the filter, we need to generate the desired response. + * Start with a pure delay that passes all frequencies through. + */ + auto fftBuffer = std::make_unique(fft_size); + std::fill_n(fftBuffer.get(), fft_size, complex_d{}); + fftBuffer[half_size] = 1.0; + + /* Convert to the frequency domain, shift the phase of each bin by +90 + * degrees, then convert back to the time domain. + */ + forward_fft(al::span{fftBuffer.get(), fft_size}); + fftBuffer[0] *= std::numeric_limits::epsilon(); + for(size_t i{1};i < half_size;++i) + fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()}; + fftBuffer[half_size] *= std::numeric_limits::epsilon(); + for(size_t i{half_size+1};i < fft_size;++i) + fftBuffer[i] = std::conj(fftBuffer[fft_size - i]); + inverse_fft(al::span{fftBuffer.get(), fft_size}); + + /* Store the first segment of the buffer to apply as a time-domain + * filter (backwards for more efficient processing). + */ + auto fftiter = fftBuffer.get() + sSampleLength - 1; + for(float &coeff : mPShift.mCoeffs) + { + coeff = static_cast(fftiter->real() / double{fft_size}); + fftiter -= 2; + } + + /* The remaining segments of the filter are converted back to the + * frequency domain, each on their own (0 stuffed). + */ + auto fftTmp = std::make_unique(sFftLength); + float *filter{mFilterData.data()}; + for(size_t s{0};s < sNumSegments;++s) + { + for(size_t i{0};i < sSampleLength;++i) + fftBuffer[i] = fftBuffer[sSampleLength*(s+1) + i].real() / double{fft_size}; + std::fill_n(fftBuffer.get()+sSampleLength, sSampleLength, complex_d{}); + forward_fft(al::span{fftBuffer.get(), sFftLength}); + + /* Convert to zdomain data for PFFFT, scaled by the FFT length so + * the iFFT result will be normalized. + */ + for(size_t i{0};i < sSampleLength;++i) + { + fftTmp[i*2 + 0] = static_cast(fftBuffer[i].real()) / float{sFftLength}; + fftTmp[i*2 + 1] = static_cast((i == 0) ? fftBuffer[sSampleLength].real() + : fftBuffer[i].imag()) / float{sFftLength}; + } + pffft_zreorder(mFft.get(), fftTmp.get(), filter, PFFFT_BACKWARD); + filter += sFftLength; + } + } +}; + +template +const SplitFilter gSplitFilter; + + const PhaseShifterT PShiftLq{}; const PhaseShifterT PShiftHq{}; @@ -87,7 +202,9 @@ template void UhjEncoder::encode(float *LeftOut, float *RightOut, const al::span InSamples, const size_t SamplesToDo) { - const auto &PShift = GetPhaseShifter::Get(); + static constexpr auto &Filter = gSplitFilter; + static_assert(sFftLength == Filter.sFftLength); + static_assert(sNumSegments == Filter.sNumSegments); ASSUME(SamplesToDo > 0); @@ -104,10 +221,78 @@ void UhjEncoder::encode(float *LeftOut, float *RightOut, mS[i] = 0.9396926f*mW[i] + 0.1855740f*mX[i]; /* Precompute j(-0.3420201*W + 0.5098604*X) and store in mD. */ - std::transform(winput, winput+SamplesToDo, xinput, mWX.begin() + sWXInOffset, - [](const float w, const float x) noexcept -> float - { return -0.3420201f*w + 0.5098604f*x; }); - PShift.process({mD.data(), SamplesToDo}, mWX.data()); + for(size_t base{0};base < SamplesToDo;) + { + const size_t todo{minz(Filter.sSampleLength-mFifoPos, SamplesToDo-base)}; + + /* Transform the non-delayed input and store in the later half of the + * filter input. + */ + std::transform(winput+base, winput+base+todo, xinput+base, + mWXIn.begin()+Filter.sSampleLength + mFifoPos, + [](const float w, const float x) noexcept -> float + { return -0.3420201f*w + 0.5098604f*x; }); + + /* Apply the first segment of the filter in the time domain. */ + auto buf_iter = mD.begin() + base; + Filter.mPShift.process({buf_iter, todo}, mWXIn.begin()+1 + mFifoPos); + + /* Add the other segments of the filter that were previously processed + * by the FFT. + */ + auto fifo_iter = mWXOut.begin() + mFifoPos; + std::transform(fifo_iter, fifo_iter+todo, buf_iter, buf_iter, std::plus<>{}); + + mFifoPos += todo; + base += todo; + + /* Check whether the input buffer is filled with new samples. */ + if(mFifoPos < Filter.sSampleLength) break; + mFifoPos = 0; + + /* Move the newest input to the front for the next iteration's history. */ + std::copy(mWXIn.cbegin()+Filter.sSampleLength, mWXIn.cend(), mWXIn.begin()); + std::fill(mWXIn.begin()+Filter.sSampleLength, mWXIn.end(), 0.0f); + + /* Overwrite the stale/oldest FFT'd segment with the newest input. */ + const size_t curseg{mCurrentSegment}; + pffft_transform(Filter.mFft.get(), mWXIn.data(), + mWXHistory.data() + curseg*Filter.sFftLength, mWorkData.data(), PFFFT_FORWARD); + + /* Convolve each input segment with its IR filter counterpart (aligned + * in time). + */ + mFftBuffer.fill(0.0f); + const float *filter{Filter.mFilterData.data()}; + const float *input{mWXHistory.data() + curseg*Filter.sFftLength}; + for(size_t s{curseg};s < sNumSegments;++s) + { + pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data()); + input += Filter.sFftLength; + filter += Filter.sFftLength; + } + input = mWXHistory.data(); + for(size_t s{0};s < curseg;++s) + { + pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data()); + input += Filter.sFftLength; + filter += Filter.sFftLength; + } + + /* Convert back to samples, writing to the output and storing the extra + * for next time. + */ + pffft_transform(Filter.mFft.get(), mFftBuffer.data(), mFftBuffer.data(), + mWorkData.data(), PFFFT_BACKWARD); + + for(size_t i{0};i < Filter.sSampleLength;++i) + mWXOut[i] = mFftBuffer[i] + mWXOut[Filter.sSampleLength+i]; + for(size_t i{0};i < Filter.sSampleLength;++i) + mWXOut[Filter.sSampleLength+i] = mFftBuffer[Filter.sSampleLength+i]; + + /* Shift the input history. */ + mCurrentSegment = curseg ? (curseg-1) : (sNumSegments-1); + } /* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */ for(size_t i{0};i < SamplesToDo;++i) @@ -117,7 +302,6 @@ void UhjEncoder::encode(float *LeftOut, float *RightOut, std::copy(mW.cbegin()+SamplesToDo, mW.cbegin()+SamplesToDo+sFilterDelay, mW.begin()); std::copy(mX.cbegin()+SamplesToDo, mX.cbegin()+SamplesToDo+sFilterDelay, mX.begin()); std::copy(mY.cbegin()+SamplesToDo, mY.cbegin()+SamplesToDo+sFilterDelay, mY.begin()); - std::copy(mWX.cbegin()+SamplesToDo, mWX.cbegin()+SamplesToDo+sWXInOffset, mWX.begin()); /* Apply a delay to the existing output to align with the input delay. */ auto *delayBuffer = mDirectDelay.data(); diff --git a/core/uhjfilter.h b/core/uhjfilter.h index df308094..b8b1e96c 100644 --- a/core/uhjfilter.h +++ b/core/uhjfilter.h @@ -51,6 +51,9 @@ struct UhjEncoderBase { template struct UhjEncoder final : public UhjEncoderBase { static constexpr size_t sFilterDelay{N/2}; + static constexpr size_t sFftLength{256}; + static constexpr size_t sSegmentSize{sFftLength/2}; + static constexpr size_t sNumSegments{N/sSegmentSize - 1}; /* Delays and processing storage for the input signal. */ alignas(16) std::array mW{}; @@ -60,11 +63,13 @@ struct UhjEncoder final : public UhjEncoderBase { alignas(16) std::array mS{}; alignas(16) std::array mD{}; - /* History and temp storage for the FIR filter. New samples should be - * written to index sFilterDelay*2 - 1. - */ - static constexpr size_t sWXInOffset{sFilterDelay*2 - 1}; - alignas(16) std::array mWX{}; + /* History and temp storage for the convolution filter. */ + size_t mFifoPos{}, mCurrentSegment{}; + alignas(16) std::array mWXIn{}; + alignas(16) std::array mWXOut{}; + alignas(16) std::array mFftBuffer{}; + alignas(16) std::array mWorkData{}; + alignas(16) std::array mWXHistory{}; alignas(16) std::array,2> mDirectDelay{}; @@ -77,8 +82,6 @@ struct UhjEncoder final : public UhjEncoderBase { */ void encode(float *LeftOut, float *RightOut, const al::span InSamples, const size_t SamplesToDo) override; - - DEF_NEWDEL(UhjEncoder) }; struct UhjEncoderIIR final : public UhjEncoderBase { -- cgit v1.2.3