diff options
Diffstat (limited to 'alc/effects')
-rw-r--r-- | alc/effects/chorus.cpp | 16 | ||||
-rw-r--r-- | alc/effects/convolution.cpp | 388 | ||||
-rw-r--r-- | alc/effects/dedicated.cpp | 8 | ||||
-rw-r--r-- | alc/effects/distortion.cpp | 4 | ||||
-rw-r--r-- | alc/effects/echo.cpp | 10 | ||||
-rw-r--r-- | alc/effects/fshifter.cpp | 12 | ||||
-rw-r--r-- | alc/effects/modulator.cpp | 150 | ||||
-rw-r--r-- | alc/effects/pshifter.cpp | 53 | ||||
-rw-r--r-- | alc/effects/reverb.cpp | 163 |
9 files changed, 501 insertions, 303 deletions
diff --git a/alc/effects/chorus.cpp b/alc/effects/chorus.cpp index 10ccf9f6..9cbc922f 100644 --- a/alc/effects/chorus.cpp +++ b/alc/effects/chorus.cpp @@ -25,6 +25,7 @@ #include <climits> #include <cstdlib> #include <iterator> +#include <vector> #include "alc/effects/base.h" #include "almalloc.h" @@ -41,7 +42,6 @@ #include "core/resampler_limits.h" #include "intrusive_ptr.h" #include "opthelpers.h" -#include "vector.h" namespace { @@ -49,7 +49,7 @@ namespace { using uint = unsigned int; struct ChorusState final : public EffectState { - al::vector<float,16> mDelayBuffer; + std::vector<float> mDelayBuffer; uint mOffset{0}; uint mLfoOffset{0}; @@ -125,16 +125,16 @@ void ChorusState::update(const ContextBase *Context, const EffectSlot *Slot, /* Gains for left and right sides */ static constexpr auto inv_sqrt2 = static_cast<float>(1.0 / al::numbers::sqrt2); - static constexpr auto lcoeffs_pw = CalcDirectionCoeffs({-1.0f, 0.0f, 0.0f}); - static constexpr auto rcoeffs_pw = CalcDirectionCoeffs({ 1.0f, 0.0f, 0.0f}); - static constexpr auto lcoeffs_nrml = CalcDirectionCoeffs({-inv_sqrt2, 0.0f, inv_sqrt2}); - static constexpr auto rcoeffs_nrml = CalcDirectionCoeffs({ inv_sqrt2, 0.0f, inv_sqrt2}); + static constexpr auto lcoeffs_pw = CalcDirectionCoeffs(std::array{-1.0f, 0.0f, 0.0f}); + static constexpr auto rcoeffs_pw = CalcDirectionCoeffs(std::array{ 1.0f, 0.0f, 0.0f}); + static constexpr auto lcoeffs_nrml = CalcDirectionCoeffs(std::array{-inv_sqrt2, 0.0f, inv_sqrt2}); + static constexpr auto rcoeffs_nrml = CalcDirectionCoeffs(std::array{ inv_sqrt2, 0.0f, inv_sqrt2}); auto &lcoeffs = (device->mRenderMode != RenderMode::Pairwise) ? lcoeffs_nrml : lcoeffs_pw; auto &rcoeffs = (device->mRenderMode != RenderMode::Pairwise) ? rcoeffs_nrml : rcoeffs_pw; mOutTarget = target.Main->Buffer; - ComputePanGains(target.Main, lcoeffs.data(), Slot->Gain, mGains[0].Target); - ComputePanGains(target.Main, rcoeffs.data(), Slot->Gain, mGains[1].Target); + ComputePanGains(target.Main, lcoeffs, Slot->Gain, mGains[0].Target); + ComputePanGains(target.Main, rcoeffs, Slot->Gain, mGains[1].Target); float rate{props->Chorus.Rate}; if(!(rate > 0.0f)) diff --git a/alc/effects/convolution.cpp b/alc/effects/convolution.cpp index 7f36c415..517e6b08 100644 --- a/alc/effects/convolution.cpp +++ b/alc/effects/convolution.cpp @@ -17,7 +17,6 @@ #include <arm_neon.h> #endif -#include "albyte.h" #include "alcomplex.h" #include "almalloc.h" #include "alnumbers.h" @@ -35,44 +34,49 @@ #include "core/fmt_traits.h" #include "core/mixer.h" #include "intrusive_ptr.h" +#include "pffft.h" #include "polyphase_resampler.h" #include "vector.h" namespace { -/* Convolution reverb is implemented using a segmented overlap-add method. The - * impulse response is broken up into multiple segments of 128 samples, and - * each segment has an FFT applied with a 256-sample buffer (the latter half - * left silent) to get its frequency-domain response. The resulting response - * has its positive/non-mirrored frequencies saved (129 bins) in each segment. +/* Convolution is implemented using a segmented overlap-add method. The impulse + * response is split into multiple segments of 128 samples, and each segment + * has an FFT applied with a 256-sample buffer (the latter half left silent) to + * get its frequency-domain response. The resulting response has its positive/ + * non-mirrored frequencies saved (129 bins) in each segment. Note that since + * the 0- and half-frequency bins are real for a real signal, their imaginary + * components are always 0 and can be dropped, allowing their real components + * to be combined so only 128 complex values are stored for the 129 bins. * - * Input samples are similarly broken up into 128-sample segments, with an FFT - * applied to each new incoming segment to get its 129 bins. A history of FFT'd - * input segments is maintained, equal to the length of the impulse response. + * Input samples are similarly broken up into 128-sample segments, with a 256- + * sample FFT applied to each new incoming segment to get its 129 bins. A + * history of FFT'd input segments is maintained, equal to the number of + * impulse response segments. * - * To apply the reverberation, each impulse response segment is convolved with + * To apply the convolution, each impulse response segment is convolved with * its paired input segment (using complex multiplies, far cheaper than FIRs), - * accumulating into a 256-bin FFT buffer. The input history is then shifted to - * align with later impulse response segments for next time. + * accumulating into a 129-bin FFT buffer. The input history is then shifted to + * align with later impulse response segments for the next input segment. * * An inverse FFT is then applied to the accumulated FFT buffer to get a 256- * sample time-domain response for output, which is split in two halves. The * first half is the 128-sample output, and the second half is a 128-sample * (really, 127) delayed extension, which gets added to the output next time. - * Convolving two time-domain responses of lengths N and M results in a time- - * domain signal of length N+M-1, and this holds true regardless of the - * convolution being applied in the frequency domain, so these "overflow" - * samples need to be accounted for. + * Convolving two time-domain responses of length N results in a time-domain + * signal of length N*2 - 1, and this holds true regardless of the convolution + * being applied in the frequency domain, so these "overflow" samples need to + * be accounted for. * - * To avoid a delay with gathering enough input samples to apply an FFT with, - * the first segment is applied directly in the time-domain as the samples come - * in. Once enough have been retrieved, the FFT is applied on the input and - * it's paired with the remaining (FFT'd) filter segments for processing. + * To avoid a delay with gathering enough input samples for the FFT, the first + * segment is applied directly in the time-domain as the samples come in. Once + * enough have been retrieved, the FFT is applied on the input and it's paired + * with the remaining (FFT'd) filter segments for processing. */ -void LoadSamples(float *RESTRICT dst, const al::byte *src, const size_t srcstep, FmtType srctype, +void LoadSamples(float *RESTRICT dst, const std::byte *src, const size_t srcstep, FmtType srctype, const size_t samples) noexcept { #define HANDLE_FMT(T) case T: al::LoadSampleArray<T>(dst, src, srcstep, samples); break @@ -80,6 +84,7 @@ void LoadSamples(float *RESTRICT dst, const al::byte *src, const size_t srcstep, { HANDLE_FMT(FmtUByte); HANDLE_FMT(FmtShort); + HANDLE_FMT(FmtInt); HANDLE_FMT(FmtFloat); HANDLE_FMT(FmtDouble); HANDLE_FMT(FmtMulaw); @@ -94,40 +99,43 @@ void LoadSamples(float *RESTRICT dst, const al::byte *src, const size_t srcstep, } -inline auto& GetAmbiScales(AmbiScaling scaletype) noexcept +constexpr auto GetAmbiScales(AmbiScaling scaletype) noexcept { switch(scaletype) { - case AmbiScaling::FuMa: return AmbiScale::FromFuMa(); - case AmbiScaling::SN3D: return AmbiScale::FromSN3D(); - case AmbiScaling::UHJ: return AmbiScale::FromUHJ(); + case AmbiScaling::FuMa: return al::span{AmbiScale::FromFuMa}; + case AmbiScaling::SN3D: return al::span{AmbiScale::FromSN3D}; + case AmbiScaling::UHJ: return al::span{AmbiScale::FromUHJ}; case AmbiScaling::N3D: break; } - return AmbiScale::FromN3D(); + return al::span{AmbiScale::FromN3D}; } -inline auto& GetAmbiLayout(AmbiLayout layouttype) noexcept +constexpr auto GetAmbiLayout(AmbiLayout layouttype) noexcept { - if(layouttype == AmbiLayout::FuMa) return AmbiIndex::FromFuMa(); - return AmbiIndex::FromACN(); + if(layouttype == AmbiLayout::FuMa) return al::span{AmbiIndex::FromFuMa}; + return al::span{AmbiIndex::FromACN}; } -inline auto& GetAmbi2DLayout(AmbiLayout layouttype) noexcept +constexpr auto GetAmbi2DLayout(AmbiLayout layouttype) noexcept { - if(layouttype == AmbiLayout::FuMa) return AmbiIndex::FromFuMa2D(); - return AmbiIndex::FromACN2D(); + if(layouttype == AmbiLayout::FuMa) return al::span{AmbiIndex::FromFuMa2D}; + return al::span{AmbiIndex::FromACN2D}; } -struct ChanMap { +constexpr float sin30{0.5f}; +constexpr float cos30{0.866025403785f}; +constexpr float sin45{al::numbers::sqrt2_v<float>*0.5f}; +constexpr float cos45{al::numbers::sqrt2_v<float>*0.5f}; +constexpr float sin110{ 0.939692620786f}; +constexpr float cos110{-0.342020143326f}; + +struct ChanPosMap { Channel channel; - float angle; - float elevation; + std::array<float,3> pos; }; -constexpr float Deg2Rad(float x) noexcept -{ return static_cast<float>(al::numbers::pi / 180.0 * x); } - using complex_f = std::complex<float>; @@ -181,6 +189,13 @@ void apply_fir(al::span<float> dst, const float *RESTRICT src, const float *REST #endif } + +struct PFFFTSetupDeleter { + void operator()(PFFFT_Setup *ptr) { pffft_destroy_setup(ptr); } +}; +using PFFFTSetupPtr = std::unique_ptr<PFFFT_Setup,PFFFTSetupDeleter>; + + struct ConvolutionState final : public EffectState { FmtChannels mChannels{}; AmbiLayout mAmbiLayout{}; @@ -188,11 +203,13 @@ struct ConvolutionState final : public EffectState { uint mAmbiOrder{}; size_t mFifoPos{0}; - std::array<float,ConvolveUpdateSamples*2> mInput{}; + alignas(16) std::array<float,ConvolveUpdateSamples*2> mInput{}; al::vector<std::array<float,ConvolveUpdateSamples>,16> mFilter; al::vector<std::array<float,ConvolveUpdateSamples*2>,16> mOutput; - alignas(16) std::array<complex_f,ConvolveUpdateSize> mFftBuffer{}; + PFFFTSetupPtr mFft{}; + alignas(16) std::array<float,ConvolveUpdateSize> mFftBuffer{}; + alignas(16) std::array<float,ConvolveUpdateSize> mFftWorkBuffer{}; size_t mCurrentSegment{0}; size_t mNumConvolveSegs{0}; @@ -204,9 +221,8 @@ struct ConvolutionState final : public EffectState { float Current[MAX_OUTPUT_CHANNELS]{}; float Target[MAX_OUTPUT_CHANNELS]{}; }; - using ChannelDataArray = al::FlexArray<ChannelData>; - std::unique_ptr<ChannelDataArray> mChans; - std::unique_ptr<complex_f[]> mComplexData; + std::vector<ChannelData> mChans; + al::vector<float,16> mComplexData; ConvolutionState() = default; @@ -229,7 +245,7 @@ struct ConvolutionState final : public EffectState { void ConvolutionState::NormalMix(const al::span<FloatBufferLine> samplesOut, const size_t samplesToDo) { - for(auto &chan : *mChans) + for(auto &chan : mChans) MixSamples({chan.mBuffer.data(), samplesToDo}, samplesOut, chan.Current, chan.Target, samplesToDo, 0); } @@ -237,7 +253,7 @@ void ConvolutionState::NormalMix(const al::span<FloatBufferLine> samplesOut, void ConvolutionState::UpsampleMix(const al::span<FloatBufferLine> samplesOut, const size_t samplesToDo) { - for(auto &chan : *mChans) + for(auto &chan : mChans) { const al::span<float> src{chan.mBuffer.data(), samplesToDo}; chan.mFilter.processScale(src, chan.mHfScale, chan.mLfScale); @@ -251,19 +267,23 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag using UhjDecoderType = UhjDecoder<512>; static constexpr auto DecoderPadding = UhjDecoderType::sInputPadding; - constexpr uint MaxConvolveAmbiOrder{1u}; + static constexpr uint MaxConvolveAmbiOrder{1u}; + + if(!mFft) + mFft = PFFFTSetupPtr{pffft_new_setup(ConvolveUpdateSize, PFFFT_REAL)}; mFifoPos = 0; mInput.fill(0.0f); decltype(mFilter){}.swap(mFilter); decltype(mOutput){}.swap(mOutput); - mFftBuffer.fill(complex_f{}); + mFftBuffer.fill(0.0f); + mFftWorkBuffer.fill(0.0f); mCurrentSegment = 0; mNumConvolveSegs = 0; - mChans = nullptr; - mComplexData = nullptr; + decltype(mChans){}.swap(mChans); + decltype(mComplexData){}.swap(mComplexData); /* An empty buffer doesn't need a convolution filter. */ if(!buffer || buffer->mSampleLen < 1) return; @@ -273,12 +293,11 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag mAmbiScaling = IsUHJ(mChannels) ? AmbiScaling::UHJ : buffer->mAmbiScaling; mAmbiOrder = minu(buffer->mAmbiOrder, MaxConvolveAmbiOrder); - constexpr size_t m{ConvolveUpdateSize/2 + 1}; const auto bytesPerSample = BytesFromFmt(buffer->mType); const auto realChannels = buffer->channelsFromFmt(); const auto numChannels = (mChannels == FmtUHJ2) ? 3u : ChannelsFromFmt(mChannels, mAmbiOrder); - mChans = ChannelDataArray::Create(numChannels); + mChans.resize(numChannels); /* The impulse response needs to have the same sample rate as the input and * output. The bsinc24 resampler is decent, but there is high-frequency @@ -293,7 +312,7 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag buffer->mSampleRate); const BandSplitter splitter{device->mXOverFreq / static_cast<float>(device->Frequency)}; - for(auto &e : *mChans) + for(auto &e : mChans) e.mFilter = splitter; mFilter.resize(numChannels, {}); @@ -307,9 +326,8 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag mNumConvolveSegs = (resampledCount+(ConvolveUpdateSamples-1)) / ConvolveUpdateSamples; mNumConvolveSegs = maxz(mNumConvolveSegs, 2) - 1; - const size_t complex_length{mNumConvolveSegs * m * (numChannels+1)}; - mComplexData = std::make_unique<complex_f[]>(complex_length); - std::fill_n(mComplexData.get(), complex_length, complex_f{}); + const size_t complex_length{mNumConvolveSegs * ConvolveUpdateSize * (numChannels+1)}; + mComplexData.resize(complex_length, 0.0f); /* Load the samples from the buffer. */ const size_t srclinelength{RoundUp(buffer->mSampleLen+DecoderPadding, 16)}; @@ -330,7 +348,10 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag auto ressamples = std::make_unique<double[]>(buffer->mSampleLen + (resampler ? resampledCount : 0)); - complex_f *filteriter = mComplexData.get() + mNumConvolveSegs*m; + auto ffttmp = al::vector<float,16>(ConvolveUpdateSize); + auto fftbuffer = std::vector<std::complex<double>>(ConvolveUpdateSize); + + float *filteriter = mComplexData.data() + mNumConvolveSegs*ConvolveUpdateSize; for(size_t c{0};c < numChannels;++c) { /* Resample to match the device. */ @@ -351,71 +372,85 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag std::transform(ressamples.get(), ressamples.get()+first_size, mFilter[c].rbegin(), [](const double d) noexcept -> float { return static_cast<float>(d); }); - auto fftbuffer = std::vector<std::complex<double>>(ConvolveUpdateSize); size_t done{first_size}; for(size_t s{0};s < mNumConvolveSegs;++s) { const size_t todo{minz(resampledCount-done, ConvolveUpdateSamples)}; + /* Apply a double-precision forward FFT for more precise frequency + * measurements. + */ auto iter = std::copy_n(&ressamples[done], todo, fftbuffer.begin()); done += todo; std::fill(iter, fftbuffer.end(), std::complex<double>{}); + forward_fft(al::span{fftbuffer}); - forward_fft(al::as_span(fftbuffer)); - filteriter = std::copy_n(fftbuffer.cbegin(), m, filteriter); + /* Convert to, and pack in, a float buffer for PFFFT. Note that the + * first bin stores the real component of the half-frequency bin in + * the imaginary component. Also scale the FFT by its length so the + * iFFT'd output will be normalized. + */ + static constexpr float fftscale{1.0f / float{ConvolveUpdateSize}}; + for(size_t i{0};i < ConvolveUpdateSamples;++i) + { + ffttmp[i*2 ] = static_cast<float>(fftbuffer[i].real()) * fftscale; + ffttmp[i*2 + 1] = static_cast<float>((i == 0) ? + fftbuffer[ConvolveUpdateSamples].real() : fftbuffer[i].imag()) * fftscale; + } + /* Reorder backward to make it suitable for pffft_zconvolve and the + * subsequent pffft_transform(..., PFFFT_BACKWARD). + */ + pffft_zreorder(mFft.get(), ffttmp.data(), al::to_address(filteriter), PFFFT_BACKWARD); + filteriter += ConvolveUpdateSize; } } } void ConvolutionState::update(const ContextBase *context, const EffectSlot *slot, - const EffectProps* /*props*/, const EffectTarget target) + const EffectProps *props, const EffectTarget target) { - /* NOTE: Stereo and Rear are slightly different from normal mixing (as - * defined in alu.cpp). These are 45 degrees from center, rather than the - * 30 degrees used there. - * - * TODO: LFE is not mixed to output. This will require each buffer channel + /* TODO: LFE is not mixed to output. This will require each buffer channel * to have its own output target since the main mixing buffer won't have an * LFE channel (due to being B-Format). */ - static constexpr ChanMap MonoMap[1]{ - { FrontCenter, 0.0f, 0.0f } + static constexpr ChanPosMap MonoMap[1]{ + { FrontCenter, std::array{0.0f, 0.0f, -1.0f} } }, StereoMap[2]{ - { FrontLeft, Deg2Rad(-45.0f), Deg2Rad(0.0f) }, - { FrontRight, Deg2Rad( 45.0f), Deg2Rad(0.0f) } + { FrontLeft, std::array{-sin30, 0.0f, -cos30} }, + { FrontRight, std::array{ sin30, 0.0f, -cos30} }, }, RearMap[2]{ - { BackLeft, Deg2Rad(-135.0f), Deg2Rad(0.0f) }, - { BackRight, Deg2Rad( 135.0f), Deg2Rad(0.0f) } + { BackLeft, std::array{-sin30, 0.0f, cos30} }, + { BackRight, std::array{ sin30, 0.0f, cos30} }, }, QuadMap[4]{ - { FrontLeft, Deg2Rad( -45.0f), Deg2Rad(0.0f) }, - { FrontRight, Deg2Rad( 45.0f), Deg2Rad(0.0f) }, - { BackLeft, Deg2Rad(-135.0f), Deg2Rad(0.0f) }, - { BackRight, Deg2Rad( 135.0f), Deg2Rad(0.0f) } + { FrontLeft, std::array{-sin45, 0.0f, -cos45} }, + { FrontRight, std::array{ sin45, 0.0f, -cos45} }, + { BackLeft, std::array{-sin45, 0.0f, cos45} }, + { BackRight, std::array{ sin45, 0.0f, cos45} }, }, X51Map[6]{ - { FrontLeft, Deg2Rad( -30.0f), Deg2Rad(0.0f) }, - { FrontRight, Deg2Rad( 30.0f), Deg2Rad(0.0f) }, - { FrontCenter, Deg2Rad( 0.0f), Deg2Rad(0.0f) }, - { LFE, 0.0f, 0.0f }, - { SideLeft, Deg2Rad(-110.0f), Deg2Rad(0.0f) }, - { SideRight, Deg2Rad( 110.0f), Deg2Rad(0.0f) } + { FrontLeft, std::array{-sin30, 0.0f, -cos30} }, + { FrontRight, std::array{ sin30, 0.0f, -cos30} }, + { FrontCenter, std::array{ 0.0f, 0.0f, -1.0f} }, + { LFE, {} }, + { SideLeft, std::array{-sin110, 0.0f, -cos110} }, + { SideRight, std::array{ sin110, 0.0f, -cos110} }, }, X61Map[7]{ - { FrontLeft, Deg2Rad(-30.0f), Deg2Rad(0.0f) }, - { FrontRight, Deg2Rad( 30.0f), Deg2Rad(0.0f) }, - { FrontCenter, Deg2Rad( 0.0f), Deg2Rad(0.0f) }, - { LFE, 0.0f, 0.0f }, - { BackCenter, Deg2Rad(180.0f), Deg2Rad(0.0f) }, - { SideLeft, Deg2Rad(-90.0f), Deg2Rad(0.0f) }, - { SideRight, Deg2Rad( 90.0f), Deg2Rad(0.0f) } + { FrontLeft, std::array{-sin30, 0.0f, -cos30} }, + { FrontRight, std::array{ sin30, 0.0f, -cos30} }, + { FrontCenter, std::array{ 0.0f, 0.0f, -1.0f} }, + { LFE, {} }, + { BackCenter, std::array{ 0.0f, 0.0f, 1.0f} }, + { SideLeft, std::array{-1.0f, 0.0f, 0.0f} }, + { SideRight, std::array{ 1.0f, 0.0f, 0.0f} }, }, X71Map[8]{ - { FrontLeft, Deg2Rad( -30.0f), Deg2Rad(0.0f) }, - { FrontRight, Deg2Rad( 30.0f), Deg2Rad(0.0f) }, - { FrontCenter, Deg2Rad( 0.0f), Deg2Rad(0.0f) }, - { LFE, 0.0f, 0.0f }, - { BackLeft, Deg2Rad(-150.0f), Deg2Rad(0.0f) }, - { BackRight, Deg2Rad( 150.0f), Deg2Rad(0.0f) }, - { SideLeft, Deg2Rad( -90.0f), Deg2Rad(0.0f) }, - { SideRight, Deg2Rad( 90.0f), Deg2Rad(0.0f) } + { FrontLeft, std::array{-sin30, 0.0f, -cos30} }, + { FrontRight, std::array{ sin30, 0.0f, -cos30} }, + { FrontCenter, std::array{ 0.0f, 0.0f, -1.0f} }, + { LFE, {} }, + { BackLeft, std::array{-sin30, 0.0f, cos30} }, + { BackRight, std::array{ sin30, 0.0f, cos30} }, + { SideLeft, std::array{ -1.0f, 0.0f, 0.0f} }, + { SideRight, std::array{ 1.0f, 0.0f, 0.0f} }, }; if(mNumConvolveSegs < 1) UNLIKELY @@ -423,7 +458,7 @@ void ConvolutionState::update(const ContextBase *context, const EffectSlot *slot mMix = &ConvolutionState::NormalMix; - for(auto &chan : *mChans) + for(auto &chan : mChans) std::fill(std::begin(chan.Target), std::end(chan.Target), 0.0f); const float gain{slot->Gain}; if(IsAmbisonic(mChannels)) @@ -432,46 +467,66 @@ void ConvolutionState::update(const ContextBase *context, const EffectSlot *slot if(mChannels == FmtUHJ2 && !device->mUhjEncoder) { mMix = &ConvolutionState::UpsampleMix; - (*mChans)[0].mHfScale = 1.0f; - (*mChans)[0].mLfScale = DecoderBase::sWLFScale; - (*mChans)[1].mHfScale = 1.0f; - (*mChans)[1].mLfScale = DecoderBase::sXYLFScale; - (*mChans)[2].mHfScale = 1.0f; - (*mChans)[2].mLfScale = DecoderBase::sXYLFScale; + mChans[0].mHfScale = 1.0f; + mChans[0].mLfScale = DecoderBase::sWLFScale; + mChans[1].mHfScale = 1.0f; + mChans[1].mLfScale = DecoderBase::sXYLFScale; + mChans[2].mHfScale = 1.0f; + mChans[2].mLfScale = DecoderBase::sXYLFScale; } else if(device->mAmbiOrder > mAmbiOrder) { mMix = &ConvolutionState::UpsampleMix; const auto scales = AmbiScale::GetHFOrderScales(mAmbiOrder, device->mAmbiOrder, device->m2DMixing); - (*mChans)[0].mHfScale = scales[0]; - (*mChans)[0].mLfScale = 1.0f; - for(size_t i{1};i < mChans->size();++i) + mChans[0].mHfScale = scales[0]; + mChans[0].mLfScale = 1.0f; + for(size_t i{1};i < mChans.size();++i) { - (*mChans)[i].mHfScale = scales[1]; - (*mChans)[i].mLfScale = 1.0f; + mChans[i].mHfScale = scales[1]; + mChans[i].mLfScale = 1.0f; } } mOutTarget = target.Main->Buffer; - auto&& scales = GetAmbiScales(mAmbiScaling); + alu::Vector N{props->Convolution.OrientAt[0], props->Convolution.OrientAt[1], + props->Convolution.OrientAt[2], 0.0f}; + N.normalize(); + alu::Vector V{props->Convolution.OrientUp[0], props->Convolution.OrientUp[1], + props->Convolution.OrientUp[2], 0.0f}; + V.normalize(); + /* Build and normalize right-vector */ + alu::Vector U{N.cross_product(V)}; + U.normalize(); + + const float mixmatrix[4][4]{ + {1.0f, 0.0f, 0.0f, 0.0f}, + {0.0f, U[0], -U[1], U[2]}, + {0.0f, -V[0], V[1], -V[2]}, + {0.0f, -N[0], N[1], -N[2]}, + }; + + const auto scales = GetAmbiScales(mAmbiScaling); const uint8_t *index_map{Is2DAmbisonic(mChannels) ? GetAmbi2DLayout(mAmbiLayout).data() : GetAmbiLayout(mAmbiLayout).data()}; std::array<float,MaxAmbiChannels> coeffs{}; - for(size_t c{0u};c < mChans->size();++c) + for(size_t c{0u};c < mChans.size();++c) { const size_t acn{index_map[c]}; - coeffs[acn] = scales[acn]; - ComputePanGains(target.Main, coeffs.data(), gain, (*mChans)[c].Target); - coeffs[acn] = 0.0f; + const float scale{scales[acn]}; + + for(size_t x{0};x < 4;++x) + coeffs[x] = mixmatrix[acn][x] * scale; + + ComputePanGains(target.Main, coeffs, gain, mChans[c].Target); } } else { DeviceBase *device{context->mDevice}; - al::span<const ChanMap> chanmap{}; + al::span<const ChanPosMap> chanmap{}; switch(mChannels) { case FmtMono: chanmap = MonoMap; break; @@ -493,28 +548,55 @@ void ConvolutionState::update(const ContextBase *context, const EffectSlot *slot mOutTarget = target.Main->Buffer; if(device->mRenderMode == RenderMode::Pairwise) { - auto ScaleAzimuthFront = [](float azimuth, float scale) -> float + /* Scales the azimuth of the given vector by 3 if it's in front. + * Effectively scales +/-30 degrees to +/-90 degrees, leaving > +90 + * and < -90 alone. + */ + auto ScaleAzimuthFront = [](std::array<float,3> pos) -> std::array<float,3> { - constexpr float half_pi{al::numbers::pi_v<float>*0.5f}; - const float abs_azi{std::fabs(azimuth)}; - if(!(abs_azi >= half_pi)) - return std::copysign(minf(abs_azi*scale, half_pi), azimuth); - return azimuth; + if(pos[2] < 0.0f) + { + /* Normalize the length of the x,z components for a 2D + * vector of the azimuth angle. Negate Z since {0,0,-1} is + * angle 0. + */ + const float len2d{std::sqrt(pos[0]*pos[0] + pos[2]*pos[2])}; + float x{pos[0] / len2d}; + float z{-pos[2] / len2d}; + + /* Z > cos(pi/6) = -30 < azimuth < 30 degrees. */ + if(z > cos30) + { + /* Triple the angle represented by x,z. */ + x = x*3.0f - x*x*x*4.0f; + z = z*z*z*4.0f - z*3.0f; + + /* Scale the vector back to fit in 3D. */ + pos[0] = x * len2d; + pos[2] = -z * len2d; + } + else + { + /* If azimuth >= 30 degrees, clamp to 90 degrees. */ + pos[0] = std::copysign(len2d, pos[0]); + pos[2] = 0.0f; + } + } + return pos; }; for(size_t i{0};i < chanmap.size();++i) { if(chanmap[i].channel == LFE) continue; - const auto coeffs = CalcAngleCoeffs(ScaleAzimuthFront(chanmap[i].angle, 2.0f), - chanmap[i].elevation, 0.0f); - ComputePanGains(target.Main, coeffs.data(), gain, (*mChans)[i].Target); + const auto coeffs = CalcDirectionCoeffs(ScaleAzimuthFront(chanmap[i].pos), 0.0f); + ComputePanGains(target.Main, coeffs, gain, mChans[i].Target); } } else for(size_t i{0};i < chanmap.size();++i) { if(chanmap[i].channel == LFE) continue; - const auto coeffs = CalcAngleCoeffs(chanmap[i].angle, chanmap[i].elevation, 0.0f); - ComputePanGains(target.Main, coeffs.data(), gain, (*mChans)[i].Target); + const auto coeffs = CalcDirectionCoeffs(chanmap[i].pos, 0.0f); + ComputePanGains(target.Main, coeffs, gain, mChans[i].Target); } } } @@ -525,9 +607,7 @@ void ConvolutionState::process(const size_t samplesToDo, if(mNumConvolveSegs < 1) UNLIKELY return; - constexpr size_t m{ConvolveUpdateSize/2 + 1}; size_t curseg{mCurrentSegment}; - auto &chans = *mChans; for(size_t base{0u};base < samplesToDo;) { @@ -539,9 +619,9 @@ void ConvolutionState::process(const size_t samplesToDo, /* Apply the FIR for the newly retrieved input samples, and combine it * with the inverse FFT'd output samples. */ - for(size_t c{0};c < chans.size();++c) + for(size_t c{0};c < mChans.size();++c) { - auto buf_iter = chans[c].mBuffer.begin() + base; + auto buf_iter = mChans[c].mBuffer.begin() + base; apply_fir({buf_iter, todo}, mInput.data()+1 + mFifoPos, mFilter[c].data()); auto fifo_iter = mOutput[c].begin() + mFifoPos; @@ -557,59 +637,49 @@ void ConvolutionState::process(const size_t samplesToDo, /* Move the newest input to the front for the next iteration's history. */ std::copy(mInput.cbegin()+ConvolveUpdateSamples, mInput.cend(), mInput.begin()); + std::fill(mInput.begin()+ConvolveUpdateSamples, mInput.end(), 0.0f); - /* Calculate the frequency domain response and add the relevant + /* Calculate the frequency-domain response and add the relevant * frequency bins to the FFT history. */ - auto fftiter = std::copy_n(mInput.cbegin(), ConvolveUpdateSamples, mFftBuffer.begin()); - std::fill(fftiter, mFftBuffer.end(), complex_f{}); - forward_fft(al::as_span(mFftBuffer)); + pffft_transform(mFft.get(), mInput.data(), mComplexData.data() + curseg*ConvolveUpdateSize, + mFftWorkBuffer.data(), PFFFT_FORWARD); - std::copy_n(mFftBuffer.cbegin(), m, &mComplexData[curseg*m]); - - const complex_f *RESTRICT filter{mComplexData.get() + mNumConvolveSegs*m}; - for(size_t c{0};c < chans.size();++c) + const float *filter{mComplexData.data() + mNumConvolveSegs*ConvolveUpdateSize}; + for(size_t c{0};c < mChans.size();++c) { - std::fill_n(mFftBuffer.begin(), m, complex_f{}); - /* Convolve each input segment with its IR filter counterpart * (aligned in time). */ - const complex_f *RESTRICT input{&mComplexData[curseg*m]}; + mFftBuffer.fill(0.0f); + const float *input{&mComplexData[curseg*ConvolveUpdateSize]}; for(size_t s{curseg};s < mNumConvolveSegs;++s) { - for(size_t i{0};i < m;++i,++input,++filter) - mFftBuffer[i] += *input * *filter; + pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data()); + input += ConvolveUpdateSize; + filter += ConvolveUpdateSize; } - input = mComplexData.get(); + input = mComplexData.data(); for(size_t s{0};s < curseg;++s) { - for(size_t i{0};i < m;++i,++input,++filter) - mFftBuffer[i] += *input * *filter; + pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data()); + input += ConvolveUpdateSize; + filter += ConvolveUpdateSize; } - /* Reconstruct the mirrored/negative frequencies to do a proper - * inverse FFT. - */ - for(size_t i{m};i < ConvolveUpdateSize;++i) - mFftBuffer[i] = std::conj(mFftBuffer[ConvolveUpdateSize-i]); - /* Apply iFFT to get the 256 (really 255) samples for output. The * 128 output samples are combined with the last output's 127 * second-half samples (and this output's second half is * subsequently saved for next time). */ - inverse_fft(al::as_span(mFftBuffer)); + pffft_transform(mFft.get(), mFftBuffer.data(), mFftBuffer.data(), + mFftWorkBuffer.data(), PFFFT_BACKWARD); - /* The iFFT'd response is scaled up by the number of bins, so apply - * the inverse to normalize the output. - */ + /* The filter was attenuated, so the response is already scaled. */ for(size_t i{0};i < ConvolveUpdateSamples;++i) - mOutput[c][i] = - (mFftBuffer[i].real()+mOutput[c][ConvolveUpdateSamples+i]) * - (1.0f/float{ConvolveUpdateSize}); + mOutput[c][i] = mFftBuffer[i] + mOutput[c][ConvolveUpdateSamples+i]; for(size_t i{0};i < ConvolveUpdateSamples;++i) - mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i].real(); + mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i]; } /* Shift the input history. */ diff --git a/alc/effects/dedicated.cpp b/alc/effects/dedicated.cpp index 047e6761..a9131bfa 100644 --- a/alc/effects/dedicated.cpp +++ b/alc/effects/dedicated.cpp @@ -74,7 +74,7 @@ void DedicatedState::update(const ContextBase*, const EffectSlot *slot, if(slot->EffectType == EffectSlotType::DedicatedLFE) { - const uint idx{target.RealOut ? target.RealOut->ChannelIndex[LFE] : InvalidChannelIndex}; + const size_t idx{target.RealOut ? target.RealOut->ChannelIndex[LFE] : InvalidChannelIndex}; if(idx != InvalidChannelIndex) { mOutTarget = target.RealOut->Buffer; @@ -85,7 +85,7 @@ void DedicatedState::update(const ContextBase*, const EffectSlot *slot, { /* Dialog goes to the front-center speaker if it exists, otherwise it * plays from the front-center location. */ - const uint idx{target.RealOut ? target.RealOut->ChannelIndex[FrontCenter] + const size_t idx{target.RealOut ? target.RealOut->ChannelIndex[FrontCenter] : InvalidChannelIndex}; if(idx != InvalidChannelIndex) { @@ -94,10 +94,10 @@ void DedicatedState::update(const ContextBase*, const EffectSlot *slot, } else { - static constexpr auto coeffs = CalcDirectionCoeffs({0.0f, 0.0f, -1.0f}); + static constexpr auto coeffs = CalcDirectionCoeffs(std::array{0.0f, 0.0f, -1.0f}); mOutTarget = target.Main->Buffer; - ComputePanGains(target.Main, coeffs.data(), Gain, mTargetGains); + ComputePanGains(target.Main, coeffs, Gain, mTargetGains); } } } diff --git a/alc/effects/distortion.cpp b/alc/effects/distortion.cpp index b4e2167e..3d77ff35 100644 --- a/alc/effects/distortion.cpp +++ b/alc/effects/distortion.cpp @@ -95,10 +95,10 @@ void DistortionState::update(const ContextBase *context, const EffectSlot *slot, bandwidth = props->Distortion.EQBandwidth / (cutoff * 0.67f); mBandpass.setParamsFromBandwidth(BiquadType::BandPass, cutoff/frequency/4.0f, 1.0f, bandwidth); - static constexpr auto coeffs = CalcDirectionCoeffs({0.0f, 0.0f, -1.0f}); + static constexpr auto coeffs = CalcDirectionCoeffs(std::array{0.0f, 0.0f, -1.0f}); mOutTarget = target.Main->Buffer; - ComputePanGains(target.Main, coeffs.data(), slot->Gain*props->Distortion.Gain, mGain); + ComputePanGains(target.Main, coeffs, slot->Gain*props->Distortion.Gain, mGain); } void DistortionState::process(const size_t samplesToDo, const al::span<const FloatBufferLine> samplesIn, const al::span<FloatBufferLine> samplesOut) diff --git a/alc/effects/echo.cpp b/alc/effects/echo.cpp index a69529dc..714649c9 100644 --- a/alc/effects/echo.cpp +++ b/alc/effects/echo.cpp @@ -25,6 +25,7 @@ #include <cstdlib> #include <iterator> #include <tuple> +#include <vector> #include "alc/effects/base.h" #include "almalloc.h" @@ -39,7 +40,6 @@ #include "core/mixer.h" #include "intrusive_ptr.h" #include "opthelpers.h" -#include "vector.h" namespace { @@ -49,7 +49,7 @@ using uint = unsigned int; constexpr float LowpassFreqRef{5000.0f}; struct EchoState final : public EffectState { - al::vector<float,16> mSampleBuffer; + std::vector<float> mSampleBuffer; // The echo is two tap. The delay is the number of samples from before the // current offset @@ -87,7 +87,7 @@ void EchoState::deviceUpdate(const DeviceBase *Device, const BufferStorage*) const uint maxlen{NextPowerOf2(float2uint(EchoMaxDelay*frequency + 0.5f) + float2uint(EchoMaxLRDelay*frequency + 0.5f))}; if(maxlen != mSampleBuffer.size()) - al::vector<float,16>(maxlen).swap(mSampleBuffer); + decltype(mSampleBuffer)(maxlen).swap(mSampleBuffer); std::fill(mSampleBuffer.begin(), mSampleBuffer.end(), 0.0f); for(auto &e : mGains) @@ -118,8 +118,8 @@ void EchoState::update(const ContextBase *context, const EffectSlot *slot, const auto coeffs1 = CalcAngleCoeffs( angle, 0.0f, 0.0f); mOutTarget = target.Main->Buffer; - ComputePanGains(target.Main, coeffs0.data(), slot->Gain, mGains[0].Target); - ComputePanGains(target.Main, coeffs1.data(), slot->Gain, mGains[1].Target); + ComputePanGains(target.Main, coeffs0, slot->Gain, mGains[0].Target); + ComputePanGains(target.Main, coeffs1, slot->Gain, mGains[1].Target); } void EchoState::process(const size_t samplesToDo, const al::span<const FloatBufferLine> samplesIn, const al::span<FloatBufferLine> samplesOut) diff --git a/alc/effects/fshifter.cpp b/alc/effects/fshifter.cpp index 3e6a7385..d3989e84 100644 --- a/alc/effects/fshifter.cpp +++ b/alc/effects/fshifter.cpp @@ -164,16 +164,16 @@ void FshifterState::update(const ContextBase *context, const EffectSlot *slot, } static constexpr auto inv_sqrt2 = static_cast<float>(1.0 / al::numbers::sqrt2); - static constexpr auto lcoeffs_pw = CalcDirectionCoeffs({-1.0f, 0.0f, 0.0f}); - static constexpr auto rcoeffs_pw = CalcDirectionCoeffs({ 1.0f, 0.0f, 0.0f}); - static constexpr auto lcoeffs_nrml = CalcDirectionCoeffs({-inv_sqrt2, 0.0f, inv_sqrt2}); - static constexpr auto rcoeffs_nrml = CalcDirectionCoeffs({ inv_sqrt2, 0.0f, inv_sqrt2}); + static constexpr auto lcoeffs_pw = CalcDirectionCoeffs(std::array{-1.0f, 0.0f, 0.0f}); + static constexpr auto rcoeffs_pw = CalcDirectionCoeffs(std::array{ 1.0f, 0.0f, 0.0f}); + static constexpr auto lcoeffs_nrml = CalcDirectionCoeffs(std::array{-inv_sqrt2, 0.0f, inv_sqrt2}); + static constexpr auto rcoeffs_nrml = CalcDirectionCoeffs(std::array{ inv_sqrt2, 0.0f, inv_sqrt2}); auto &lcoeffs = (device->mRenderMode != RenderMode::Pairwise) ? lcoeffs_nrml : lcoeffs_pw; auto &rcoeffs = (device->mRenderMode != RenderMode::Pairwise) ? rcoeffs_nrml : rcoeffs_pw; mOutTarget = target.Main->Buffer; - ComputePanGains(target.Main, lcoeffs.data(), slot->Gain, mGains[0].Target); - ComputePanGains(target.Main, rcoeffs.data(), slot->Gain, mGains[1].Target); + ComputePanGains(target.Main, lcoeffs, slot->Gain, mGains[0].Target); + ComputePanGains(target.Main, rcoeffs, slot->Gain, mGains[1].Target); } void FshifterState::process(const size_t samplesToDo, const al::span<const FloatBufferLine> samplesIn, const al::span<FloatBufferLine> samplesOut) diff --git a/alc/effects/modulator.cpp b/alc/effects/modulator.cpp index 14ee5004..f99ba19c 100644 --- a/alc/effects/modulator.cpp +++ b/alc/effects/modulator.cpp @@ -45,43 +45,49 @@ namespace { using uint = unsigned int; -#define MAX_UPDATE_SAMPLES 128 +inline float Sin(uint index, float scale) +{ return std::sin(static_cast<float>(index) * scale); } -#define WAVEFORM_FRACBITS 24 -#define WAVEFORM_FRACONE (1<<WAVEFORM_FRACBITS) -#define WAVEFORM_FRACMASK (WAVEFORM_FRACONE-1) +inline float Saw(uint index, float scale) +{ return static_cast<float>(index)*scale - 1.0f; } -inline float Sin(uint index) -{ - constexpr float scale{al::numbers::pi_v<float>*2.0f / WAVEFORM_FRACONE}; - return std::sin(static_cast<float>(index) * scale); -} +inline float Square(uint index, float scale) +{ return (static_cast<float>(index)*scale < 0.5f)*2.0f - 1.0f; } -inline float Saw(uint index) -{ return static_cast<float>(index)*(2.0f/WAVEFORM_FRACONE) - 1.0f; } +inline float One(uint, float) +{ return 1.0f; } -inline float Square(uint index) -{ return static_cast<float>(static_cast<int>((index>>(WAVEFORM_FRACBITS-2))&2) - 1); } +struct ModulatorState final : public EffectState { + template<float (&func)(uint,float)> + void Modulate(size_t todo) + { + const uint range{mRange}; + const float scale{mIndexScale}; + uint index{mIndex}; -inline float One(uint) { return 1.0f; } + ASSUME(range > 1); + ASSUME(todo > 0); -template<float (&func)(uint)> -void Modulate(float *RESTRICT dst, uint index, const uint step, size_t todo) -{ - for(size_t i{0u};i < todo;i++) - { - index += step; - index &= WAVEFORM_FRACMASK; - dst[i] = func(index); + for(size_t i{0};i < todo;) + { + size_t rem{minz(todo-i, range-index)}; + do { + mModSamples[i++] = func(index++, scale); + } while(--rem); + if(index == range) + index = 0; + } + mIndex = index; } -} - -struct ModulatorState final : public EffectState { - void (*mGetSamples)(float*RESTRICT, uint, const uint, size_t){}; + void (ModulatorState::*mGenModSamples)(size_t){}; uint mIndex{0}; - uint mStep{1}; + uint mRange{1}; + float mIndexScale{0.0f}; + + alignas(16) FloatBufferLine mModSamples{}; + alignas(16) FloatBufferLine mBuffer{}; struct { uint mTargetChannel{InvalidChannelIndex}; @@ -102,6 +108,13 @@ struct ModulatorState final : public EffectState { DEF_NEWDEL(ModulatorState) }; +template<> +void ModulatorState::Modulate<One>(size_t todo) +{ + std::fill_n(mModSamples.begin(), todo, 1.0f); + mIndex = 0; +} + void ModulatorState::deviceUpdate(const DeviceBase*, const BufferStorage*) { for(auto &e : mChans) @@ -117,17 +130,46 @@ void ModulatorState::update(const ContextBase *context, const EffectSlot *slot, { const DeviceBase *device{context->mDevice}; - const float step{props->Modulator.Frequency / static_cast<float>(device->Frequency)}; - mStep = fastf2u(clampf(step*WAVEFORM_FRACONE, 0.0f, float{WAVEFORM_FRACONE-1})); - - if(mStep == 0) - mGetSamples = Modulate<One>; + /* The effective frequency will be adjusted to have a whole number of + * samples per cycle (at 48khz, that allows 8000, 6857.14, 6000, 5333.33, + * 4800, etc). We could do better by using fixed-point stepping over a sin + * function, with additive synthesis for the square and sawtooth waveforms, + * but that may need a more efficient sin function since it needs to do + * many iterations per sample. + */ + const float samplesPerCycle{props->Modulator.Frequency > 0.0f + ? static_cast<float>(device->Frequency)/props->Modulator.Frequency + 0.5f + : 1.0f}; + const uint range{static_cast<uint>(clampf(samplesPerCycle, 1.0f, + static_cast<float>(device->Frequency)))}; + mIndex = static_cast<uint>(uint64_t{mIndex} * range / mRange); + mRange = range; + + if(mRange == 1) + { + mIndexScale = 0.0f; + mGenModSamples = &ModulatorState::Modulate<One>; + } else if(props->Modulator.Waveform == ModulatorWaveform::Sinusoid) - mGetSamples = Modulate<Sin>; + { + mIndexScale = al::numbers::pi_v<float>*2.0f / static_cast<float>(mRange); + mGenModSamples = &ModulatorState::Modulate<Sin>; + } else if(props->Modulator.Waveform == ModulatorWaveform::Sawtooth) - mGetSamples = Modulate<Saw>; + { + mIndexScale = 2.0f / static_cast<float>(mRange-1); + mGenModSamples = &ModulatorState::Modulate<Saw>; + } else /*if(props->Modulator.Waveform == ModulatorWaveform::Square)*/ - mGetSamples = Modulate<Square>; + { + /* For square wave, the range should be even (there should be an equal + * number of high and low samples). An odd number of samples per cycle + * would need a more complex value generator. + */ + mRange = (mRange+1) & ~1u; + mIndexScale = 1.0f / static_cast<float>(mRange-1); + mGenModSamples = &ModulatorState::Modulate<Square>; + } float f0norm{props->Modulator.HighPassCutoff / static_cast<float>(device->Frequency)}; f0norm = clampf(f0norm, 1.0f/512.0f, 0.49f); @@ -147,34 +189,22 @@ void ModulatorState::update(const ContextBase *context, const EffectSlot *slot, void ModulatorState::process(const size_t samplesToDo, const al::span<const FloatBufferLine> samplesIn, const al::span<FloatBufferLine> samplesOut) { - for(size_t base{0u};base < samplesToDo;) - { - alignas(16) float modsamples[MAX_UPDATE_SAMPLES]; - const size_t td{minz(MAX_UPDATE_SAMPLES, samplesToDo-base)}; - - mGetSamples(modsamples, mIndex, mStep, td); - mIndex += static_cast<uint>(mStep * td); - mIndex &= WAVEFORM_FRACMASK; + (this->*mGenModSamples)(samplesToDo); - auto chandata = std::begin(mChans); - for(const auto &input : samplesIn) + auto chandata = std::begin(mChans); + for(const auto &input : samplesIn) + { + const size_t outidx{chandata->mTargetChannel}; + if(outidx != InvalidChannelIndex) { - const size_t outidx{chandata->mTargetChannel}; - if(outidx != InvalidChannelIndex) - { - alignas(16) float temps[MAX_UPDATE_SAMPLES]; - - chandata->mFilter.process({&input[base], td}, temps); - for(size_t i{0u};i < td;i++) - temps[i] *= modsamples[i]; - - MixSamples({temps, td}, samplesOut[outidx].data()+base, chandata->mCurrentGain, - chandata->mTargetGain, samplesToDo-base); - } - ++chandata; - } + chandata->mFilter.process({input.data(), samplesToDo}, mBuffer.data()); + for(size_t i{0u};i < samplesToDo;++i) + mBuffer[i] *= mModSamples[i]; - base += td; + MixSamples({mBuffer.data(), samplesToDo}, samplesOut[outidx].data(), + chandata->mCurrentGain, chandata->mTargetGain, minz(samplesToDo, 64)); + } + ++chandata; } } diff --git a/alc/effects/pshifter.cpp b/alc/effects/pshifter.cpp index 426a2264..0c27be30 100644 --- a/alc/effects/pshifter.cpp +++ b/alc/effects/pshifter.cpp @@ -28,7 +28,6 @@ #include <iterator> #include "alc/effects/base.h" -#include "alcomplex.h" #include "almalloc.h" #include "alnumbers.h" #include "alnumeric.h" @@ -40,6 +39,7 @@ #include "core/mixer.h" #include "core/mixer/defs.h" #include "intrusive_ptr.h" +#include "pffft.h" struct ContextBase; @@ -74,6 +74,12 @@ struct Windower { const Windower gWindow{}; +struct PFFFTSetupDeleter { + void operator()(PFFFT_Setup *ptr) { pffft_destroy_setup(ptr); } +}; +using PFFFTSetupPtr = std::unique_ptr<PFFFT_Setup,PFFFTSetupDeleter>; + + struct FrequencyBin { float Magnitude; float FreqBin; @@ -93,7 +99,9 @@ struct PshifterState final : public EffectState { std::array<float,StftHalfSize+1> mSumPhase; std::array<float,StftSize> mOutputAccum; - std::array<complex_f,StftSize> mFftBuffer; + PFFFTSetupPtr mFft; + alignas(16) std::array<float,StftSize> mFftBuffer; + alignas(16) std::array<float,StftSize> mFftWorkBuffer; std::array<FrequencyBin,StftHalfSize+1> mAnalysisBuffer; std::array<FrequencyBin,StftHalfSize+1> mSynthesisBuffer; @@ -126,12 +134,15 @@ void PshifterState::deviceUpdate(const DeviceBase*, const BufferStorage*) mLastPhase.fill(0.0f); mSumPhase.fill(0.0f); mOutputAccum.fill(0.0f); - mFftBuffer.fill(complex_f{}); + mFftBuffer.fill(0.0f); mAnalysisBuffer.fill(FrequencyBin{}); mSynthesisBuffer.fill(FrequencyBin{}); std::fill(std::begin(mCurrentGains), std::end(mCurrentGains), 0.0f); std::fill(std::begin(mTargetGains), std::end(mTargetGains), 0.0f); + + if(!mFft) + mFft = PFFFTSetupPtr{pffft_new_setup(StftSize, PFFFT_REAL)}; } void PshifterState::update(const ContextBase*, const EffectSlot *slot, @@ -142,10 +153,10 @@ void PshifterState::update(const ContextBase*, const EffectSlot *slot, mPitchShiftI = clampu(fastf2u(pitch*MixerFracOne), MixerFracHalf, MixerFracOne*2); mPitchShift = static_cast<float>(mPitchShiftI) * float{1.0f/MixerFracOne}; - static constexpr auto coeffs = CalcDirectionCoeffs({0.0f, 0.0f, -1.0f}); + static constexpr auto coeffs = CalcDirectionCoeffs(std::array{0.0f, 0.0f, -1.0f}); mOutTarget = target.Main->Buffer; - ComputePanGains(target.Main, coeffs.data(), slot->Gain, mTargetGains); + ComputePanGains(target.Main, coeffs, slot->Gain, mTargetGains); } void PshifterState::process(const size_t samplesToDo, @@ -186,15 +197,19 @@ void PshifterState::process(const size_t samplesToDo, mFftBuffer[k] = mFIFO[src] * gWindow.mData[k]; for(size_t src{0u}, k{StftSize-mPos};src < mPos;++src,++k) mFftBuffer[k] = mFIFO[src] * gWindow.mData[k]; - forward_fft(al::as_span(mFftBuffer)); + pffft_transform_ordered(mFft.get(), mFftBuffer.data(), mFftBuffer.data(), + mFftWorkBuffer.data(), PFFFT_FORWARD); /* Analyze the obtained data. Since the real FFT is symmetric, only * StftHalfSize+1 samples are needed. */ - for(size_t k{0u};k < StftHalfSize+1;k++) + for(size_t k{0u};k < StftHalfSize+1;++k) { - const float magnitude{std::abs(mFftBuffer[k])}; - const float phase{std::arg(mFftBuffer[k])}; + const auto cplx = (k == 0) ? complex_f{mFftBuffer[0]} : + (k == StftHalfSize) ? complex_f{mFftBuffer[1]} : + complex_f{mFftBuffer[k*2], mFftBuffer[k*2 + 1]}; + const float magnitude{std::abs(cplx)}; + const float phase{std::arg(cplx)}; /* Compute the phase difference from the last update and subtract * the expected phase difference for this bin. @@ -266,21 +281,29 @@ void PshifterState::process(const size_t samplesToDo, tmp -= static_cast<float>(qpd + (qpd%2)); mSumPhase[k] = tmp * al::numbers::pi_v<float>; - mFftBuffer[k] = std::polar(mSynthesisBuffer[k].Magnitude, mSumPhase[k]); + const complex_f cplx{std::polar(mSynthesisBuffer[k].Magnitude, mSumPhase[k])}; + if(k == 0) + mFftBuffer[0] = cplx.real(); + else if(k == StftHalfSize) + mFftBuffer[1] = cplx.real(); + else + { + mFftBuffer[k*2 + 0] = cplx.real(); + mFftBuffer[k*2 + 1] = cplx.imag(); + } } - for(size_t k{StftHalfSize+1};k < StftSize;++k) - mFftBuffer[k] = std::conj(mFftBuffer[StftSize-k]); /* Apply an inverse FFT to get the time-domain signal, and accumulate * for the output with windowing. */ - inverse_fft(al::as_span(mFftBuffer)); + pffft_transform_ordered(mFft.get(), mFftBuffer.data(), mFftBuffer.data(), + mFftWorkBuffer.data(), PFFFT_BACKWARD); static constexpr float scale{3.0f / OversampleFactor / StftSize}; for(size_t dst{mPos}, k{0u};dst < StftSize;++dst,++k) - mOutputAccum[dst] += gWindow.mData[k]*mFftBuffer[k].real() * scale; + mOutputAccum[dst] += gWindow.mData[k]*mFftBuffer[k] * scale; for(size_t dst{0u}, k{StftSize-mPos};dst < mPos;++dst,++k) - mOutputAccum[dst] += gWindow.mData[k]*mFftBuffer[k].real() * scale; + mOutputAccum[dst] += gWindow.mData[k]*mFftBuffer[k] * scale; /* Copy out the accumulated result, then clear for the next iteration. */ std::copy_n(mOutputAccum.begin() + mPos, StftStep, mFIFO.begin() + mPos); diff --git a/alc/effects/reverb.cpp b/alc/effects/reverb.cpp index 3875bedb..0f1fcca1 100644 --- a/alc/effects/reverb.cpp +++ b/alc/effects/reverb.cpp @@ -98,8 +98,6 @@ struct CubicFilter { constexpr CubicFilter gCubicTable; -using namespace std::placeholders; - /* Max samples per process iteration. Used to limit the size needed for * temporary buffers. Must be a multiple of 4 for SIMD alignment. */ @@ -122,12 +120,9 @@ constexpr size_t NUM_LINES{4u}; constexpr float MODULATION_DEPTH_COEFF{0.05f}; -/* The B-Format to A-Format conversion matrix. The arrangement of rows is - * deliberately chosen to align the resulting lines to their spatial opposites - * (0:above front left <-> 3:above back right, 1:below front right <-> 2:below - * back left). It's not quite opposite, since the A-Format results in a - * tetrahedron, but it's close enough. Should the model be extended to 8-lines - * in the future, true opposites can be used. +/* The B-Format to (W-normalized) A-Format conversion matrix. This produces a + * tetrahedral array of discrete signals (boosted by a factor of sqrt(3), to + * reduce the error introduced in the conversion). */ alignas(16) constexpr float B2A[NUM_LINES][NUM_LINES]{ { 0.5f, 0.5f, 0.5f, 0.5f }, @@ -136,7 +131,9 @@ alignas(16) constexpr float B2A[NUM_LINES][NUM_LINES]{ { 0.5f, -0.5f, 0.5f, -0.5f } }; -/* Converts A-Format to B-Format for early reflections. */ +/* Converts (W-normalized) A-Format to B-Format for early reflections (scaled + * by 1/sqrt(3) to compensate for the boost in the B2A matrix). + */ alignas(16) constexpr std::array<std::array<float,NUM_LINES>,NUM_LINES> EarlyA2B{{ {{ 0.5f, 0.5f, 0.5f, 0.5f }}, {{ 0.5f, -0.5f, 0.5f, -0.5f }}, @@ -144,7 +141,11 @@ alignas(16) constexpr std::array<std::array<float,NUM_LINES>,NUM_LINES> EarlyA2B {{ 0.5f, 0.5f, -0.5f, -0.5f }} }}; -/* Converts A-Format to B-Format for late reverb. */ +/* Converts (W-normalized) A-Format to B-Format for late reverb (scaled + * by 1/sqrt(3) to compensate for the boost in the B2A matrix). The response + * is rotated around Z (ambisonic X) so that the front lines are placed + * horizontally in front, and the rear lines are placed vertically in back. + */ constexpr auto InvSqrt2 = static_cast<float>(1.0/al::numbers::sqrt2); alignas(16) constexpr std::array<std::array<float,NUM_LINES>,NUM_LINES> LateA2B{{ {{ 0.5f, 0.5f, 0.5f, 0.5f }}, @@ -330,6 +331,39 @@ struct DelayLineI { } while(--td); } } + + /* Writes the given input lines to the delay buffer, applying a geometric + * reflection. This effectively applies the matrix + * + * [ -1/2 +1/2 +1/2 +1/2 ] + * [ +1/2 -1/2 +1/2 +1/2 ] + * [ +1/2 +1/2 -1/2 +1/2 ] + * [ +1/2 +1/2 +1/2 -1/2 ] + * + * to the four input lines when writing to the delay buffer. The effect on + * the B-Format signal is negating X,Y,Z, moving each response to its + * spatially opposite location. + */ + void writeReflected(size_t offset, const al::span<const ReverbUpdateLine,NUM_LINES> in, + const size_t count) const noexcept + { + ASSUME(count > 0); + for(size_t i{0u};i < count;) + { + offset &= Mask; + size_t td{minz(Mask+1 - offset, count - i)}; + do { + const std::array src{in[0][i], in[1][i], in[2][i], in[3][i]}; + ++i; + + Line[offset][0] = ( src[1] + src[2] + src[3] - src[0]) * 0.5f; + Line[offset][1] = (src[0] + src[2] + src[3] - src[1]) * 0.5f; + Line[offset][2] = (src[0] + src[1] + src[3] - src[2]) * 0.5f; + Line[offset][3] = (src[0] + src[1] + src[2] - src[3]) * 0.5f; + ++offset; + } while(--td); + } + } }; struct VecAllpass { @@ -461,8 +495,9 @@ struct ReverbPipeline { void updateDelayLine(const float earlyDelay, const float lateDelay, const float density_mult, const float decayTime, const float frequency); - void update3DPanning(const float *ReflectionsPan, const float *LateReverbPan, - const float earlyGain, const float lateGain, const bool doUpmix, const MixParams *mainMix); + void update3DPanning(const al::span<const float,3> ReflectionsPan, + const al::span<const float,3> LateReverbPan, const float earlyGain, const float lateGain, + const bool doUpmix, const MixParams *mainMix); void processEarly(size_t offset, const size_t samplesToDo, const al::span<ReverbUpdateLine,NUM_LINES> tempSamples, @@ -643,8 +678,8 @@ inline float CalcDelayLengthMult(float density) */ void ReverbState::allocLines(const float frequency) { - /* All delay line lengths are calculated to accomodate the full range of - * lengths given their respective paramters. + /* All delay line lengths are calculated to accommodate the full range of + * lengths given their respective parameters. */ size_t totalSamples{0u}; @@ -1017,8 +1052,12 @@ void ReverbPipeline::updateDelayLine(const float earlyDelay, const float lateDel mEarlyDelayTap[i][1] = float2uint((earlyDelay+length) * frequency); mEarlyDelayCoeff[i] = CalcDecayCoeff(length, decayTime); + /* Reduce the late delay tap by the shortest early delay line length to + * compensate for the late line input being fed by the delayed early + * output. + */ length = (LATE_LINE_LENGTHS[i] - LATE_LINE_LENGTHS.front())/float{NUM_LINES}*density_mult + - lateDelay; + std::max(lateDelay - EARLY_LINE_LENGTHS[0]*density_mult, 0.0f); mLateDelayTap[i][1] = float2uint(length * frequency); } } @@ -1028,7 +1067,7 @@ void ReverbPipeline::updateDelayLine(const float earlyDelay, const float lateDel * focal strength. This function results in a B-Format transformation matrix * that spatially focuses the signal in the desired direction. */ -std::array<std::array<float,4>,4> GetTransformFromVector(const float *vec) +std::array<std::array<float,4>,4> GetTransformFromVector(const al::span<const float,3> vec) { /* Normalize the panning vector according to the N3D scale, which has an * extra sqrt(3) term on the directional components. Converting from OpenAL @@ -1041,9 +1080,10 @@ std::array<std::array<float,4>,4> GetTransformFromVector(const float *vec) float mag{std::sqrt(vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2])}; if(mag > 1.0f) { - norm[0] = vec[0] / mag * -al::numbers::sqrt3_v<float>; - norm[1] = vec[1] / mag * al::numbers::sqrt3_v<float>; - norm[2] = vec[2] / mag * al::numbers::sqrt3_v<float>; + const float scale{al::numbers::sqrt3_v<float> / mag}; + norm[0] = vec[0] * -scale; + norm[1] = vec[1] * scale; + norm[2] = vec[2] * scale; mag = 1.0f; } else @@ -1066,8 +1106,9 @@ std::array<std::array<float,4>,4> GetTransformFromVector(const float *vec) } /* Update the early and late 3D panning gains. */ -void ReverbPipeline::update3DPanning(const float *ReflectionsPan, const float *LateReverbPan, - const float earlyGain, const float lateGain, const bool doUpmix, const MixParams *mainMix) +void ReverbPipeline::update3DPanning(const al::span<const float,3> ReflectionsPan, + const al::span<const float,3> LateReverbPan, const float earlyGain, const float lateGain, + const bool doUpmix, const MixParams *mainMix) { /* Create matrices that transform a B-Format signal according to the * panning vectors. @@ -1105,9 +1146,9 @@ void ReverbPipeline::update3DPanning(const float *ReflectionsPan, const float *L auto latecoeffs = mult_matrix(latemat); for(size_t i{0u};i < NUM_LINES;i++) - ComputePanGains(mainMix, earlycoeffs[i].data(), earlyGain, mEarly.TargetGains[i]); + ComputePanGains(mainMix, earlycoeffs[i], earlyGain, mEarly.TargetGains[i]); for(size_t i{0u};i < NUM_LINES;i++) - ComputePanGains(mainMix, latecoeffs[i].data(), lateGain, mLate.TargetGains[i]); + ComputePanGains(mainMix, latecoeffs[i], lateGain, mLate.TargetGains[i]); } else { @@ -1137,9 +1178,9 @@ void ReverbPipeline::update3DPanning(const float *ReflectionsPan, const float *L auto latecoeffs = mult_matrix(LateA2B, latemat); for(size_t i{0u};i < NUM_LINES;i++) - ComputePanGains(mainMix, earlycoeffs[i].data(), earlyGain, mEarly.TargetGains[i]); + ComputePanGains(mainMix, earlycoeffs[i], earlyGain, mEarly.TargetGains[i]); for(size_t i{0u};i < NUM_LINES;i++) - ComputePanGains(mainMix, latecoeffs[i].data(), lateGain, mLate.TargetGains[i]); + ComputePanGains(mainMix, latecoeffs[i], lateGain, mLate.TargetGains[i]); } } @@ -1244,9 +1285,36 @@ void ReverbState::update(const ContextBase *Context, const EffectSlot *Slot, props->Reverb.DecayTime, hfDecayTime, lf0norm, hf0norm, frequency); } - const float decaySamples{(props->Reverb.ReflectionsDelay + props->Reverb.LateReverbDelay - + props->Reverb.DecayTime) * frequency}; - pipeline.mFadeSampleCount = static_cast<size_t>(minf(decaySamples, 1'000'000.0f)); + /* Calculate the gain at the start of the late reverb stage, and the gain + * difference from the decay target (0.001, or -60dB). + */ + const float decayBase{props->Reverb.ReflectionsGain * props->Reverb.LateReverbGain}; + const float decayDiff{ReverbDecayGain / decayBase}; + + if(decayDiff < 1.0f) + { + /* Given the DecayTime (the amount of time for the late reverb to decay + * by -60dB), calculate the time to decay to -60dB from the start of + * the late reverb. + */ + const float diffTime{std::log10(decayDiff)*(20.0f / -60.0f) * props->Reverb.DecayTime}; + + const float decaySamples{(props->Reverb.ReflectionsDelay + props->Reverb.LateReverbDelay + + diffTime) * frequency}; + /* Limit to 100,000 samples (a touch over 2 seconds at 48khz) to + * avoid excessive double-processing. + */ + pipeline.mFadeSampleCount = static_cast<size_t>(minf(decaySamples, 100'000.0f)); + } + else + { + /* Otherwise, if the late reverb already starts at -60dB or less, only + * include the time to get to the late reverb. + */ + const float decaySamples{(props->Reverb.ReflectionsDelay + props->Reverb.LateReverbDelay) + * frequency}; + pipeline.mFadeSampleCount = static_cast<size_t>(minf(decaySamples, 100'000.0f)); + } } @@ -1303,7 +1371,9 @@ inline auto VectorPartialScatter(const std::array<float,NUM_LINES> &RESTRICT in, }}; } -/* Utilizes the above, but reverses the input channels. */ +/* Utilizes the above, but also applies a geometric reflection on the input + * channels. + */ void VectorScatterRevDelayIn(const DelayLineI delay, size_t offset, const float xCoeff, const float yCoeff, const al::span<const ReverbUpdateLine,NUM_LINES> in, const size_t count) { @@ -1314,9 +1384,13 @@ void VectorScatterRevDelayIn(const DelayLineI delay, size_t offset, const float offset &= delay.Mask; size_t td{minz(delay.Mask+1 - offset, count-i)}; do { - std::array<float,NUM_LINES> f; - for(size_t j{0u};j < NUM_LINES;j++) - f[NUM_LINES-1-j] = in[j][i]; + std::array src{in[0][i], in[1][i], in[2][i], in[3][i]}; + std::array f{ + ( src[1] + src[2] + src[3] - src[0]) * 0.5f, + (src[0] + src[2] + src[3] - src[1]) * 0.5f, + (src[0] + src[1] + src[3] - src[2]) * 0.5f, + (src[0] + src[1] + src[2] - src[3]) * 0.5f + }; ++i; delay.Line[offset++] = VectorPartialScatter(f, xCoeff, yCoeff); @@ -1330,9 +1404,6 @@ void VectorScatterRevDelayIn(const DelayLineI delay, size_t offset, const float * It works by vectorizing a regular all-pass filter and replacing the delay * element with a scattering matrix (like the one above) and a diagonal * matrix of delay elements. - * - * Two static specializations are used for transitional (cross-faded) delay - * line processing and non-transitional processing. */ void VecAllpass::process(const al::span<ReverbUpdateLine,NUM_LINES> samples, size_t offset, const float xCoeff, const float yCoeff, const size_t todo) @@ -1379,14 +1450,13 @@ void VecAllpass::process(const al::span<ReverbUpdateLine,NUM_LINES> samples, siz * same direction as the source) from the main delay line. These are * attenuated and all-pass filtered (based on the diffusion parameter). * - * The early lines are then fed in reverse (according to the approximately - * opposite spatial location of the A-Format lines) to create the secondary + * The early lines are then reflected about the origin to create the secondary * reflections (those arriving from the opposite direction as the source). * * The early response is then completed by combining the primary reflections * with the delayed and attenuated output from the early lines. * - * Finally, the early response is reversed, scattered (based on diffusion), + * Finally, the early response is reflected, scattered (based on diffusion), * and fed into the late reverb section of the main delay line. */ void ReverbPipeline::processEarly(size_t offset, const size_t samplesToDo, @@ -1442,8 +1512,7 @@ void ReverbPipeline::processEarly(size_t offset, const size_t samplesToDo, /* Apply a delay and bounce to generate secondary reflections, combine * with the primary reflections and write out the result for mixing. */ - for(size_t j{0u};j < NUM_LINES;j++) - early_delay.write(offset, NUM_LINES-1-j, tempSamples[j].data(), todo); + early_delay.writeReflected(offset, tempSamples, todo); for(size_t j{0u};j < NUM_LINES;j++) { size_t feedb_tap{offset - mEarly.Offset[j]}; @@ -1455,8 +1524,9 @@ void ReverbPipeline::processEarly(size_t offset, const size_t samplesToDo, feedb_tap &= early_delay.Mask; size_t td{minz(early_delay.Mask+1 - feedb_tap, todo - i)}; do { - tempSamples[j][i] += early_delay.Line[feedb_tap++][j]*feedb_coeff; - out[i] = tempSamples[j][i]; + float sample{early_delay.Line[feedb_tap++][j]}; + out[i] = tempSamples[j][i] + sample*feedb_coeff; + tempSamples[j][i] = sample; ++i; } while(--td); } @@ -1475,14 +1545,19 @@ void ReverbPipeline::processEarly(size_t offset, const size_t samplesToDo, void Modulation::calcDelays(size_t todo) { - constexpr float mod_scale{al::numbers::pi_v<float> * 2.0f / MOD_FRACONE}; uint idx{Index}; const uint step{Step}; const float depth{Depth}; for(size_t i{0};i < todo;++i) { idx += step; - const float lfo{std::sin(static_cast<float>(idx&MOD_FRACMASK) * mod_scale)}; + const float x{static_cast<float>(idx&MOD_FRACMASK) * (1.0f/MOD_FRACONE)}; + /* Approximate sin(x*2pi). As long as it roughly fits a sinusoid shape + * and stays within [-1...+1], it needn't be perfect. + */ + const float lfo{!(idx&(MOD_FRACONE>>1)) + ? ((-16.0f * x * x) + (8.0f * x)) + : ((16.0f * x * x) + (-8.0f * x) + (-16.0f * x) + 8.0f)}; ModDelays[i] = (lfo+1.0f) * depth; } Index = idx; |