Diffstat (limited to 'common')
-rw-r--r-- | common/albit.h | 12
-rw-r--r-- | common/albyte.h | 17
-rw-r--r-- | common/alcomplex.cpp | 142
-rw-r--r-- | common/alcomplex.h | 18
-rw-r--r-- | common/aldeque.h | 16
-rw-r--r-- | common/almalloc.h | 75
-rw-r--r-- | common/alnumbers.h | 16
-rw-r--r-- | common/alnumeric.h | 64
-rw-r--r-- | common/aloptional.h | 353
-rw-r--r-- | common/alsem.cpp (renamed from common/threads.cpp) | 81
-rw-r--r-- | common/alsem.h (renamed from common/threads.h) | 27
-rw-r--r-- | common/alspan.h | 115
-rw-r--r-- | common/althrd_setname.cpp | 76
-rw-r--r-- | common/althrd_setname.h | 6
-rw-r--r-- | common/comptr.h | 129
-rw-r--r-- | common/dynload.cpp | 3
-rw-r--r-- | common/opthelpers.h | 3
-rw-r--r-- | common/pffft.cpp | 2259
-rw-r--r-- | common/pffft.h | 192
-rw-r--r-- | common/phase_shifter.h | 39
-rw-r--r-- | common/polyphase_resampler.cpp | 76
-rw-r--r-- | common/ringbuffer.cpp | 85
-rw-r--r-- | common/ringbuffer.h | 39
-rw-r--r-- | common/strutils.cpp | 29
-rw-r--r-- | common/strutils.h | 12
-rw-r--r-- | common/win_main_utf8.h | 13
26 files changed, 3004 insertions, 893 deletions
diff --git a/common/albit.h b/common/albit.h index ad596208..82a4a00d 100644 --- a/common/albit.h +++ b/common/albit.h @@ -2,7 +2,9 @@ #define AL_BIT_H #include <cstdint> +#include <cstring> #include <limits> +#include <new> #include <type_traits> #if !defined(__GNUC__) && (defined(_WIN32) || defined(_WIN64)) #include <intrin.h> @@ -10,6 +12,16 @@ namespace al { +template<typename To, typename From> +std::enable_if_t<sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From> + && std::is_trivially_copyable_v<To>, +To> bit_cast(const From &src) noexcept +{ + alignas(To) char dst[sizeof(To)]; + std::memcpy(&dst[0], &src, sizeof(To)); + return *std::launder(reinterpret_cast<To*>(&dst[0])); +} + #ifdef __BYTE_ORDER__ enum class endian { little = __ORDER_LITTLE_ENDIAN__, diff --git a/common/albyte.h b/common/albyte.h deleted file mode 100644 index be586869..00000000 --- a/common/albyte.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef AL_BYTE_H -#define AL_BYTE_H - -#include <cstddef> -#include <cstdint> -#include <limits> -#include <type_traits> - -using uint = unsigned int; - -namespace al { - -using byte = unsigned char; - -} // namespace al - -#endif /* AL_BYTE_H */ diff --git a/common/alcomplex.cpp b/common/alcomplex.cpp index 4420a1bb..82a0c43c 100644 --- a/common/alcomplex.cpp +++ b/common/alcomplex.cpp @@ -4,6 +4,7 @@ #include "alcomplex.h" #include <algorithm> +#include <array> #include <cassert> #include <cmath> #include <cstddef> @@ -20,6 +21,7 @@ namespace { using ushort = unsigned short; using ushort2 = std::pair<ushort,ushort>; +using complex_d = std::complex<double>; constexpr size_t BitReverseCounter(size_t log2_size) noexcept { @@ -34,7 +36,7 @@ template<size_t N> struct BitReverser { static_assert(N <= sizeof(ushort)*8, "Too many bits for the bit-reversal table."); - ushort2 mData[BitReverseCounter(N)]{}; + std::array<ushort2,BitReverseCounter(N)> mData{}; constexpr BitReverser() { @@ -44,12 +46,13 @@ struct BitReverser { /* Bit-reversal permutation applied to a sequence of fftsize items. */ for(size_t idx{1u};idx < fftsize-1;++idx) { - size_t revidx{0u}, imask{idx}; - for(size_t i{0};i < N;++i) - { - revidx = (revidx<<1) | (imask&1); - imask >>= 1; - } + size_t revidx{idx}; + revidx = ((revidx&0xaaaaaaaa) >> 1) | ((revidx&0x55555555) << 1); + revidx = ((revidx&0xcccccccc) >> 2) | ((revidx&0x33333333) << 2); + revidx = ((revidx&0xf0f0f0f0) >> 4) | ((revidx&0x0f0f0f0f) << 4); + revidx = ((revidx&0xff00ff00) >> 8) | ((revidx&0x00ff00ff) << 8); + revidx = (revidx >> 16) | ((revidx&0x0000ffff) << 16); + revidx >>= 32-N; if(idx < revidx) { @@ -58,14 +61,13 @@ struct BitReverser { ++ret_i; } } - assert(ret_i == al::size(mData)); + assert(ret_i == std::size(mData)); } }; -/* These bit-reversal swap tables support up to 10-bit indices (1024 elements), - * which is the largest used by OpenAL Soft's filters and effects. Larger FFT - * requests, used by some utilities where performance is less important, will - * use a slower table-less path. +/* These bit-reversal swap tables support up to 11-bit indices (2048 elements), + * which is large enough for the filters and effects in OpenAL Soft. Larger FFT + * requests will use a slower table-less path. 
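 *
 * As a worked illustration (not from the original sources): the mask-based
 * reversal in the constructor above reverses all 32 bits of the index, then
 * shifts right by 32-N so only the low N bits remain. With N=4 and
 * idx=0b0011, the full 32-bit reversal gives 0xc0000000, and
 * 0xc0000000 >> 28 == 0b1100, so elements 3 and 12 get swapped.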
*/ constexpr BitReverser<2> BitReverser2{}; constexpr BitReverser<3> BitReverser3{}; @@ -76,7 +78,8 @@ constexpr BitReverser<7> BitReverser7{}; constexpr BitReverser<8> BitReverser8{}; constexpr BitReverser<9> BitReverser9{}; constexpr BitReverser<10> BitReverser10{}; -constexpr std::array<al::span<const ushort2>,11> gBitReverses{{ +constexpr BitReverser<11> BitReverser11{}; +constexpr std::array<al::span<const ushort2>,12> gBitReverses{{ {}, {}, BitReverser2.mData, BitReverser3.mData, @@ -86,14 +89,29 @@ constexpr std::array<al::span<const ushort2>,11> gBitReverses{{ BitReverser7.mData, BitReverser8.mData, BitReverser9.mData, - BitReverser10.mData + BitReverser10.mData, + BitReverser11.mData +}}; + +/* Lookup table for std::polar(1, pi / (1<<index)); */ +template<typename T> +constexpr std::array<std::complex<T>,gBitReverses.size()-1> gArgAngle{{ + {static_cast<T>(-1.00000000000000000e+00), static_cast<T>(0.00000000000000000e+00)}, + {static_cast<T>( 0.00000000000000000e+00), static_cast<T>(1.00000000000000000e+00)}, + {static_cast<T>( 7.07106781186547524e-01), static_cast<T>(7.07106781186547524e-01)}, + {static_cast<T>( 9.23879532511286756e-01), static_cast<T>(3.82683432365089772e-01)}, + {static_cast<T>( 9.80785280403230449e-01), static_cast<T>(1.95090322016128268e-01)}, + {static_cast<T>( 9.95184726672196886e-01), static_cast<T>(9.80171403295606020e-02)}, + {static_cast<T>( 9.98795456205172393e-01), static_cast<T>(4.90676743274180143e-02)}, + {static_cast<T>( 9.99698818696204220e-01), static_cast<T>(2.45412285229122880e-02)}, + {static_cast<T>( 9.99924701839144541e-01), static_cast<T>(1.22715382857199261e-02)}, + {static_cast<T>( 9.99981175282601143e-01), static_cast<T>(6.13588464915447536e-03)}, + {static_cast<T>( 9.99995293809576172e-01), static_cast<T>(3.06795676296597627e-03)} }}; } // namespace -template<typename Real> -std::enable_if_t<std::is_floating_point<Real>::value> -complex_fft(const al::span<std::complex<Real>> buffer, const al::type_identity_t<Real> sign) +void complex_fft(const al::span<std::complex<double>> buffer, const double sign) { const size_t fftsize{buffer.size()}; /* Get the number of bits used for indexing. Simplifies bit-reversal and @@ -101,48 +119,82 @@ complex_fft(const al::span<std::complex<Real>> buffer, const al::type_identity_t */ const size_t log2_size{static_cast<size_t>(al::countr_zero(fftsize))}; - if(log2_size >= gBitReverses.size()) UNLIKELY + if(log2_size < gBitReverses.size()) LIKELY { - for(size_t idx{1u};idx < fftsize-1;++idx) + for(auto &rev : gBitReverses[log2_size]) + std::swap(buffer[rev.first], buffer[rev.second]); + + /* Iterative form of Danielson-Lanczos lemma */ + for(size_t i{0};i < log2_size;++i) { - size_t revidx{0u}, imask{idx}; - for(size_t i{0};i < log2_size;++i) + const size_t step2{1_uz << i}; + const size_t step{2_uz << i}; + /* The first iteration of the inner loop would have u=1, which we + * can simplify to remove a number of complex multiplies. 
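 *
 * For illustration (not from the original sources): with u = w^0 = 1+0i,
 * the generic butterfly
 *
 *   temp = buffer[k+step2] * u;
 *   buffer[k+step2] = buffer[k] - temp;
 *   buffer[k] += temp;
 *
 * loses the multiply, leaving only the add/subtract pair done below for
 * j=0, before the j >= 1 iterations apply the twiddle factor u.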
+ */ + for(size_t k{0};k < fftsize;k+=step) { - revidx = (revidx<<1) | (imask&1); - imask >>= 1; + const complex_d temp{buffer[k+step2]}; + buffer[k+step2] = buffer[k] - temp; + buffer[k] += temp; } - if(idx < revidx) - std::swap(buffer[idx], buffer[revidx]); + const complex_d w{gArgAngle<double>[i].real(), gArgAngle<double>[i].imag()*sign}; + complex_d u{w}; + for(size_t j{1};j < step2;j++) + { + for(size_t k{j};k < fftsize;k+=step) + { + const complex_d temp{buffer[k+step2] * u}; + buffer[k+step2] = buffer[k] - temp; + buffer[k] += temp; + } + u *= w; + } } } - else for(auto &rev : gBitReverses[log2_size]) - std::swap(buffer[rev.first], buffer[rev.second]); - - /* Iterative form of Danielson-Lanczos lemma */ - const Real pi{al::numbers::pi_v<Real> * sign}; - size_t step2{1u}; - for(size_t i{0};i < log2_size;++i) + else { - const Real arg{pi / static_cast<Real>(step2)}; + for(size_t idx{1u};idx < fftsize-1;++idx) + { + size_t revidx{idx}; + revidx = ((revidx&0xaaaaaaaa) >> 1) | ((revidx&0x55555555) << 1); + revidx = ((revidx&0xcccccccc) >> 2) | ((revidx&0x33333333) << 2); + revidx = ((revidx&0xf0f0f0f0) >> 4) | ((revidx&0x0f0f0f0f) << 4); + revidx = ((revidx&0xff00ff00) >> 8) | ((revidx&0x00ff00ff) << 8); + revidx = (revidx >> 16) | ((revidx&0x0000ffff) << 16); + revidx >>= 32-log2_size; + + if(idx < revidx) + std::swap(buffer[idx], buffer[revidx]); + } - /* TODO: Would std::polar(1.0, arg) be any better? */ - const std::complex<Real> w{std::cos(arg), std::sin(arg)}; - std::complex<Real> u{1.0, 0.0}; - const size_t step{step2 << 1}; - for(size_t j{0};j < step2;j++) + const double pi{al::numbers::pi * sign}; + for(size_t i{0};i < log2_size;++i) { - for(size_t k{j};k < fftsize;k+=step) + const size_t step2{1_uz << i}; + const size_t step{2_uz << i}; + for(size_t k{0};k < fftsize;k+=step) { - std::complex<Real> temp{buffer[k+step2] * u}; + const complex_d temp{buffer[k+step2]}; buffer[k+step2] = buffer[k] - temp; buffer[k] += temp; } - u *= w; + const double arg{pi / static_cast<double>(step2)}; + const complex_d w{std::polar(1.0, arg)}; + complex_d u{w}; + for(size_t j{1};j < step2;j++) + { + for(size_t k{j};k < fftsize;k+=step) + { + const complex_d temp{buffer[k+step2] * u}; + buffer[k+step2] = buffer[k] - temp; + buffer[k] += temp; + } + u *= w; + } } - - step2 <<= 1; } } @@ -165,7 +217,3 @@ void complex_hilbert(const al::span<std::complex<double>> buffer) forward_fft(buffer); } - - -template void complex_fft<>(const al::span<std::complex<float>> buffer, const float sign); -template void complex_fft<>(const al::span<std::complex<double>> buffer, const double sign); diff --git a/common/alcomplex.h b/common/alcomplex.h index 794c3526..f730ba9e 100644 --- a/common/alcomplex.h +++ b/common/alcomplex.h @@ -11,27 +11,23 @@ * FFT and 1 is inverse FFT. Applies the Discrete Fourier Transform (DFT) to * the data supplied in the buffer, which MUST BE power of two. */ -template<typename Real> -std::enable_if_t<std::is_floating_point<Real>::value> -complex_fft(const al::span<std::complex<Real>> buffer, const al::type_identity_t<Real> sign); +void complex_fft(const al::span<std::complex<double>> buffer, const double sign); /** * Calculate the frequency-domain response of the time-domain signal in the * provided buffer, which MUST BE power of two. 
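 *
 * An illustrative usage sketch (not from the original header; the buffer
 * size and contents are arbitrary, and the transforms are unnormalized, so
 * a forward/inverse round trip scales the data by the buffer length):
 *
 *   std::array<std::complex<double>,8> buf{};
 *   buf[1] = {1.0, 0.0};
 *   forward_fft(al::span{buf});  // time domain -> frequency domain
 *   inverse_fft(al::span{buf});  // back to time domain, scaled by 8
 *
 * where al::span{buf} relies on the deduction guides added in alspan.h.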
*/ -template<typename Real, size_t N> -std::enable_if_t<std::is_floating_point<Real>::value> -forward_fft(const al::span<std::complex<Real>,N> buffer) -{ complex_fft(buffer.subspan(0), -1); } +template<size_t N> +void forward_fft(const al::span<std::complex<double>,N> buffer) +{ complex_fft(al::span<std::complex<double>>{buffer}, -1.0); } /** * Calculate the time-domain signal of the frequency-domain response in the * provided buffer, which MUST BE power of two. */ -template<typename Real, size_t N> -std::enable_if_t<std::is_floating_point<Real>::value> -inverse_fft(const al::span<std::complex<Real>,N> buffer) -{ complex_fft(buffer.subspan(0), 1); } +template<size_t N> +void inverse_fft(const al::span<std::complex<double>,N> buffer) +{ complex_fft(al::span<std::complex<double>>{buffer}, +1.0); } /** * Calculate the complex helical sequence (discrete-time analytical signal) of diff --git a/common/aldeque.h b/common/aldeque.h deleted file mode 100644 index 3f99bf00..00000000 --- a/common/aldeque.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef ALDEQUE_H -#define ALDEQUE_H - -#include <deque> - -#include "almalloc.h" - - -namespace al { - -template<typename T> -using deque = std::deque<T, al::allocator<T>>; - -} // namespace al - -#endif /* ALDEQUE_H */ diff --git a/common/almalloc.h b/common/almalloc.h index a795fc3b..873473ca 100644 --- a/common/almalloc.h +++ b/common/almalloc.h @@ -117,7 +117,9 @@ constexpr T *to_address(T *p) noexcept template<typename T> constexpr auto to_address(const T &p) noexcept -{ return to_address(p.operator->()); } +{ + return ::al::to_address(p.operator->()); +} template<typename T, typename ...Args> @@ -125,71 +127,6 @@ constexpr T* construct_at(T *ptr, Args&& ...args) noexcept(std::is_nothrow_constructible<T, Args...>::value) { return ::new(static_cast<void*>(ptr)) T{std::forward<Args>(args)...}; } -/* At least VS 2015 complains that 'ptr' is unused when the given type's - * destructor is trivial (a no-op). So disable that warning for this call. - */ -DIAGNOSTIC_PUSH -msc_pragma(warning(disable : 4100)) -template<typename T> -constexpr std::enable_if_t<!std::is_array<T>::value> -destroy_at(T *ptr) noexcept(std::is_nothrow_destructible<T>::value) -{ ptr->~T(); } -DIAGNOSTIC_POP -template<typename T> -constexpr std::enable_if_t<std::is_array<T>::value> -destroy_at(T *ptr) noexcept(std::is_nothrow_destructible<std::remove_all_extents_t<T>>::value) -{ - for(auto &elem : *ptr) - al::destroy_at(std::addressof(elem)); -} - -template<typename T> -constexpr void destroy(T first, T end) noexcept(noexcept(al::destroy_at(std::addressof(*first)))) -{ - while(first != end) - { - al::destroy_at(std::addressof(*first)); - ++first; - } -} - -template<typename T, typename N> -constexpr std::enable_if_t<std::is_integral<N>::value,T> -destroy_n(T first, N count) noexcept(noexcept(al::destroy_at(std::addressof(*first)))) -{ - if(count != 0) - { - do { - al::destroy_at(std::addressof(*first)); - ++first; - } while(--count); - } - return first; -} - - -template<typename T, typename N> -inline std::enable_if_t<std::is_integral<N>::value, -T> uninitialized_default_construct_n(T first, N count) -{ - using ValueT = typename std::iterator_traits<T>::value_type; - T current{first}; - if(count != 0) - { - try { - do { - ::new(static_cast<void*>(std::addressof(*current))) ValueT; - ++current; - } while(--count); - } - catch(...) { - al::destroy(first, current); - throw; - } - } - return current; -} - /* Storage for flexible array data. 
This is trivially destructible if type T is * trivially destructible. @@ -209,7 +146,7 @@ struct FlexArrayStorage { } FlexArrayStorage(size_t size) : mSize{size} - { al::uninitialized_default_construct_n(mArray, mSize); } + { std::uninitialized_default_construct_n(mArray, mSize); } ~FlexArrayStorage() = default; FlexArrayStorage(const FlexArrayStorage&) = delete; @@ -231,8 +168,8 @@ struct FlexArrayStorage<T,alignment,false> { } FlexArrayStorage(size_t size) : mSize{size} - { al::uninitialized_default_construct_n(mArray, mSize); } - ~FlexArrayStorage() { al::destroy_n(mArray, mSize); } + { std::uninitialized_default_construct_n(mArray, mSize); } + ~FlexArrayStorage() { std::destroy_n(mArray, mSize); } FlexArrayStorage(const FlexArrayStorage&) = delete; FlexArrayStorage& operator=(const FlexArrayStorage&) = delete; diff --git a/common/alnumbers.h b/common/alnumbers.h index 37a55410..e92d7b87 100644 --- a/common/alnumbers.h +++ b/common/alnumbers.h @@ -13,21 +13,21 @@ namespace detail_ { } // detail_ template<typename T> -static constexpr auto pi_v = detail_::as_fp<T>(3.141592653589793238462643383279502884L); +inline constexpr auto pi_v = detail_::as_fp<T>(3.141592653589793238462643383279502884L); template<typename T> -static constexpr auto inv_pi_v = detail_::as_fp<T>(0.318309886183790671537767526745028724L); +inline constexpr auto inv_pi_v = detail_::as_fp<T>(0.318309886183790671537767526745028724L); template<typename T> -static constexpr auto sqrt2_v = detail_::as_fp<T>(1.414213562373095048801688724209698079L); +inline constexpr auto sqrt2_v = detail_::as_fp<T>(1.414213562373095048801688724209698079L); template<typename T> -static constexpr auto sqrt3_v = detail_::as_fp<T>(1.732050807568877293527446341505872367L); +inline constexpr auto sqrt3_v = detail_::as_fp<T>(1.732050807568877293527446341505872367L); -static constexpr auto pi = pi_v<double>; -static constexpr auto inv_pi = inv_pi_v<double>; -static constexpr auto sqrt2 = sqrt2_v<double>; -static constexpr auto sqrt3 = sqrt3_v<double>; +inline constexpr auto pi = pi_v<double>; +inline constexpr auto inv_pi = inv_pi_v<double>; +inline constexpr auto sqrt2 = sqrt2_v<double>; +inline constexpr auto sqrt3 = sqrt3_v<double>; } // namespace numbers diff --git a/common/alnumeric.h b/common/alnumeric.h index d6919e40..6281b012 100644 --- a/common/alnumeric.h +++ b/common/alnumeric.h @@ -5,6 +5,7 @@ #include <cmath> #include <cstddef> #include <cstdint> +#include <type_traits> #ifdef HAVE_INTRIN_H #include <intrin.h> #endif @@ -12,12 +13,18 @@ #include <xmmintrin.h> #endif +#include "albit.h" #include "altraits.h" #include "opthelpers.h" -inline constexpr int64_t operator "" _i64(unsigned long long int n) noexcept { return static_cast<int64_t>(n); } -inline constexpr uint64_t operator "" _u64(unsigned long long int n) noexcept { return static_cast<uint64_t>(n); } +constexpr auto operator "" _i64(unsigned long long n) noexcept { return static_cast<int64_t>(n); } +constexpr auto operator "" _u64(unsigned long long n) noexcept { return static_cast<uint64_t>(n); } + +constexpr auto operator "" _z(unsigned long long n) noexcept +{ return static_cast<std::make_signed_t<size_t>>(n); } +constexpr auto operator "" _uz(unsigned long long n) noexcept { return static_cast<size_t>(n); } +constexpr auto operator "" _zu(unsigned long long n) noexcept { return static_cast<size_t>(n); } constexpr inline float minf(float a, float b) noexcept @@ -82,6 +89,9 @@ constexpr inline float cubic(float val1, float val2, float val3, float val4, flo return val1*a0 + 
val2*a1 + val3*a2 + val4*a3; } +constexpr inline double lerpd(double val1, double val2, double mu) noexcept +{ return val1 + (val2-val1)*mu; } + /** Find the next power-of-2 for non-power-of-2 numbers. */ inline uint32_t NextPowerOf2(uint32_t value) noexcept @@ -159,21 +169,16 @@ inline int float2int(float f) noexcept #elif (defined(_MSC_VER) && defined(_M_IX86_FP) && _M_IX86_FP == 0) \ || ((defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__)) \ && !defined(__SSE_MATH__)) - int sign, shift, mant; - union { - float f; - int i; - } conv; + const int conv_i{al::bit_cast<int>(f)}; - conv.f = f; - sign = (conv.i>>31) | 1; - shift = ((conv.i>>23)&0xff) - (127+23); + const int sign{(conv_i>>31) | 1}; + const int shift{((conv_i>>23)&0xff) - (127+23)}; /* Over/underflow */ if(shift >= 31 || shift < -23) UNLIKELY return 0; - mant = (conv.i&0x7fffff) | 0x800000; + const int mant{(conv_i&0x7fffff) | 0x800000}; if(shift < 0) LIKELY return (mant >> -shift) * sign; return (mant << shift) * sign; @@ -195,25 +200,19 @@ inline int double2int(double d) noexcept #elif (defined(_MSC_VER) && defined(_M_IX86_FP) && _M_IX86_FP < 2) \ || ((defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__)) \ && !defined(__SSE2_MATH__)) - int sign, shift; - int64_t mant; - union { - double d; - int64_t i64; - } conv; + const int64_t conv_i64{al::bit_cast<int64_t>(d)}; - conv.d = d; - sign = (conv.i64 >> 63) | 1; - shift = ((conv.i64 >> 52) & 0x7ff) - (1023 + 52); + const int sign{static_cast<int>(conv_i64 >> 63) | 1}; + const int shift{(static_cast<int>(conv_i64 >> 52) & 0x7ff) - (1023 + 52)}; /* Over/underflow */ if(shift >= 63 || shift < -52) UNLIKELY return 0; - mant = (conv.i64 & 0xfffffffffffff_i64) | 0x10000000000000_i64; + const int64_t mant{(conv_i64 & 0xfffffffffffff_i64) | 0x10000000000000_i64}; if(shift < 0) LIKELY - return (int)(mant >> -shift) * sign; - return (int)(mant << shift) * sign; + return static_cast<int>(mant >> -shift) * sign; + return static_cast<int>(mant << shift) * sign; #else @@ -246,19 +245,14 @@ inline float fast_roundf(float f) noexcept /* Integral limit, where sub-integral precision is not available for * floats. */ - static const float ilim[2]{ + static constexpr float ilim[2]{ 8388608.0f /* 0x1.0p+23 */, -8388608.0f /* -0x1.0p+23 */ }; - unsigned int sign, expo; - union { - float f; - unsigned int i; - } conv; + const unsigned int conv_i{al::bit_cast<unsigned int>(f)}; - conv.f = f; - sign = (conv.i>>31)&0x01; - expo = (conv.i>>23)&0xff; + const unsigned int sign{(conv_i>>31)&0x01}; + const unsigned int expo{(conv_i>>23)&0xff}; if(expo >= 150/*+23*/) UNLIKELY { @@ -283,12 +277,6 @@ inline float fast_roundf(float f) noexcept } -template<typename T> -constexpr const T& clamp(const T& value, const T& min_value, const T& max_value) noexcept -{ - return std::min(std::max(value, min_value), max_value); -} - // Converts level (mB) to gain. inline float level_mb_to_gain(float x) { diff --git a/common/aloptional.h b/common/aloptional.h deleted file mode 100644 index 6de16799..00000000 --- a/common/aloptional.h +++ /dev/null @@ -1,353 +0,0 @@ -#ifndef AL_OPTIONAL_H -#define AL_OPTIONAL_H - -#include <initializer_list> -#include <type_traits> -#include <utility> - -#include "almalloc.h" - -namespace al { - -struct nullopt_t { }; -struct in_place_t { }; - -constexpr nullopt_t nullopt{}; -constexpr in_place_t in_place{}; - -#define NOEXCEPT_AS(...) 
noexcept(noexcept(__VA_ARGS__)) - -namespace detail_ { -/* Base storage struct for an optional. Defines a trivial destructor, for types - * that can be trivially destructed. - */ -template<typename T, bool = std::is_trivially_destructible<T>::value> -struct optstore_base { - bool mHasValue{false}; - union { - char mDummy{}; - T mValue; - }; - - constexpr optstore_base() noexcept { } - template<typename ...Args> - constexpr explicit optstore_base(in_place_t, Args&& ...args) - noexcept(std::is_nothrow_constructible<T, Args...>::value) - : mHasValue{true}, mValue{std::forward<Args>(args)...} - { } - ~optstore_base() = default; -}; - -/* Specialization needing a non-trivial destructor. */ -template<typename T> -struct optstore_base<T, false> { - bool mHasValue{false}; - union { - char mDummy{}; - T mValue; - }; - - constexpr optstore_base() noexcept { } - template<typename ...Args> - constexpr explicit optstore_base(in_place_t, Args&& ...args) - noexcept(std::is_nothrow_constructible<T, Args...>::value) - : mHasValue{true}, mValue{std::forward<Args>(args)...} - { } - ~optstore_base() { if(mHasValue) al::destroy_at(std::addressof(mValue)); } -}; - -/* Next level of storage, which defines helpers to construct and destruct the - * stored object. - */ -template<typename T> -struct optstore_helper : public optstore_base<T> { - using optstore_base<T>::optstore_base; - - template<typename... Args> - constexpr void construct(Args&& ...args) noexcept(std::is_nothrow_constructible<T, Args...>::value) - { - al::construct_at(std::addressof(this->mValue), std::forward<Args>(args)...); - this->mHasValue = true; - } - - constexpr void reset() noexcept - { - if(this->mHasValue) - al::destroy_at(std::addressof(this->mValue)); - this->mHasValue = false; - } - - constexpr void assign(const optstore_helper &rhs) - noexcept(std::is_nothrow_copy_constructible<T>::value - && std::is_nothrow_copy_assignable<T>::value) - { - if(!rhs.mHasValue) - this->reset(); - else if(this->mHasValue) - this->mValue = rhs.mValue; - else - this->construct(rhs.mValue); - } - - constexpr void assign(optstore_helper&& rhs) - noexcept(std::is_nothrow_move_constructible<T>::value - && std::is_nothrow_move_assignable<T>::value) - { - if(!rhs.mHasValue) - this->reset(); - else if(this->mHasValue) - this->mValue = std::move(rhs.mValue); - else - this->construct(std::move(rhs.mValue)); - } -}; - -/* Define copy and move constructors and assignment operators, which may or may - * not be trivial. - */ -template<typename T, bool trivial_copy = std::is_trivially_copy_constructible<T>::value, - bool trivial_move = std::is_trivially_move_constructible<T>::value, - /* Trivial assignment is dependent on trivial construction+destruction. */ - bool = trivial_copy && std::is_trivially_copy_assignable<T>::value - && std::is_trivially_destructible<T>::value, - bool = trivial_move && std::is_trivially_move_assignable<T>::value - && std::is_trivially_destructible<T>::value> -struct optional_storage; - -/* Some versions of GCC have issues with 'this' in the following noexcept(...) - * statements, so this macro is a workaround. - */ -#define _this std::declval<optional_storage*>() - -/* Completely trivial. 
*/ -template<typename T> -struct optional_storage<T, true, true, true, true> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage&) = default; - constexpr optional_storage(optional_storage&&) = default; - constexpr optional_storage& operator=(const optional_storage&) = default; - constexpr optional_storage& operator=(optional_storage&&) = default; -}; - -/* Non-trivial move assignment. */ -template<typename T> -struct optional_storage<T, true, true, true, false> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage&) = default; - constexpr optional_storage(optional_storage&&) = default; - constexpr optional_storage& operator=(const optional_storage&) = default; - constexpr optional_storage& operator=(optional_storage&& rhs) NOEXCEPT_AS(_this->assign(std::move(rhs))) - { this->assign(std::move(rhs)); return *this; } -}; - -/* Non-trivial move construction. */ -template<typename T> -struct optional_storage<T, true, false, true, false> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage&) = default; - constexpr optional_storage(optional_storage&& rhs) NOEXCEPT_AS(_this->construct(std::move(rhs.mValue))) - { if(rhs.mHasValue) this->construct(std::move(rhs.mValue)); } - constexpr optional_storage& operator=(const optional_storage&) = default; - constexpr optional_storage& operator=(optional_storage&& rhs) NOEXCEPT_AS(_this->assign(std::move(rhs))) - { this->assign(std::move(rhs)); return *this; } -}; - -/* Non-trivial copy assignment. */ -template<typename T> -struct optional_storage<T, true, true, false, true> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage&) = default; - constexpr optional_storage(optional_storage&&) = default; - constexpr optional_storage& operator=(const optional_storage &rhs) NOEXCEPT_AS(_this->assign(rhs)) - { this->assign(rhs); return *this; } - constexpr optional_storage& operator=(optional_storage&&) = default; -}; - -/* Non-trivial copy construction. */ -template<typename T> -struct optional_storage<T, false, true, false, true> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage &rhs) NOEXCEPT_AS(_this->construct(rhs.mValue)) - { if(rhs.mHasValue) this->construct(rhs.mValue); } - constexpr optional_storage(optional_storage&&) = default; - constexpr optional_storage& operator=(const optional_storage &rhs) NOEXCEPT_AS(_this->assign(rhs)) - { this->assign(rhs); return *this; } - constexpr optional_storage& operator=(optional_storage&&) = default; -}; - -/* Non-trivial assignment. 
*/ -template<typename T> -struct optional_storage<T, true, true, false, false> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage&) = default; - constexpr optional_storage(optional_storage&&) = default; - constexpr optional_storage& operator=(const optional_storage &rhs) NOEXCEPT_AS(_this->assign(rhs)) - { this->assign(rhs); return *this; } - constexpr optional_storage& operator=(optional_storage&& rhs) NOEXCEPT_AS(_this->assign(std::move(rhs))) - { this->assign(std::move(rhs)); return *this; } -}; - -/* Non-trivial assignment, non-trivial move construction. */ -template<typename T> -struct optional_storage<T, true, false, false, false> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage&) = default; - constexpr optional_storage(optional_storage&& rhs) NOEXCEPT_AS(_this->construct(std::move(rhs.mValue))) - { if(rhs.mHasValue) this->construct(std::move(rhs.mValue)); } - constexpr optional_storage& operator=(const optional_storage &rhs) NOEXCEPT_AS(_this->assign(rhs)) - { this->assign(rhs); return *this; } - constexpr optional_storage& operator=(optional_storage&& rhs) NOEXCEPT_AS(_this->assign(std::move(rhs))) - { this->assign(std::move(rhs)); return *this; } -}; - -/* Non-trivial assignment, non-trivial copy construction. */ -template<typename T> -struct optional_storage<T, false, true, false, false> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage &rhs) NOEXCEPT_AS(_this->construct(rhs.mValue)) - { if(rhs.mHasValue) this->construct(rhs.mValue); } - constexpr optional_storage(optional_storage&&) = default; - constexpr optional_storage& operator=(const optional_storage &rhs) NOEXCEPT_AS(_this->assign(rhs)) - { this->assign(rhs); return *this; } - constexpr optional_storage& operator=(optional_storage&& rhs) NOEXCEPT_AS(_this->assign(std::move(rhs))) - { this->assign(std::move(rhs)); return *this; } -}; - -/* Completely non-trivial. */ -template<typename T> -struct optional_storage<T, false, false, false, false> : public optstore_helper<T> { - using optstore_helper<T>::optstore_helper; - constexpr optional_storage() noexcept = default; - constexpr optional_storage(const optional_storage &rhs) NOEXCEPT_AS(_this->construct(rhs.mValue)) - { if(rhs.mHasValue) this->construct(rhs.mValue); } - constexpr optional_storage(optional_storage&& rhs) NOEXCEPT_AS(_this->construct(std::move(rhs.mValue))) - { if(rhs.mHasValue) this->construct(std::move(rhs.mValue)); } - constexpr optional_storage& operator=(const optional_storage &rhs) NOEXCEPT_AS(_this->assign(rhs)) - { this->assign(rhs); return *this; } - constexpr optional_storage& operator=(optional_storage&& rhs) NOEXCEPT_AS(_this->assign(std::move(rhs))) - { this->assign(std::move(rhs)); return *this; } -}; - -#undef _this - -} // namespace detail_ - -#define REQUIRES(...) 
std::enable_if_t<(__VA_ARGS__),bool> = true - -template<typename T> -class optional { - using storage_t = detail_::optional_storage<T>; - - storage_t mStore{}; - -public: - using value_type = T; - - constexpr optional() = default; - constexpr optional(const optional&) = default; - constexpr optional(optional&&) = default; - constexpr optional(nullopt_t) noexcept { } - template<typename ...Args> - constexpr explicit optional(in_place_t, Args&& ...args) - NOEXCEPT_AS(storage_t{al::in_place, std::forward<Args>(args)...}) - : mStore{al::in_place, std::forward<Args>(args)...} - { } - template<typename U, REQUIRES(std::is_constructible<T, U&&>::value - && !std::is_same<std::decay_t<U>, al::in_place_t>::value - && !std::is_same<std::decay_t<U>, optional<T>>::value - && std::is_convertible<U&&, T>::value)> - constexpr optional(U&& rhs) NOEXCEPT_AS(storage_t{al::in_place, std::forward<U>(rhs)}) - : mStore{al::in_place, std::forward<U>(rhs)} - { } - template<typename U, REQUIRES(std::is_constructible<T, U&&>::value - && !std::is_same<std::decay_t<U>, al::in_place_t>::value - && !std::is_same<std::decay_t<U>, optional<T>>::value - && !std::is_convertible<U&&, T>::value)> - constexpr explicit optional(U&& rhs) NOEXCEPT_AS(storage_t{al::in_place, std::forward<U>(rhs)}) - : mStore{al::in_place, std::forward<U>(rhs)} - { } - ~optional() = default; - - constexpr optional& operator=(const optional&) = default; - constexpr optional& operator=(optional&&) = default; - constexpr optional& operator=(nullopt_t) noexcept { mStore.reset(); return *this; } - template<typename U=T> - constexpr std::enable_if_t<std::is_constructible<T, U>::value - && std::is_assignable<T&, U>::value - && !std::is_same<std::decay_t<U>, optional<T>>::value - && (!std::is_same<std::decay_t<U>, T>::value || !std::is_scalar<U>::value), - optional&> operator=(U&& rhs) - { - if(mStore.mHasValue) - mStore.mValue = std::forward<U>(rhs); - else - mStore.construct(std::forward<U>(rhs)); - return *this; - } - - constexpr const T* operator->() const { return std::addressof(mStore.mValue); } - constexpr T* operator->() { return std::addressof(mStore.mValue); } - constexpr const T& operator*() const& { return mStore.mValue; } - constexpr T& operator*() & { return mStore.mValue; } - constexpr const T&& operator*() const&& { return std::move(mStore.mValue); } - constexpr T&& operator*() && { return std::move(mStore.mValue); } - - constexpr explicit operator bool() const noexcept { return mStore.mHasValue; } - constexpr bool has_value() const noexcept { return mStore.mHasValue; } - - constexpr T& value() & { return mStore.mValue; } - constexpr const T& value() const& { return mStore.mValue; } - constexpr T&& value() && { return std::move(mStore.mValue); } - constexpr const T&& value() const&& { return std::move(mStore.mValue); } - - template<typename U> - constexpr T value_or(U&& defval) const& - { return bool(*this) ? **this : static_cast<T>(std::forward<U>(defval)); } - template<typename U> - constexpr T value_or(U&& defval) && - { return bool(*this) ? 
std::move(**this) : static_cast<T>(std::forward<U>(defval)); } - - template<typename ...Args> - constexpr T& emplace(Args&& ...args) - { - mStore.reset(); - mStore.construct(std::forward<Args>(args)...); - return mStore.mValue; - } - template<typename U, typename ...Args> - constexpr std::enable_if_t<std::is_constructible<T, std::initializer_list<U>&, Args&&...>::value, - T&> emplace(std::initializer_list<U> il, Args&& ...args) - { - mStore.reset(); - mStore.construct(il, std::forward<Args>(args)...); - return mStore.mValue; - } - - constexpr void reset() noexcept { mStore.reset(); } -}; - -template<typename T> -constexpr optional<std::decay_t<T>> make_optional(T&& arg) -{ return optional<std::decay_t<T>>{in_place, std::forward<T>(arg)}; } - -template<typename T, typename... Args> -constexpr optional<T> make_optional(Args&& ...args) -{ return optional<T>{in_place, std::forward<Args>(args)...}; } - -template<typename T, typename U, typename... Args> -constexpr optional<T> make_optional(std::initializer_list<U> il, Args&& ...args) -{ return optional<T>{in_place, il, std::forward<Args>(args)...}; } - -#undef REQUIRES -#undef NOEXCEPT_AS -} // namespace al - -#endif /* AL_OPTIONAL_H */ diff --git a/common/threads.cpp b/common/alsem.cpp index 19a6bbf0..6a92b35c 100644 --- a/common/threads.cpp +++ b/common/alsem.cpp @@ -20,11 +20,12 @@ #include "config.h" -#include "opthelpers.h" -#include "threads.h" +#include "alsem.h" #include <system_error> +#include "opthelpers.h" + #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN @@ -32,37 +33,6 @@ #include <limits> -void althrd_setname(const char *name) -{ -#if defined(_MSC_VER) && !defined(_M_ARM) - -#define MS_VC_EXCEPTION 0x406D1388 -#pragma pack(push,8) - struct { - DWORD dwType; // Must be 0x1000. - LPCSTR szName; // Pointer to name (in user addr space). - DWORD dwThreadID; // Thread ID (-1=caller thread). - DWORD dwFlags; // Reserved for future use, must be zero. - } info; -#pragma pack(pop) - info.dwType = 0x1000; - info.szName = name; - info.dwThreadID = ~DWORD{0}; - info.dwFlags = 0; - - __try { - RaiseException(MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info); - } - __except(EXCEPTION_CONTINUE_EXECUTION) { - } -#undef MS_VC_EXCEPTION - -#else - - (void)name; -#endif -} - namespace al { semaphore::semaphore(unsigned int initial) @@ -93,49 +63,8 @@ bool semaphore::try_wait() noexcept #else -#include <pthread.h> -#ifdef HAVE_PTHREAD_NP_H -#include <pthread_np.h> -#endif -#include <tuple> - -namespace { - -using setname_t1 = int(*)(const char*); -using setname_t2 = int(*)(pthread_t, const char*); -using setname_t3 = void(*)(pthread_t, const char*); -using setname_t4 = int(*)(pthread_t, const char*, void*); - -void setname_caller(setname_t1 func, const char *name) -{ func(name); } - -void setname_caller(setname_t2 func, const char *name) -{ func(pthread_self(), name); } - -void setname_caller(setname_t3 func, const char *name) -{ func(pthread_self(), name); } - -void setname_caller(setname_t4 func, const char *name) -{ func(pthread_self(), "%s", static_cast<void*>(const_cast<char*>(name))); } - -} // namespace - -void althrd_setname(const char *name) -{ -#if defined(HAVE_PTHREAD_SET_NAME_NP) - setname_caller(pthread_set_name_np, name); -#elif defined(HAVE_PTHREAD_SETNAME_NP) - setname_caller(pthread_setname_np, name); -#endif - /* Avoid unused function/parameter warnings. 
*/ - std::ignore = name; - std::ignore = static_cast<void(*)(setname_t1,const char*)>(&setname_caller); - std::ignore = static_cast<void(*)(setname_t2,const char*)>(&setname_caller); - std::ignore = static_cast<void(*)(setname_t3,const char*)>(&setname_caller); - std::ignore = static_cast<void(*)(setname_t4,const char*)>(&setname_caller); -} - -#ifdef __APPLE__ +/* Do not try using libdispatch on systems where it is absent. */ +#if defined(AL_APPLE_HAVE_DISPATCH) namespace al { diff --git a/common/threads.h b/common/alsem.h index 1cdb5d8f..9f72d1c6 100644 --- a/common/threads.h +++ b/common/alsem.h @@ -1,30 +1,25 @@ -#ifndef AL_THREADS_H -#define AL_THREADS_H - -#if defined(__GNUC__) && defined(__i386__) -/* force_align_arg_pointer is required for proper function arguments aligning - * when SSE code is used. Some systems (Windows, QNX) do not guarantee our - * thread functions will be properly aligned on the stack, even though GCC may - * generate code with the assumption that it is. */ -#define FORCE_ALIGN __attribute__((force_align_arg_pointer)) -#else -#define FORCE_ALIGN -#endif +#ifndef COMMON_ALSEM_H +#define COMMON_ALSEM_H #if defined(__APPLE__) +#include <AvailabilityMacros.h> +#include <TargetConditionals.h> +#if (((MAC_OS_X_VERSION_MIN_REQUIRED > 1050) && !defined(__ppc__)) || TARGET_OS_IOS || TARGET_OS_TV) #include <dispatch/dispatch.h> +#define AL_APPLE_HAVE_DISPATCH 1 +#else +#include <semaphore.h> /* Fallback option for Apple without a working libdispatch */ +#endif #elif !defined(_WIN32) #include <semaphore.h> #endif -void althrd_setname(const char *name); - namespace al { class semaphore { #ifdef _WIN32 using native_type = void*; -#elif defined(__APPLE__) +#elif defined(AL_APPLE_HAVE_DISPATCH) using native_type = dispatch_semaphore_t; #else using native_type = sem_t; @@ -45,4 +40,4 @@ public: } // namespace al -#endif /* AL_THREADS_H */ +#endif /* COMMON_ALSEM_H */ diff --git a/common/alspan.h b/common/alspan.h index 1d6cdfe5..341ce7c8 100644 --- a/common/alspan.h +++ b/common/alspan.h @@ -12,41 +12,12 @@ namespace al { -template<typename T> -constexpr auto size(const T &cont) noexcept(noexcept(cont.size())) -> decltype(cont.size()) -{ return cont.size(); } - -template<typename T, size_t N> -constexpr size_t size(const T (&)[N]) noexcept -{ return N; } - - -template<typename T> -constexpr auto data(T &cont) noexcept(noexcept(cont.data())) -> decltype(cont.data()) -{ return cont.data(); } - -template<typename T> -constexpr auto data(const T &cont) noexcept(noexcept(cont.data())) -> decltype(cont.data()) -{ return cont.data(); } - -template<typename T, size_t N> -constexpr T* data(T (&arr)[N]) noexcept -{ return arr; } - -template<typename T> -constexpr const T* data(std::initializer_list<T> list) noexcept -{ return list.begin(); } - - constexpr size_t dynamic_extent{static_cast<size_t>(-1)}; template<typename T, size_t E=dynamic_extent> class span; namespace detail_ { - template<typename... 
Ts> - using void_t = void; - template<typename T> struct is_span_ : std::false_type { }; template<typename T, size_t E> @@ -65,16 +36,19 @@ namespace detail_ { constexpr bool has_size_and_data = false; template<typename T> constexpr bool has_size_and_data<T, - void_t<decltype(al::size(std::declval<T>())), decltype(al::data(std::declval<T>()))>> + std::void_t<decltype(std::size(std::declval<T>())),decltype(std::data(std::declval<T>()))>> = true; + template<typename C> + constexpr bool is_valid_container_type = !is_span_v<C> && !is_std_array_v<C> + && !std::is_array<C>::value && has_size_and_data<C>; + template<typename T, typename U> constexpr bool is_array_compatible = std::is_convertible<T(*)[],U(*)[]>::value; template<typename C, typename T> - constexpr bool is_valid_container = !is_span_v<C> && !is_std_array_v<C> - && !std::is_array<C>::value && has_size_and_data<C> - && is_array_compatible<std::remove_pointer_t<decltype(al::data(std::declval<C&>()))>,T>; + constexpr bool is_valid_container = is_valid_container_type<C> + && is_array_compatible<std::remove_pointer_t<decltype(std::data(std::declval<C&>()))>,T>; } // namespace detail_ #define REQUIRES(...) std::enable_if_t<(__VA_ARGS__),bool> = true @@ -102,30 +76,33 @@ public: template<bool is0=(extent == 0), REQUIRES(is0)> constexpr span() noexcept { } template<typename U> - constexpr explicit span(U iter, index_type) : mData{to_address(iter)} { } + constexpr explicit span(U iter, index_type) : mData{::al::to_address(iter)} { } template<typename U, typename V, REQUIRES(!std::is_convertible<V,size_t>::value)> - constexpr explicit span(U first, V) : mData{to_address(first)} { } + constexpr explicit span(U first, V) : mData{::al::to_address(first)} + {} constexpr span(type_identity_t<element_type> (&arr)[E]) noexcept - : span{al::data(arr), al::size(arr)} + : span{std::data(arr), std::size(arr)} + { } + constexpr span(std::array<value_type,E> &arr) noexcept + : span{std::data(arr), std::size(arr)} { } - constexpr span(std::array<value_type,E> &arr) noexcept : span{al::data(arr), al::size(arr)} { } template<typename U=T, REQUIRES(std::is_const<U>::value)> constexpr span(const std::array<value_type,E> &arr) noexcept - : span{al::data(arr), al::size(arr)} + : span{std::data(arr), std::size(arr)} { } template<typename U, REQUIRES(detail_::is_valid_container<U, element_type>)> - constexpr explicit span(U&& cont) : span{al::data(cont), al::size(cont)} { } + constexpr explicit span(U&& cont) : span{std::data(cont), std::size(cont)} { } template<typename U, index_type N, REQUIRES(!std::is_same<element_type,U>::value && detail_::is_array_compatible<U,element_type> && N == dynamic_extent)> constexpr explicit span(const span<U,N> &span_) noexcept - : span{al::data(span_), al::size(span_)} + : span{std::data(span_), std::size(span_)} { } template<typename U, index_type N, REQUIRES(!std::is_same<element_type,U>::value && detail_::is_array_compatible<U,element_type> && N == extent)> - constexpr span(const span<U,N> &span_) noexcept : span{al::data(span_), al::size(span_)} { } + constexpr span(const span<U,N> &span_) noexcept : span{std::data(span_), std::size(span_)} { } constexpr span(const span&) noexcept = default; constexpr span& operator=(const span &rhs) noexcept = default; @@ -215,30 +192,31 @@ public: constexpr span() noexcept = default; template<typename U> - constexpr span(U iter, index_type count) - : mData{to_address(iter)}, mDataEnd{to_address(iter)+count} + constexpr span(U iter, index_type count) : mData{::al::to_address(iter)}, 
mDataEnd{::al::to_address(iter) + count} { } template<typename U, typename V, REQUIRES(!std::is_convertible<V,size_t>::value)> - constexpr span(U first, V last) : span{to_address(first), static_cast<size_t>(last-first)} + constexpr span(U first, V last) : span{::al::to_address(first), static_cast<size_t>(last - first)} { } template<size_t N> constexpr span(type_identity_t<element_type> (&arr)[N]) noexcept - : span{al::data(arr), al::size(arr)} + : span{std::data(arr), std::size(arr)} { } template<size_t N> - constexpr span(std::array<value_type,N> &arr) noexcept : span{al::data(arr), al::size(arr)} { } + constexpr span(std::array<value_type,N> &arr) noexcept + : span{std::data(arr), std::size(arr)} + { } template<size_t N, typename U=T, REQUIRES(std::is_const<U>::value)> constexpr span(const std::array<value_type,N> &arr) noexcept - : span{al::data(arr), al::size(arr)} + : span{std::data(arr), std::size(arr)} { } template<typename U, REQUIRES(detail_::is_valid_container<U, element_type>)> - constexpr span(U&& cont) : span{al::data(cont), al::size(cont)} { } + constexpr span(U&& cont) : span{std::data(cont), std::size(cont)} { } template<typename U, size_t N, REQUIRES((!std::is_same<element_type,U>::value || extent != N) && detail_::is_array_compatible<U,element_type>)> - constexpr span(const span<U,N> &span_) noexcept : span{al::data(span_), al::size(span_)} { } + constexpr span(const span<U,N> &span_) noexcept : span{std::data(span_), std::size(span_)} { } constexpr span(const span&) noexcept = default; constexpr span& operator=(const span &rhs) noexcept = default; @@ -322,30 +300,21 @@ constexpr inline auto span<T,E>::subspan(size_t offset, size_t count) const span<element_type>{mData+offset, mData+offset+count}; } -/* Helpers to deal with the lack of user-defined deduction guides (C++17). 
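 *
 * A usage sketch (not from the original comment): the deduction guides
 * added below replace these helpers, so call sites can construct spans
 * directly instead of going through al::as_span(), e.g.
 *
 *   std::array<float,4> arr{};
 *   al::span s1{arr};            // deduces al::span<float,4>
 *   al::span s2{arr.data(), 4};  // deduces al::span<float>
 *
 * assuming C++17 class template argument deduction.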
*/ -template<typename T, typename U> -constexpr auto as_span(T ptr, U count_or_end) -{ - using value_type = typename std::pointer_traits<T>::element_type; - return span<value_type>{ptr, count_or_end}; -} -template<typename T, size_t N> -constexpr auto as_span(T (&arr)[N]) noexcept { return span<T,N>{al::data(arr), al::size(arr)}; } -template<typename T, size_t N> -constexpr auto as_span(std::array<T,N> &arr) noexcept -{ return span<T,N>{al::data(arr), al::size(arr)}; } -template<typename T, size_t N> -constexpr auto as_span(const std::array<T,N> &arr) noexcept -{ return span<std::add_const_t<T>,N>{al::data(arr), al::size(arr)}; } -template<typename U, REQUIRES(!detail_::is_span_v<U> && !detail_::is_std_array_v<U> - && !std::is_array<U>::value && detail_::has_size_and_data<U>)> -constexpr auto as_span(U&& cont) -{ - using value_type = std::remove_pointer_t<decltype(al::data(std::declval<U&>()))>; - return span<value_type>{al::data(cont), al::size(cont)}; -} -template<typename T, size_t N> -constexpr auto as_span(span<T,N> span_) noexcept { return span_; } + +template<typename T, typename EndOrSize> +span(T, EndOrSize) -> span<std::remove_reference_t<decltype(*std::declval<T&>())>>; + +template<typename T, std::size_t N> +span(T (&)[N]) -> span<T, N>; + +template<typename T, std::size_t N> +span(std::array<T, N>&) -> span<T, N>; + +template<typename T, std::size_t N> +span(const std::array<T, N>&) -> span<const T, N>; + +template<typename C, REQUIRES(detail_::is_valid_container_type<C>)> +span(C&&) -> span<std::remove_pointer_t<decltype(std::data(std::declval<C&>()))>>; #undef REQUIRES diff --git a/common/althrd_setname.cpp b/common/althrd_setname.cpp new file mode 100644 index 00000000..22d33092 --- /dev/null +++ b/common/althrd_setname.cpp @@ -0,0 +1,76 @@ + +#include "config.h" + +#include "althrd_setname.h" + + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +void althrd_setname(const char *name [[maybe_unused]]) +{ +#if defined(_MSC_VER) && !defined(_M_ARM) + +#define MS_VC_EXCEPTION 0x406D1388 +#pragma pack(push,8) + struct { + DWORD dwType; // Must be 0x1000. + LPCSTR szName; // Pointer to name (in user addr space). + DWORD dwThreadID; // Thread ID (-1=caller thread). + DWORD dwFlags; // Reserved for future use, must be zero. + } info; +#pragma pack(pop) + info.dwType = 0x1000; + info.szName = name; + info.dwThreadID = ~DWORD{0}; + info.dwFlags = 0; + + /* FIXME: How to do this on MinGW? 
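 *
 * As an aside (an assumption, not from the original change): on Windows 10
 * 1607 and later, SetThreadDescription() can name the thread without the
 * exception trick, e.g.
 *
 *   SetThreadDescription(GetCurrentThread(), L"mixer");
 *
 * though it would need to be loaded dynamically to keep older Windows
 * versions working.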
*/ + __try { + RaiseException(MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info); + } + __except(EXCEPTION_CONTINUE_EXECUTION) { + } +#undef MS_VC_EXCEPTION +#endif +} + +#else + +#include <pthread.h> +#ifdef HAVE_PTHREAD_NP_H +#include <pthread_np.h> +#endif + +namespace { + +using setname_t1 = int(*)(const char*); +using setname_t2 = int(*)(pthread_t, const char*); +using setname_t3 = void(*)(pthread_t, const char*); +using setname_t4 = int(*)(pthread_t, const char*, void*); + +[[maybe_unused]] void setname_caller(setname_t1 func, const char *name) +{ func(name); } + +[[maybe_unused]] void setname_caller(setname_t2 func, const char *name) +{ func(pthread_self(), name); } + +[[maybe_unused]] void setname_caller(setname_t3 func, const char *name) +{ func(pthread_self(), name); } + +[[maybe_unused]] void setname_caller(setname_t4 func, const char *name) +{ func(pthread_self(), "%s", static_cast<void*>(const_cast<char*>(name))); } + +} // namespace + +void althrd_setname(const char *name [[maybe_unused]]) +{ +#if defined(HAVE_PTHREAD_SET_NAME_NP) + setname_caller(pthread_set_name_np, name); +#elif defined(HAVE_PTHREAD_SETNAME_NP) + setname_caller(pthread_setname_np, name); +#endif +} + +#endif diff --git a/common/althrd_setname.h b/common/althrd_setname.h new file mode 100644 index 00000000..0e22c0a9 --- /dev/null +++ b/common/althrd_setname.h @@ -0,0 +1,6 @@ +#ifndef COMMON_ALTHRD_SETNAME_H +#define COMMON_ALTHRD_SETNAME_H + +void althrd_setname(const char *name); + +#endif /* COMMON_ALTHRD_SETNAME_H */ diff --git a/common/comptr.h b/common/comptr.h index cdc6dec0..9f8fd294 100644 --- a/common/comptr.h +++ b/common/comptr.h @@ -2,49 +2,84 @@ #define COMMON_COMPTR_H #include <cstddef> +#include <memory> +#include <type_traits> #include <utility> +#include <variant> -#include "opthelpers.h" +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <objbase.h> + +struct ComWrapper { + HRESULT mStatus{}; + + ComWrapper(void *reserved, DWORD coinit) + : mStatus{CoInitializeEx(reserved, coinit)} + { } + ComWrapper(DWORD coinit=COINIT_APARTMENTTHREADED) + : mStatus{CoInitializeEx(nullptr, coinit)} + { } + ComWrapper(ComWrapper&& rhs) { mStatus = std::exchange(rhs.mStatus, E_FAIL); } + ComWrapper(const ComWrapper&) = delete; + ~ComWrapper() { if(SUCCEEDED(mStatus)) CoUninitialize(); } + + ComWrapper& operator=(ComWrapper&& rhs) + { + if(SUCCEEDED(mStatus)) + CoUninitialize(); + mStatus = std::exchange(rhs.mStatus, E_FAIL); + return *this; + } + ComWrapper& operator=(const ComWrapper&) = delete; + + HRESULT status() const noexcept { return mStatus; } + explicit operator bool() const noexcept { return SUCCEEDED(status()); } + + void uninit() + { + if(SUCCEEDED(mStatus)) + CoUninitialize(); + mStatus = E_FAIL; + } +}; template<typename T> -class ComPtr { - T *mPtr{nullptr}; +struct ComPtr { + using element_type = T; + + static constexpr bool RefIsNoexcept{noexcept(std::declval<T&>().AddRef()) + && noexcept(std::declval<T&>().Release())}; -public: ComPtr() noexcept = default; - ComPtr(const ComPtr &rhs) : mPtr{rhs.mPtr} { if(mPtr) mPtr->AddRef(); } + ComPtr(const ComPtr &rhs) noexcept(RefIsNoexcept) : mPtr{rhs.mPtr} + { if(mPtr) mPtr->AddRef(); } ComPtr(ComPtr&& rhs) noexcept : mPtr{rhs.mPtr} { rhs.mPtr = nullptr; } ComPtr(std::nullptr_t) noexcept { } explicit ComPtr(T *ptr) noexcept : mPtr{ptr} { } ~ComPtr() { if(mPtr) mPtr->Release(); } - ComPtr& operator=(const ComPtr &rhs) + ComPtr& operator=(const ComPtr &rhs) noexcept(RefIsNoexcept) { - if(!rhs.mPtr) + if 
constexpr(RefIsNoexcept) { - if(mPtr) - mPtr->Release(); - mPtr = nullptr; + if(rhs.mPtr) rhs.mPtr->AddRef(); + if(mPtr) mPtr->Release(); + mPtr = rhs.mPtr; + return *this; } else { - rhs.mPtr->AddRef(); - try { - if(mPtr) - mPtr->Release(); - mPtr = rhs.mPtr; - } - catch(...) { - rhs.mPtr->Release(); - throw; - } + ComPtr tmp{rhs}; + if(mPtr) mPtr->Release(); + mPtr = tmp.release(); + return *this; } - return *this; } - ComPtr& operator=(ComPtr&& rhs) + ComPtr& operator=(ComPtr&& rhs) noexcept(RefIsNoexcept) { - if(&rhs != this) LIKELY + if(&rhs != this) { if(mPtr) mPtr->Release(); mPtr = std::exchange(rhs.mPtr, nullptr); @@ -52,17 +87,63 @@ public: return *this; } + void reset(T *ptr=nullptr) noexcept(RefIsNoexcept) + { + if(mPtr) mPtr->Release(); + mPtr = ptr; + } + explicit operator bool() const noexcept { return mPtr != nullptr; } T& operator*() const noexcept { return *mPtr; } T* operator->() const noexcept { return mPtr; } T* get() const noexcept { return mPtr; } - T** getPtr() noexcept { return &mPtr; } T* release() noexcept { return std::exchange(mPtr, nullptr); } void swap(ComPtr &rhs) noexcept { std::swap(mPtr, rhs.mPtr); } void swap(ComPtr&& rhs) noexcept { std::swap(mPtr, rhs.mPtr); } + +private: + T *mPtr{nullptr}; +}; + + +namespace al { + +template<typename SP, typename PT, typename ...Args> +class out_ptr_t { + static_assert(!std::is_same_v<PT,void*>); + + SP &mRes; + std::variant<PT,void*> mPtr{}; + +public: + out_ptr_t(SP &res) : mRes{res} { } + ~out_ptr_t() + { + auto set_res = [this](auto &ptr) + { mRes.reset(static_cast<PT>(ptr)); }; + std::visit(set_res, mPtr); + } + out_ptr_t(const out_ptr_t&) = delete; + + out_ptr_t& operator=(const out_ptr_t&) = delete; + + operator PT*() noexcept + { return &std::get<PT>(mPtr); } + + operator void**() noexcept + { return &mPtr.template emplace<void*>(); } }; +template<typename T=void, typename SP, typename ...Args> +auto out_ptr(SP &res) +{ + using ptype = typename SP::element_type*; + return out_ptr_t<SP,ptype>{res}; +} + +} // namespace al + #endif diff --git a/common/dynload.cpp b/common/dynload.cpp index f1c2a7eb..86c36e00 100644 --- a/common/dynload.cpp +++ b/common/dynload.cpp @@ -3,6 +3,7 @@ #include "dynload.h" +#include "albit.h" #include "strutils.h" #ifdef _WIN32 @@ -17,7 +18,7 @@ void *LoadLib(const char *name) void CloseLib(void *handle) { FreeLibrary(static_cast<HMODULE>(handle)); } void *GetSymbol(void *handle, const char *name) -{ return reinterpret_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name)); } +{ return al::bit_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name)); } #elif defined(HAVE_DLFCN_H) diff --git a/common/opthelpers.h b/common/opthelpers.h index 596c2455..dc43ccdb 100644 --- a/common/opthelpers.h +++ b/common/opthelpers.h @@ -19,10 +19,13 @@ #ifdef __GNUC__ #define force_inline [[gnu::always_inline]] inline +#define NOINLINE [[gnu::noinline]] #elif defined(_MSC_VER) #define force_inline __forceinline +#define NOINLINE __declspec(noinline) #else #define force_inline inline +#define NOINLINE #endif /* Unlike the likely attribute, ASSUME requires the condition to be true or diff --git a/common/pffft.cpp b/common/pffft.cpp new file mode 100644 index 00000000..71f71fa6 --- /dev/null +++ b/common/pffft.cpp @@ -0,0 +1,2259 @@ +//$ nobt + +/* Copyright (c) 2013 Julien Pommier ( [email protected] ) + * Copyright (c) 2023 Christopher Robinson + * + * Based on original fortran 77 code from FFTPACKv4 from NETLIB + * (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + * 
of NCAR, in 1985. + * + * As confirmed by the NCAR fftpack software curators, the following + * FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + * released under the same terms. + * + * FFTPACK license: + * + * http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + * + * Copyright (c) 2004 the University Corporation for Atmospheric + * Research ("UCAR"). All rights reserved. Developed by NCAR's + * Computational and Information Systems Laboratory, UCAR, + * www.cisl.ucar.edu. + * + * Redistribution and use of the Software in source and binary forms, + * with or without modification, is permitted provided that the + * following conditions are met: + * + * - Neither the names of NCAR's Computational and Information Systems + * Laboratory, the University Corporation for Atmospheric Research, + * nor the names of its sponsors or contributors may be used to + * endorse or promote products derived from this Software without + * specific prior written permission. + * + * - Redistributions of source code must retain the above copyright + * notices, this list of conditions, and the disclaimer below. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the disclaimer below in the + * documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + * SOFTWARE. + * + * + * PFFFT : a Pretty Fast FFT. + * + * This file is largerly based on the original FFTPACK implementation, modified + * in order to take advantage of SIMD instructions of modern CPUs. + */ + +#include "pffft.h" + +#include <array> +#include <assert.h> +#include <cmath> +#include <cstring> +#include <stdio.h> +#include <stdlib.h> +#include <vector> + +#include "albit.h" +#include "almalloc.h" +#include "alnumbers.h" +#include "alspan.h" +#include "opthelpers.h" + + +namespace { + +using uint = unsigned int; + + +/* Vector support macros: the rest of the code is independent of + * SSE/Altivec/NEON -- adding support for other platforms with 4-element + * vectors should be limited to these macros + */ + +/* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */ +//#define PFFFT_SIMD_DISABLE + +#ifndef PFFFT_SIMD_DISABLE +/* + * Altivec support macros + */ +#if defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__) +typedef vector float v4sf; +#define SIMD_SZ 4 +#define VZERO() ((vector float) vec_splat_u8(0)) +#define VMUL(a,b) vec_madd(a,b, VZERO()) +#define VADD vec_add +#define VMADD vec_madd +#define VSUB vec_sub +#define LD_PS1 vec_splats +force_inline v4sf vset4(float a, float b, float c, float d) noexcept +{ + /* There a more efficient way to do this? 
*/ + alignas(16) std::array<float,4> vals{{a, b, c, d}}; + return vec_ld(0, vals.data()); +} +#define VSET4 vset4 +#define VINSERT0(v, a) vec_insert((a), (v), 0) +#define VEXTRACT0(v) vec_extract((v), 0) + +force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{vec_mergeh(in1, in2)}; + out2 = vec_mergel(in1, in2); + out1 = tmp; +} +force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{vec_perm(in1, in2, (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27))}; + out2 = vec_perm(in1, in2, (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31)); + out1 = tmp; +} + +force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ + v4sf y0{vec_mergeh(x0, x2)}; + v4sf y1{vec_mergel(x0, x2)}; + v4sf y2{vec_mergeh(x1, x3)}; + v4sf y3{vec_mergel(x1, x3)}; + x0 = vec_mergeh(y0, y2); + x1 = vec_mergel(y0, y2); + x2 = vec_mergeh(y1, y3); + x3 = vec_mergel(y1, y3); +} + +#define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) + +/* + * SSE1 support macros + */ +#elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + +#include <xmmintrin.h> +typedef __m128 v4sf; +#define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. +#define VZERO _mm_setzero_ps +#define VMUL _mm_mul_ps +#define VADD _mm_add_ps +#define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) +#define VSUB _mm_sub_ps +#define LD_PS1 _mm_set1_ps +#define VSET4 _mm_setr_ps +#define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a)) +#define VEXTRACT0 _mm_cvtss_f32 + +force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{_mm_unpacklo_ps(in1, in2)}; + out2 = _mm_unpackhi_ps(in1, in2); + out1 = tmp; +} +force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{_mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0))}; + out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); + out1 = tmp; +} + +force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ _MM_TRANSPOSE4_PS(x0, x1, x2, x3); } + +#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) + +/* + * ARM NEON support macros + */ +#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) + +#include <arm_neon.h> +typedef float32x4_t v4sf; +#define SIMD_SZ 4 +#define VZERO() vdupq_n_f32(0) +#define VMUL vmulq_f32 +#define VADD vaddq_f32 +#define VMADD(a,b,c) vmlaq_f32(c,a,b) +#define VSUB vsubq_f32 +#define LD_PS1 vdupq_n_f32 +force_inline v4sf vset4(float a, float b, float c, float d) noexcept +{ + float32x4_t ret{vmovq_n_f32(a)}; + ret = vsetq_lane_f32(b, ret, 1); + ret = vsetq_lane_f32(c, ret, 2); + ret = vsetq_lane_f32(d, ret, 3); + return ret; +} +#define VSET4 vset4 +#define VINSERT0(v, a) vsetq_lane_f32((a), (v), 0) +#define VEXTRACT0(v) vgetq_lane_f32((v), 0) + +force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + float32x4x2_t tmp{vzipq_f32(in1, in2)}; + out1 = tmp.val[0]; + out2 = tmp.val[1]; +} +force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + float32x4x2_t tmp{vuzpq_f32(in1, in2)}; + out1 = tmp.val[0]; + out2 = tmp.val[1]; +} + +force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ + /* marginally 
faster version: + * asm("vtrn.32 %q0, %q1;\n" + * "vtrn.32 %q2, %q3\n + * "vswp %f0, %e2\n + * "vswp %f1, %e3" + * : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); + */ + float32x4x2_t t0_{vzipq_f32(x0, x2)}; + float32x4x2_t t1_{vzipq_f32(x1, x3)}; + float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])}; + float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])}; + x0 = u0_.val[0]; + x1 = u0_.val[1]; + x2 = u1_.val[0]; + x3 = u1_.val[1]; +} + +#define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) + +/* + * Generic GCC vector macros + */ +#elif defined(__GNUC__) + +using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; +#define SIMD_SZ 4 +#define VZERO() v4sf{0,0,0,0} +#define VMUL(a,b) ((a) * (b)) +#define VADD(a,b) ((a) + (b)) +#define VMADD(a,b,c) ((a)*(b) + (c)) +#define VSUB(a,b) ((a) - (b)) + +constexpr force_inline v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } +#define LD_PS1 ld_ps1 +#define VSET4(a, b, c, d) v4sf{(a), (b), (c), (d)} +constexpr force_inline v4sf vinsert0(v4sf v, float a) noexcept +{ return v4sf{a, v[1], v[2], v[3]}; } +#define VINSERT0 vinsert0 +#define VEXTRACT0(v) ((v)[0]) + +force_inline v4sf unpacklo(v4sf a, v4sf b) noexcept +{ return v4sf{a[0], b[0], a[1], b[1]}; } +force_inline v4sf unpackhi(v4sf a, v4sf b) noexcept +{ return v4sf{a[2], b[2], a[3], b[3]}; } + +force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{unpacklo(in1, in2)}; + out2 = unpackhi(in1, in2); + out1 = tmp; +} +force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{in1[0], in1[2], in2[0], in2[2]}; + out2 = v4sf{in1[1], in1[3], in2[1], in2[3]}; + out1 = tmp; +} + +force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ + v4sf tmp0{unpacklo(x0, x1)}; + v4sf tmp2{unpacklo(x2, x3)}; + v4sf tmp1{unpackhi(x0, x1)}; + v4sf tmp3{unpackhi(x2, x3)}; + x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]}; + x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]}; + x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]}; + x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]}; +} + +force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept +{ return v4sf{b[0], b[1], a[2], a[3]}; } +#define VSWAPHL vswaphl + +#else + +#warning "building with simd disabled !\n"; +#define PFFFT_SIMD_DISABLE // fallback to scalar code +#endif + +#endif /* PFFFT_SIMD_DISABLE */ + +// fallback mode for situations where SIMD is not available, use scalar mode instead +#ifdef PFFFT_SIMD_DISABLE +typedef float v4sf; +#define SIMD_SZ 1 +#define VZERO() 0.f +#define VMUL(a,b) ((a)*(b)) +#define VADD(a,b) ((a)+(b)) +#define VMADD(a,b,c) ((a)*(b)+(c)) +#define VSUB(a,b) ((a)-(b)) +#define LD_PS1(p) (p) +#endif + +inline bool valigned(const float *ptr) noexcept +{ + static constexpr uintptr_t alignmask{SIMD_SZ*4 - 1}; + return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0; +} + +// shortcuts for complex multiplications +force_inline void vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept +{ + v4sf tmp{VMUL(ar, bi)}; + ar = VSUB(VMUL(ar, br), VMUL(ai, bi)); + ai = VMADD(ai, br, tmp); +} +force_inline void vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept +{ + v4sf tmp{VMUL(ar, bi)}; + ar = VMADD(ai, bi, VMUL(ar, br)); + ai = VSUB(VMUL(ai, br), tmp); +} + +#if !defined(PFFFT_SIMD_DISABLE) + +#define assertv4(v,f0,f1,f2,f3) assert(v##_f[0] == (f0) && v##_f[1] == (f1) && v##_f[2] == (f2) && v##_f[3] == (f3)) + +/* detect bugs with the vector support macros */ +[[maybe_unused]] void validate_pffft_simd() +{ + using float4 
= std::array<float,4>; + static constexpr float f[16]{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + + float4 a0_f, a1_f, a2_f, a3_f, t_f, u_f; + v4sf a0_v, a1_v, a2_v, a3_v, t_v, u_v; + std::memcpy(&a0_v, f, 4*sizeof(float)); + std::memcpy(&a1_v, f+4, 4*sizeof(float)); + std::memcpy(&a2_v, f+8, 4*sizeof(float)); + std::memcpy(&a3_v, f+12, 4*sizeof(float)); + + t_v = VZERO(); t_f = al::bit_cast<float4>(t_v); + printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 0, 0, 0, 0); + t_v = VADD(a1_v, a2_v); t_f = al::bit_cast<float4>(t_v); + printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 12, 14, 16, 18); + t_v = VMUL(a1_v, a2_v); t_f = al::bit_cast<float4>(t_v); + printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 32, 45, 60, 77); + t_v = VMADD(a1_v, a2_v,a0_v); t_f = al::bit_cast<float4>(t_v); + printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 32, 46, 62, 80); + + interleave2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v); + printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]); + assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); + uninterleave2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v); + printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]); + assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); + + t_v=LD_PS1(f[15]); t_f = al::bit_cast<float4>(t_v); + printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); + assertv4(t, 15, 15, 15, 15); + t_v = VSWAPHL(a1_v, a2_v); t_f = al::bit_cast<float4>(t_v); + printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); + assertv4(t, 8, 9, 6, 7); + vtranspose4(a0_v, a1_v, a2_v, a3_v); + a0_f = al::bit_cast<float4>(a0_v); + a1_f = al::bit_cast<float4>(a1_v); + a2_f = al::bit_cast<float4>(a2_v); + a3_f = al::bit_cast<float4>(a3_v); + printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + a0_f[0], a0_f[1], a0_f[2], a0_f[3], a1_f[0], a1_f[1], a1_f[2], a1_f[3], + a2_f[0], a2_f[1], a2_f[2], a2_f[3], a3_f[0], a3_f[1], a3_f[2], a3_f[3]); + assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); +} +#endif //!PFFFT_SIMD_DISABLE + +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... 
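
[Editorial note, not part of the patch.] The vcplxmul/vcplxmulconj helpers above are used by every radix pass that follows; each performs an element-wise complex multiply across the SIMD lanes, with vcplxmulconj conjugating the second operand. As a reference for what a single lane computes, here is a minimal scalar sketch using std::complex (the _ref names are hypothetical and only for illustration):

    #include <complex>

    // Scalar reference for one SIMD lane of vcplxmul: (ar,ai) *= (br,bi).
    inline void cplxmul_ref(float &ar, float &ai, float br, float bi) noexcept
    {
        const std::complex<float> c{std::complex<float>{ar, ai} * std::complex<float>{br, bi}};
        ar = c.real();
        ai = c.imag();
    }

    // Scalar reference for one SIMD lane of vcplxmulconj: (ar,ai) *= conj(br,bi).
    inline void cplxmulconj_ref(float &ar, float &ai, float br, float bi) noexcept
    {
        const std::complex<float> c{std::complex<float>{ar, ai}
            * std::conj(std::complex<float>{br, bi})};
        ar = c.real();
        ai = c.imag();
    }

The SIMD versions expand to the same four multiplies and two adds per lane, applied to all SIMD_SZ lanes at once.
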
+ +/* + passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 +*/ +NOINLINE void passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, + const float *wa1, const float fsign) +{ + const size_t l1ido{l1*ido}; + if(ido <= 2) + { + for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) + { + ch[0] = VADD(cc[0], cc[ido+0]); + ch[l1ido] = VSUB(cc[0], cc[ido+0]); + ch[1] = VADD(cc[1], cc[ido+1]); + ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); + } + } + else + { + for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) + { + for(size_t i{0};i < ido-1;i += 2) + { + v4sf tr2{VSUB(cc[i+0], cc[i+ido+0])}; + v4sf ti2{VSUB(cc[i+1], cc[i+ido+1])}; + v4sf wr{LD_PS1(wa1[i])}, wi{LD_PS1(wa1[i+1]*fsign)}; + ch[i] = VADD(cc[i+0], cc[i+ido+0]); + ch[i+1] = VADD(cc[i+1], cc[i+ido+1]); + vcplxmul(tr2, ti2, wr, wi); + ch[i+l1ido] = tr2; + ch[i+l1ido+1] = ti2; + } + } + } +} + +/* + passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 +*/ +NOINLINE void passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2, const float fsign) +{ + assert(ido > 2); + + const v4sf taur{LD_PS1(-0.5f)}; + const v4sf taui{LD_PS1(0.866025403784439f*fsign)}; + const size_t l1ido{l1*ido}; + for(size_t k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido) + { + for(size_t i{0};i < ido-1;i += 2) + { + v4sf tr2{VADD(cc[i+ido], cc[i+2*ido])}; + v4sf cr2{VMADD(taur, tr2, cc[i])}; + ch[i] = VADD(tr2, cc[i]); + v4sf ti2{VADD(cc[i+ido+1], cc[i+2*ido+1])}; + v4sf ci2{VMADD(taur, ti2, cc[i+1])}; + ch[i+1] = VADD(cc[i+1], ti2); + v4sf cr3{VMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]))}; + v4sf ci3{VMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]))}; + v4sf dr2{VSUB(cr2, ci3)}; + v4sf dr3{VADD(cr2, ci3)}; + v4sf di2{VADD(ci2, cr3)}; + v4sf di3{VSUB(ci2, cr3)}; + float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; + vcplxmul(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch[i+l1ido] = dr2; + ch[i+l1ido + 1] = di2; + vcplxmul(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch[i+2*l1ido] = dr3; + ch[i+2*l1ido+1] = di3; + } + } +} /* passf3 */ + +NOINLINE void passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2, const float *wa3, const float fsign) +{ + /* fsign == -1 for forward transform and +1 for backward transform */ + const v4sf vsign{LD_PS1(fsign)}; + const size_t l1ido{l1*ido}; + if(ido == 2) + { + for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido) + { + v4sf tr1{VSUB(cc[0], cc[2*ido + 0])}; + v4sf tr2{VADD(cc[0], cc[2*ido + 0])}; + v4sf ti1{VSUB(cc[1], cc[2*ido + 1])}; + v4sf ti2{VADD(cc[1], cc[2*ido + 1])}; + v4sf ti4{VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), vsign)}; + v4sf tr4{VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), vsign)}; + v4sf tr3{VADD(cc[ido + 0], cc[3*ido + 0])}; + v4sf ti3{VADD(cc[ido + 1], cc[3*ido + 1])}; + + ch[0*l1ido + 0] = VADD(tr2, tr3); + ch[0*l1ido + 1] = VADD(ti2, ti3); + ch[1*l1ido + 0] = VADD(tr1, tr4); + ch[1*l1ido + 1] = VADD(ti1, ti4); + ch[2*l1ido + 0] = VSUB(tr2, tr3); + ch[2*l1ido + 1] = VSUB(ti2, ti3); + ch[3*l1ido + 0] = VSUB(tr1, tr4); + ch[3*l1ido + 1] = VSUB(ti1, ti4); + } + } + else + { + for(size_t k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido) + { + for(size_t i{0};i < ido-1;i+=2) + { + v4sf tr1{VSUB(cc[i + 0], cc[i + 2*ido + 0])}; + v4sf tr2{VADD(cc[i + 0], cc[i + 2*ido + 0])}; + v4sf ti1{VSUB(cc[i + 1], cc[i + 2*ido + 1])}; + v4sf ti2{VADD(cc[i + 1], cc[i + 2*ido + 1])}; + v4sf tr4{VMUL(VSUB(cc[i + 3*ido + 1], 
cc[i + 1*ido + 1]), vsign)}; + v4sf ti4{VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)}; + v4sf tr3{VADD(cc[i + ido + 0], cc[i + 3*ido + 0])}; + v4sf ti3{VADD(cc[i + ido + 1], cc[i + 3*ido + 1])}; + + ch[i] = VADD(tr2, tr3); + v4sf cr3{VSUB(tr2, tr3)}; + ch[i + 1] = VADD(ti2, ti3); + v4sf ci3{VSUB(ti2, ti3)}; + + v4sf cr2{VADD(tr1, tr4)}; + v4sf cr4{VSUB(tr1, tr4)}; + v4sf ci2{VADD(ti1, ti4)}; + v4sf ci4{VSUB(ti1, ti4)}; + float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}; + vcplxmul(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1)); + float wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; + ch[i + l1ido] = cr2; + ch[i + l1ido + 1] = ci2; + + vcplxmul(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2)); + float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}; + ch[i + 2*l1ido] = cr3; + ch[i + 2*l1ido + 1] = ci3; + + vcplxmul(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3)); + ch[i + 3*l1ido] = cr4; + ch[i + 3*l1ido + 1] = ci4; + } + } + } +} /* passf4 */ + +/* + * passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5 + */ +NOINLINE void passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2, const float *wa3, const float *wa4, const float fsign) +{ + const v4sf tr11{LD_PS1(0.309016994374947f)}; + const v4sf tr12{LD_PS1(-0.809016994374947f)}; + const v4sf ti11{LD_PS1(0.951056516295154f*fsign)}; + const v4sf ti12{LD_PS1(0.587785252292473f*fsign)}; + +#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + (a_1) + 1] +#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + (a_1) + 1] + + assert(ido > 2); + for(size_t k{0};k < l1;++k, cc += 5*ido, ch += ido) + { + for(size_t i{0};i < ido-1;i += 2) + { + v4sf ti5{VSUB(cc_ref(i , 2), cc_ref(i , 5))}; + v4sf ti2{VADD(cc_ref(i , 2), cc_ref(i , 5))}; + v4sf ti4{VSUB(cc_ref(i , 3), cc_ref(i , 4))}; + v4sf ti3{VADD(cc_ref(i , 3), cc_ref(i , 4))}; + v4sf tr5{VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5))}; + v4sf tr2{VADD(cc_ref(i-1, 2), cc_ref(i-1, 5))}; + v4sf tr4{VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4))}; + v4sf tr3{VADD(cc_ref(i-1, 3), cc_ref(i-1, 4))}; + ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); + ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); + v4sf cr2{VADD(cc_ref(i-1, 1), VMADD(tr11, tr2, VMUL(tr12, tr3)))}; + v4sf ci2{VADD(cc_ref(i , 1), VMADD(tr11, ti2, VMUL(tr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1), VMADD(tr12, tr2, VMUL(tr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1), VMADD(tr12, ti2, VMUL(tr11, ti3)))}; + v4sf cr5{VMADD(ti11, tr5, VMUL(ti12, tr4))}; + v4sf ci5{VMADD(ti11, ti5, VMUL(ti12, ti4))}; + v4sf cr4{VSUB(VMUL(ti12, tr5), VMUL(ti11, tr4))}; + v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; + v4sf dr3{VSUB(cr3, ci4)}; + v4sf dr4{VADD(cr3, ci4)}; + v4sf di3{VADD(ci3, cr4)}; + v4sf di4{VSUB(ci3, cr4)}; + v4sf dr5{VADD(cr2, ci5)}; + v4sf dr2{VSUB(cr2, ci5)}; + v4sf di5{VSUB(ci2, cr5)}; + v4sf di2{VADD(ci2, cr5)}; + float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; + float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]}; + vcplxmul(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + vcplxmul(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + vcplxmul(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + vcplxmul(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} + +NOINLINE void radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, + v4sf *RESTRICT ch, const float *wa1) +{ + const size_t l1ido{l1*ido}; + 
for(size_t k{0};k < l1ido;k += ido) + { + v4sf a{cc[k]}, b{cc[k + l1ido]}; + ch[2*k] = VADD(a, b); + ch[2*(k+ido)-1] = VSUB(a, b); + } + if(ido < 2) + return; + if(ido != 2) + { + for(size_t k{0};k < l1ido;k += ido) + { + for(size_t i{2};i < ido;i += 2) + { + v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]}; + v4sf br{cc[i - 1 + k]}, bi{cc[i + k]}; + vcplxmulconj(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); + ch[i + 2*k] = VADD(bi, ti2); + ch[2*(k+ido) - i] = VSUB(ti2, bi); + ch[i - 1 + 2*k] = VADD(br, tr2); + ch[2*(k+ido) - i -1] = VSUB(br, tr2); + } + } + if((ido&1) == 1) + return; + } + const v4sf minus_one{LD_PS1(-1.0f)}; + for(size_t k{0};k < l1ido;k += ido) + { + ch[2*k + ido] = VMUL(minus_one, cc[ido-1 + k + l1ido]); + ch[2*k + ido-1] = cc[k + ido-1]; + } +} /* radf2 */ + + +NOINLINE void radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, + const float *wa1) +{ + const size_t l1ido{l1*ido}; + for(size_t k{0};k < l1ido;k += ido) + { + v4sf a{cc[2*k]}; + v4sf b{cc[2*(k+ido) - 1]}; + ch[k] = VADD(a, b); + ch[k + l1ido] = VSUB(a, b); + } + if(ido < 2) + return; + if(ido != 2) + { + for(size_t k{0};k < l1ido;k += ido) + { + for(size_t i{2};i < ido;i += 2) + { + v4sf a{cc[i-1 + 2*k]}; + v4sf b{cc[2*(k + ido) - i - 1]}; + v4sf c{cc[i+0 + 2*k]}; + v4sf d{cc[2*(k + ido) - i + 0]}; + ch[i-1 + k] = VADD(a, b); + v4sf tr2{VSUB(a, b)}; + ch[i+0 + k] = VSUB(c, d); + v4sf ti2{VADD(c, d)}; + vcplxmul(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); + ch[i-1 + k + l1ido] = tr2; + ch[i+0 + k + l1ido] = ti2; + } + } + if((ido&1) == 1) + return; + } + const v4sf minus_two{LD_PS1(-2.0f)}; + for(size_t k{0};k < l1ido;k += ido) + { + v4sf a{cc[2*k + ido-1]}; + v4sf b{cc[2*k + ido]}; + ch[k + ido-1] = VADD(a,a); + ch[k + ido-1 + l1ido] = VMUL(minus_two, b); + } +} /* radb2 */ + +void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2) +{ + const v4sf taur{LD_PS1(-0.5f)}; + const v4sf taui{LD_PS1(0.866025403784439f)}; + for(size_t k{0};k < l1;++k) + { + v4sf cr2{VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])}; + ch[ (3*k )*ido] = VADD(cc[k*ido], cr2); + ch[ (3*k + 2)*ido] = VMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido])); + ch[ido-1 + (3*k + 1)*ido] = VMADD(taur, cr2, cc[k*ido]); + } + if(ido == 1) + return; + for(size_t k{0};k < l1;++k) + { + for(size_t i{2};i < ido;i += 2) + { + const size_t ic{ido - i}; + v4sf wr1{LD_PS1(wa1[i - 2])}; + v4sf wi1{LD_PS1(wa1[i - 1])}; + v4sf dr2{cc[i - 1 + (k + l1)*ido]}; + v4sf di2{cc[i + (k + l1)*ido]}; + vcplxmulconj(dr2, di2, wr1, wi1); + + v4sf wr2{LD_PS1(wa2[i - 2])}; + v4sf wi2{LD_PS1(wa2[i - 1])}; + v4sf dr3{cc[i - 1 + (k + l1*2)*ido]}; + v4sf di3{cc[i + (k + l1*2)*ido]}; + vcplxmulconj(dr3, di3, wr2, wi2); + + v4sf cr2{VADD(dr2, dr3)}; + v4sf ci2{VADD(di2, di3)}; + ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2); + ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2); + v4sf tr2{VMADD(taur, cr2, cc[i - 1 + k*ido])}; + v4sf ti2{VMADD(taur, ci2, cc[i + k*ido])}; + v4sf tr3{VMUL(taui, VSUB(di2, di3))}; + v4sf ti3{VMUL(taui, VSUB(dr3, dr2))}; + ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3); + ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3); + ch[i + (3*k + 2)*ido] = VADD(ti2, ti3); + ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2); + } + } +} /* radf3 */ + + +void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2) +{ + static constexpr float taur{-0.5f}; + static constexpr float 
taui{0.866025403784439f}; + static constexpr float taui_2{taui*2.0f}; + + const v4sf vtaur{LD_PS1(taur)}; + const v4sf vtaui_2{LD_PS1(taui_2)}; + for(size_t k{0};k < l1;++k) + { + v4sf tr2 = cc[ido-1 + (3*k + 1)*ido]; + tr2 = VADD(tr2,tr2); + v4sf cr2 = VMADD(vtaur, tr2, cc[3*k*ido]); + ch[k*ido] = VADD(cc[3*k*ido], tr2); + v4sf ci3 = VMUL(vtaui_2, cc[(3*k + 2)*ido]); + ch[(k + l1)*ido] = VSUB(cr2, ci3); + ch[(k + 2*l1)*ido] = VADD(cr2, ci3); + } + if(ido == 1) + return; + const v4sf vtaui{LD_PS1(taui)}; + for(size_t k{0};k < l1;++k) + { + for(size_t i{2};i < ido;i += 2) + { + const size_t ic{ido - i}; + v4sf tr2{VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])}; + v4sf cr2{VMADD(vtaur, tr2, cc[i - 1 + 3*k*ido])}; + ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2); + v4sf ti2{VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])}; + v4sf ci2{VMADD(vtaur, ti2, cc[i + 3*k*ido])}; + ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2); + v4sf cr3{VMUL(vtaui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))}; + v4sf ci3{VMUL(vtaui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))}; + v4sf dr2{VSUB(cr2, ci3)}; + v4sf dr3{VADD(cr2, ci3)}; + v4sf di2{VADD(ci2, cr3)}; + v4sf di3{VSUB(ci2, cr3)}; + vcplxmul(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); + ch[i - 1 + (k + l1)*ido] = dr2; + ch[i + (k + l1)*ido] = di2; + vcplxmul(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); + ch[i - 1 + (k + 2*l1)*ido] = dr3; + ch[i + (k + 2*l1)*ido] = di3; + } + } +} /* radb3 */ + +NOINLINE void radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, + v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, + const float *RESTRICT wa3) +{ + const size_t l1ido{l1*ido}; + { + const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido}; + v4sf *RESTRICT ch_{ch}; + while(cc != cc_end) + { + // this loop represents between 25% and 40% of total radf4_ps cost ! 
+ v4sf a0{cc[0]}, a1{cc[l1ido]}; + v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]}; + v4sf tr1{VADD(a1, a3)}; + v4sf tr2{VADD(a0, a2)}; + ch[2*ido-1] = VSUB(a0, a2); + ch[2*ido ] = VSUB(a3, a1); + ch[0 ] = VADD(tr1, tr2); + ch[4*ido-1] = VSUB(tr2, tr1); + cc += ido; ch += 4*ido; + } + cc = cc_; + ch = ch_; + } + if(ido < 2) + return; + if(ido != 2) + { + for(size_t k{0};k < l1ido;k += ido) + { + const v4sf *RESTRICT pc{cc + 1 + k}; + for(size_t i{2};i < ido;i += 2, pc += 2) + { + const size_t ic{ido - i}; + + v4sf cr2{pc[1*l1ido+0]}; + v4sf ci2{pc[1*l1ido+1]}; + v4sf wr{LD_PS1(wa1[i - 2])}; + v4sf wi{LD_PS1(wa1[i - 1])}; + vcplxmulconj(cr2,ci2,wr,wi); + + v4sf cr3{pc[2*l1ido+0]}; + v4sf ci3{pc[2*l1ido+1]}; + wr = LD_PS1(wa2[i-2]); + wi = LD_PS1(wa2[i-1]); + vcplxmulconj(cr3, ci3, wr, wi); + + v4sf cr4{pc[3*l1ido]}; + v4sf ci4{pc[3*l1ido+1]}; + wr = LD_PS1(wa3[i-2]); + wi = LD_PS1(wa3[i-1]); + vcplxmulconj(cr4, ci4, wr, wi); + + /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */ + + v4sf tr1{VADD(cr2,cr4)}; + v4sf tr4{VSUB(cr4,cr2)}; + v4sf tr2{VADD(pc[0],cr3)}; + v4sf tr3{VSUB(pc[0],cr3)}; + ch[i - 1 + 4*k ] = VADD(tr2,tr1); + ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed + v4sf ti1{VADD(ci2,ci4)}; + v4sf ti4{VSUB(ci2,ci4)}; + ch[i - 1 + 4*k + 2*ido] = VADD(tr3,ti4); + ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4 + v4sf ti2{VADD(pc[1],ci3)}; + v4sf ti3{VSUB(pc[1],ci3)}; + ch[i + 4*k ] = VADD(ti1, ti2); + ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2); + ch[i + 4*k + 2*ido] = VADD(tr4, ti3); + ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3); + } + } + if((ido&1) == 1) + return; + } + const v4sf minus_hsqt2{LD_PS1(al::numbers::sqrt2_v<float> * -0.5f)}; + for(size_t k{0};k < l1ido;k += ido) + { + v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]}; + v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]}; + v4sf ti1{VMUL(minus_hsqt2, VADD(b, a))}; + v4sf tr1{VMUL(minus_hsqt2, VSUB(b, a))}; + ch[ido-1 + 4*k ] = VADD(c, tr1); + ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1); + ch[ 4*k + 1*ido] = VSUB(ti1, d); + ch[ 4*k + 3*ido] = VADD(ti1, d); + } +} /* radf4 */ + + +NOINLINE void radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, + v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, + const float *RESTRICT wa3) +{ + const v4sf two{LD_PS1(2.0f)}; + const size_t l1ido{l1*ido}; + { + const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido}; + v4sf *ch_{ch}; + while(ch != ch_end) + { + v4sf a{cc[0]}, b{cc[4*ido-1]}; + v4sf c{cc[2*ido]}, d{cc[2*ido-1]}; + v4sf tr3{VMUL(two,d)}; + v4sf tr2{VADD(a,b)}; + v4sf tr1{VSUB(a,b)}; + v4sf tr4{VMUL(two,c)}; + ch[0*l1ido] = VADD(tr2, tr3); + ch[2*l1ido] = VSUB(tr2, tr3); + ch[1*l1ido] = VSUB(tr1, tr4); + ch[3*l1ido] = VADD(tr1, tr4); + + cc += 4*ido; ch += ido; + } + cc = cc_; ch = ch_; + } + if(ido < 2) + return; + if(ido != 2) + { + for(size_t k{0};k < l1ido;k += ido) + { + const v4sf *RESTRICT pc{cc - 1 + 4*k}; + v4sf *RESTRICT ph{ch + k + 1}; + for(size_t i{2};i < ido;i += 2) + { + v4sf tr1{VSUB(pc[ i], pc[4*ido - i])}; + v4sf tr2{VADD(pc[ i], pc[4*ido - i])}; + v4sf ti4{VSUB(pc[2*ido + i], pc[2*ido - i])}; + v4sf tr3{VADD(pc[2*ido + i], pc[2*ido - i])}; + ph[0] = VADD(tr2, tr3); + v4sf cr3{VSUB(tr2, tr3)}; + + v4sf ti3{VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1])}; + v4sf tr4{VADD(pc[2*ido + i + 1], pc[2*ido - i + 1])}; + v4sf cr2{VSUB(tr1, tr4)}; + v4sf cr4{VADD(tr1, tr4)}; + + v4sf ti1{VADD(pc[i + 1], pc[4*ido - i + 1])}; + v4sf ti2{VSUB(pc[i + 1], 
pc[4*ido - i + 1])}; + + ph[1] = VADD(ti2, ti3); ph += l1ido; + v4sf ci3{VSUB(ti2, ti3)}; + v4sf ci2{VADD(ti1, ti4)}; + v4sf ci4{VSUB(ti1, ti4)}; + vcplxmul(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); + ph[0] = cr2; + ph[1] = ci2; ph += l1ido; + vcplxmul(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); + ph[0] = cr3; + ph[1] = ci3; ph += l1ido; + vcplxmul(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1])); + ph[0] = cr4; + ph[1] = ci4; ph = ph - 3*l1ido + 2; + } + } + if((ido&1) == 1) + return; + } + const v4sf minus_sqrt2{LD_PS1(-1.414213562373095f)}; + for(size_t k{0};k < l1ido;k += ido) + { + const size_t i0{4*k + ido}; + v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]}; + v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]}; + v4sf tr1{VSUB(c,d)}; + v4sf tr2{VADD(c,d)}; + v4sf ti1{VADD(b,a)}; + v4sf ti2{VSUB(b,a)}; + ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2); + ch[ido-1 + k + 1*l1ido] = VMUL(minus_sqrt2, VSUB(ti1, tr1)); + ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2); + ch[ido-1 + k + 3*l1ido] = VMUL(minus_sqrt2, VADD(ti1, tr1)); + } +} /* radb4 */ + +void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2, const float *wa3, const float *wa4) +{ + const v4sf tr11{LD_PS1(0.309016994374947f)}; + const v4sf ti11{LD_PS1(0.951056516295154f)}; + const v4sf tr12{LD_PS1(-0.809016994374947f)}; + const v4sf ti12{LD_PS1(0.587785252292473f)}; + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch -= 1 + ido * 6; + cc -= 1 + ido * (1 + l1); + + /* Function Body */ + for(size_t k{1};k <= l1;++k) + { + v4sf cr2{VADD(cc_ref(1, k, 5), cc_ref(1, k, 2))}; + v4sf ci5{VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2))}; + v4sf cr3{VADD(cc_ref(1, k, 4), cc_ref(1, k, 3))}; + v4sf ci4{VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3))}; + ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3)); + ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VMADD(tr11, cr2, VMUL(tr12, cr3))); + ch_ref(1, 3, k) = VMADD(ti11, ci5, VMUL(ti12, ci4)); + ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VMADD(tr12, cr2, VMUL(tr11, cr3))); + ch_ref(1, 5, k) = VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4)); + //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); + } + if(ido == 1) + return; + + const size_t idp2{ido + 2}; + for(size_t k{1};k <= l1;++k) + { + for(size_t i{3};i <= ido;i += 2) + { + const size_t ic{idp2 - i}; + v4sf dr2{LD_PS1(wa1[i-3])}; + v4sf di2{LD_PS1(wa1[i-2])}; + v4sf dr3{LD_PS1(wa2[i-3])}; + v4sf di3{LD_PS1(wa2[i-2])}; + v4sf dr4{LD_PS1(wa3[i-3])}; + v4sf di4{LD_PS1(wa3[i-2])}; + v4sf dr5{LD_PS1(wa4[i-3])}; + v4sf di5{LD_PS1(wa4[i-2])}; + vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2)); + vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3)); + vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4)); + vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5)); + v4sf cr2{VADD(dr2, dr5)}; + v4sf ci5{VSUB(dr5, dr2)}; + v4sf cr5{VSUB(di2, di5)}; + v4sf ci2{VADD(di2, di5)}; + v4sf cr3{VADD(dr3, dr4)}; + v4sf ci4{VSUB(dr4, dr3)}; + v4sf cr4{VSUB(di3, di4)}; + v4sf ci3{VADD(di3, di4)}; + ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3)); + ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3)); + v4sf tr2{VADD(cc_ref(i - 1, k, 1), VMADD(tr11, cr2, VMUL(tr12, cr3)))}; + v4sf ti2{VSUB(cc_ref(i, k, 1), VMADD(tr11, ci2, VMUL(tr12, ci3)))}; + v4sf tr3{VADD(cc_ref(i - 1, k, 1), VMADD(tr12, cr2, VMUL(tr11, cr3)))}; + v4sf ti3{VSUB(cc_ref(i, k, 1), VMADD(tr12, ci2, 
VMUL(tr11, ci3)))}; + v4sf tr5{VMADD(ti11, cr5, VMUL(ti12, cr4))}; + v4sf ti5{VMADD(ti11, ci5, VMUL(ti12, ci4))}; + v4sf tr4{VSUB(VMUL(ti12, cr5), VMUL(ti11, cr4))}; + v4sf ti4{VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4))}; + ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); + ch_ref(ic - 1, 2, k) = VADD(tr2, tr5); + ch_ref(i , 3, k) = VADD(ti5, ti2); + ch_ref(ic , 2, k) = VSUB(ti5, ti2); + ch_ref(i - 1, 5, k) = VSUB(tr3, tr4); + ch_ref(ic - 1, 4, k) = VADD(tr3, tr4); + ch_ref(i , 5, k) = VADD(ti4, ti3); + ch_ref(ic , 4, k) = VSUB(ti4, ti3); + } + } +#undef cc_ref +#undef ch_ref +} /* radf5 */ + +void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2, const float *wa3, const float *wa4) +{ + const v4sf tr11{LD_PS1(0.309016994374947f)}; + const v4sf ti11{LD_PS1(0.951056516295154f)}; + const v4sf tr12{LD_PS1(-0.809016994374947f)}; + const v4sf ti12{LD_PS1(0.587785252292473f)}; + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch -= 1 + ido*(1 + l1); + cc -= 1 + ido*6; + + /* Function Body */ + for(size_t k{1};k <= l1;++k) + { + v4sf ti5{VADD(cc_ref( 1, 3, k), cc_ref(1, 3, k))}; + v4sf ti4{VADD(cc_ref( 1, 5, k), cc_ref(1, 5, k))}; + v4sf tr2{VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k))}; + v4sf tr3{VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k))}; + ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3)); + v4sf cr2{VADD(cc_ref(1, 1, k), VMADD(tr11, tr2, VMUL(tr12, tr3)))}; + v4sf cr3{VADD(cc_ref(1, 1, k), VMADD(tr12, tr2, VMUL(tr11, tr3)))}; + v4sf ci5{VMADD(ti11, ti5, VMUL(ti12, ti4))}; + v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; + ch_ref(1, k, 2) = VSUB(cr2, ci5); + ch_ref(1, k, 3) = VSUB(cr3, ci4); + ch_ref(1, k, 4) = VADD(cr3, ci4); + ch_ref(1, k, 5) = VADD(cr2, ci5); + } + if(ido == 1) + return; + + const size_t idp2{ido + 2}; + for(size_t k{1};k <= l1;++k) + { + for(size_t i{3};i <= ido;i += 2) + { + const size_t ic{idp2 - i}; + v4sf ti5{VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k))}; + v4sf ti2{VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k))}; + v4sf ti4{VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k))}; + v4sf ti3{VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k))}; + v4sf tr5{VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))}; + v4sf tr2{VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))}; + v4sf tr4{VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; + v4sf tr3{VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; + ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3)); + ch_ref(i , k, 1) = VADD(cc_ref(i , 1, k), VADD(ti2, ti3)); + v4sf cr2{VADD(cc_ref(i-1, 1, k), VMADD(tr11, tr2, VMUL(tr12, tr3)))}; + v4sf ci2{VADD(cc_ref(i , 1, k), VMADD(tr11, ti2, VMUL(tr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1, k), VMADD(tr12, tr2, VMUL(tr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1, k), VMADD(tr12, ti2, VMUL(tr11, ti3)))}; + v4sf cr5{VMADD(ti11, tr5, VMUL(ti12, tr4))}; + v4sf ci5{VMADD(ti11, ti5, VMUL(ti12, ti4))}; + v4sf cr4{VSUB(VMUL(ti12, tr5), VMUL(ti11, tr4))}; + v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; + v4sf dr3{VSUB(cr3, ci4)}; + v4sf dr4{VADD(cr3, ci4)}; + v4sf di3{VADD(ci3, cr4)}; + v4sf di4{VSUB(ci3, cr4)}; + v4sf dr5{VADD(cr2, ci5)}; + v4sf dr2{VSUB(cr2, ci5)}; + v4sf di5{VSUB(ci2, cr5)}; + v4sf di2{VADD(ci2, cr5)}; + vcplxmul(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2])); + vcplxmul(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2])); + vcplxmul(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2])); + vcplxmul(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2])); + + 
ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2; + ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3; + ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4; + ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5; + } + } +#undef cc_ref +#undef ch_ref +} /* radb5 */ + +NOINLINE v4sf *rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const al::span<const uint,15> ifac) +{ + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? work1 : work2}; + const size_t nf{ifac[1]}; + size_t l2{n}; + size_t iw{n-1}; + for(size_t k1{1};k1 <= nf;++k1) + { + size_t kh{nf - k1}; + size_t ip{ifac[kh + 2]}; + size_t l1{l2 / ip}; + size_t ido{n / l2}; + iw -= (ip - 1)*ido; + switch(ip) + { + case 5: + { + size_t ix2{iw + ido}; + size_t ix3{ix2 + ido}; + size_t ix4{ix3 + ido}; + radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); + } + break; + case 4: + { + size_t ix2{iw + ido}; + size_t ix3{ix2 + ido}; + radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); + } + break; + case 3: + { + size_t ix2{iw + ido}; + radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); + } + break; + case 2: + radf2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; + } + l2 = l1; + if(out == work2) + { + out = work1; + in = work2; + } + else + { + out = work2; + in = work1; + } + } + return const_cast<v4sf*>(in); /* this is in fact the output .. */ +} /* rfftf1 */ + +NOINLINE v4sf *rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const al::span<const uint,15> ifac) +{ + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? work1 : work2}; + const size_t nf{ifac[1]}; + size_t l1{1}; + size_t iw{0}; + for(size_t k1{1};k1 <= nf;++k1) + { + size_t ip{ifac[k1 + 1]}; + size_t l2{ip*l1}; + size_t ido{n / l2}; + switch(ip) + { + case 5: + { + size_t ix2{iw + ido}; + size_t ix3{ix2 + ido}; + size_t ix4{ix3 + ido}; + radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); + } + break; + case 4: + { + size_t ix2{iw + ido}; + size_t ix3{ix2 + ido}; + radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); + } + break; + case 3: + { + size_t ix2{iw + ido}; + radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); + } + break; + case 2: + radb2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; + } + l1 = l2; + iw += (ip - 1)*ido; + + if(out == work2) + { + out = work1; + in = work2; + } + else + { + out = work2; + in = work1; + } + } + return const_cast<v4sf*>(in); /* this is in fact the output .. */ +} + +v4sf *cfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const al::span<const uint,15> ifac, const float fsign) +{ + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? 
work1 : work2}; + const size_t nf{ifac[1]}; + size_t l1{1}, iw{0}; + for(size_t k1{2};k1 <= nf+1;++k1) + { + const size_t ip{ifac[k1]}; + const size_t l2{ip*l1}; + const size_t ido{n / l2}; + const size_t idot{ido + ido}; + switch(ip) + { + case 5: + { + size_t ix2{iw + idot}; + size_t ix3{ix2 + idot}; + size_t ix4{ix3 + idot}; + passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], fsign); + } + break; + case 4: + { + size_t ix2{iw + idot}; + size_t ix3{ix2 + idot}; + passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], fsign); + } + break; + case 3: + { + size_t ix2{iw + idot}; + passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], fsign); + } + break; + case 2: + passf2_ps(idot, l1, in, out, &wa[iw], fsign); + break; + default: + assert(0); + } + l1 = l2; + iw += (ip - 1)*idot; + if(out == work2) + { + out = work1; + in = work2; + } + else + { + out = work2; + in = work1; + } + } + + return const_cast<v4sf*>(in); /* this is in fact the output .. */ +} + + +uint decompose(const uint n, const al::span<uint,15> ifac, const al::span<const uint,4> ntryh) +{ + uint nl{n}, nf{0}; + for(const uint ntry : ntryh) + { + while(nl != 1) + { + const uint nq{nl / ntry}; + const uint nr{nl % ntry}; + if(nr != 0) break; + + ifac[2+nf++] = ntry; + nl = nq; + if(ntry == 2 && nf != 1) + { + for(size_t i{2};i <= nf;++i) + { + size_t ib{nf - i + 2}; + ifac[ib + 1] = ifac[ib]; + } + ifac[2] = 2; + } + } + } + ifac[0] = n; + ifac[1] = nf; + return nf; +} + +void rffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac) +{ + static constexpr uint ntryh[]{4,2,3,5}; + + const uint nf{decompose(n, ifac, ntryh)}; + const double argh{2.0*al::numbers::pi / n}; + size_t is{0}; + size_t nfm1{nf - 1}; + size_t l1{1}; + for(size_t k1{0};k1 < nfm1;++k1) + { + const size_t ip{ifac[k1+2]}; + const size_t l2{l1*ip}; + const size_t ido{n / l2}; + const size_t ipm{ip - 1}; + size_t ld{0}; + for(size_t j{0};j < ipm;++j) + { + size_t i{is}; + ld += l1; + const double argld{static_cast<double>(ld)*argh}; + double fi{0.0}; + for(size_t ii{2};ii < ido;ii += 2) + { + fi += 1.0; + wa[i++] = static_cast<float>(std::cos(fi*argld)); + wa[i++] = static_cast<float>(std::sin(fi*argld)); + } + is += ido; + } + l1 = l2; + } +} /* rffti1 */ + +void cffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac) +{ + static constexpr uint ntryh[]{5,3,4,2}; + + const uint nf{decompose(n, ifac, ntryh)}; + const double argh{2.0*al::numbers::pi / n}; + size_t i{1}; + size_t l1{1}; + for(size_t k1{0};k1 < nf;++k1) + { + const size_t ip{ifac[k1+2]}; + const size_t l2{l1*ip}; + const size_t ido{n / l2}; + const size_t idot{ido + ido + 2}; + const size_t ipm{ip - 1}; + size_t ld{0}; + for(size_t j{0};j < ipm;++j) + { + size_t i1{i}; + wa[i-1] = 1; + wa[i] = 0; + ld += l1; + const double argld{static_cast<double>(ld)*argh}; + double fi{0.0}; + for(size_t ii{3};ii < idot;ii += 2) + { + fi += 1.0; + wa[++i] = static_cast<float>(std::cos(fi*argld)); + wa[++i] = static_cast<float>(std::sin(fi*argld)); + } + if(ip > 5) + { + wa[i1-1] = wa[i-1]; + wa[i1] = wa[i]; + } + } + l1 = l2; + } +} /* cffti1 */ + +} // namespace + +void *pffft_aligned_malloc(size_t nb_bytes) +{ return al_malloc(MALLOC_V4SF_ALIGNMENT, nb_bytes); } + +void pffft_aligned_free(void *p) { al_free(p); } + +int pffft_simd_size() { return SIMD_SZ; } + +struct PFFFT_Setup { + uint N; + uint Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + std::array<uint,15> ifac; + pffft_transform_t transform; + + float *twiddle; // N/4 elements + 
alignas(MALLOC_V4SF_ALIGNMENT) v4sf e[1]; // N/4*3 elements +}; + +PFFFT_Setup *pffft_new_setup(unsigned int N, pffft_transform_t transform) +{ + assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX); + assert(N > 0); + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + * and 32 for real FFTs -- a lot of stuff would need to be rewritten to + * handle other cases (or maybe just switch to a scalar fft, I don't know..) + */ + if(transform == PFFFT_REAL) + assert((N%(2*SIMD_SZ*SIMD_SZ)) == 0); + else + assert((N%(SIMD_SZ*SIMD_SZ)) == 0); + + const uint Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; + const size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))}; + + void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)}; + if(!store) return nullptr; + + PFFFT_Setup *s{::new(store) PFFFT_Setup{}}; + s->N = N; + s->transform = transform; + /* nb of complex simd vectors */ + s->Ncvec = Ncvec; + s->twiddle = reinterpret_cast<float*>(&s->e[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]); + + if constexpr(SIMD_SZ > 1) + { + auto e = std::vector<float>(2u*Ncvec*(SIMD_SZ-1), 0.0f); + for(size_t k{0};k < s->Ncvec;++k) + { + const size_t i{k / SIMD_SZ}; + const size_t j{k % SIMD_SZ}; + for(size_t m{0};m < SIMD_SZ-1;++m) + { + const double A{-2.0*al::numbers::pi*static_cast<double>((m+1)*k) / N}; + e[((i*3 + m)*2 + 0)*SIMD_SZ + j] = static_cast<float>(std::cos(A)); + e[((i*3 + m)*2 + 1)*SIMD_SZ + j] = static_cast<float>(std::sin(A)); + } + } + std::memcpy(s->e, e.data(), e.size()*sizeof(float)); + } + if(transform == PFFFT_REAL) + rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + else + cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + + /* check that N is decomposable with allowed prime factors */ + size_t m{1}; + for(size_t k{0};k < s->ifac[1];++k) + m *= s->ifac[2+k]; + + if(m != N/SIMD_SZ) + { + pffft_destroy_setup(s); + s = nullptr; + } + + return s; +} + + +void pffft_destroy_setup(PFFFT_Setup *s) +{ + std::destroy_at(s); + al_free(s); +} + +#if !defined(PFFFT_SIMD_DISABLE) + +namespace { + +/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ +void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *RESTRICT out) +{ + v4sf g0, g1; + interleave2(in[0], in[1], g0, g1); + in += in_stride; + + *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] + for(size_t k{1};k < N;++k) + { + v4sf h0, h1; + interleave2(in[0], in[1], h0, h1); + in += in_stride; + *--out = VSWAPHL(g1, h0); + *--out = VSWAPHL(h0, h1); + g1 = h1; + } + *--out = VSWAPHL(g1, g0); +} + +void unreversed_copy(const size_t N, const v4sf *in, v4sf *RESTRICT out, const int out_stride) +{ + v4sf g0{in[0]}, g1{g0}; + ++in; + for(size_t k{1};k < N;++k) + { + v4sf h0{*in++}; v4sf h1{*in++}; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + uninterleave2(h0, g1, out[0], out[1]); + out += out_stride; + g1 = h1; + } + v4sf h0{*in++}, h1{g0}; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + uninterleave2(h0, g1, out[0], out[1]); +} + +void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e) +{ + assert(in != out); + + const size_t dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + for(size_t k{0};k < dk;++k) + { + v4sf r0{in[8*k+0]}, i0{in[8*k+1]}; + v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; + v4sf r2{in[8*k+4]}, i2{in[8*k+5]}; + v4sf r3{in[8*k+6]}, i3{in[8*k+7]}; + vtranspose4(r0,r1,r2,r3); + vtranspose4(i0,i1,i2,i3); + vcplxmul(r1,i1,e[k*6+0],e[k*6+1]); + vcplxmul(r2,i2,e[k*6+2],e[k*6+3]); + vcplxmul(r3,i3,e[k*6+4],e[k*6+5]); + + v4sf sr0{VADD(r0,r2)}, 
dr0{VSUB(r0, r2)}; + v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)}; + v4sf si0{VADD(i0,i2)}, di0{VSUB(i0, i2)}; + v4sf si1{VADD(i1,i3)}, di1{VSUB(i1, i3)}; + + /* transformation for each column is: + * + * [1 1 1 1 0 0 0 0] [r0] + * [1 0 -1 0 0 -1 0 1] [r1] + * [1 -1 1 -1 0 0 0 0] [r2] + * [1 0 -1 0 0 1 0 -1] [r3] + * [0 0 0 0 1 1 1 1] * [i0] + * [0 1 0 -1 1 0 -1 0] [i1] + * [0 0 0 0 1 -1 1 -1] [i2] + * [0 -1 0 1 1 0 -1 0] [i3] + */ + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + +void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e) +{ + assert(in != out); + + const size_t dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + for(size_t k{0};k < dk;++k) + { + v4sf r0{in[8*k+0]}, i0{in[8*k+1]}; + v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; + v4sf r2{in[8*k+4]}, i2{in[8*k+5]}; + v4sf r3{in[8*k+6]}, i3{in[8*k+7]}; + + v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)}; + v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)}; + v4sf si0{VADD(i0,i2)}, di0{VSUB(i0, i2)}; + v4sf si1{VADD(i1,i3)}, di1{VSUB(i1, i3)}; + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); + + vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]); + vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]); + vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]); + + vtranspose4(r0,r1,r2,r3); + vtranspose4(i0,i1,i2,i3); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + + +force_inline void pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, + const v4sf *e, v4sf *RESTRICT out) +{ + v4sf r0{*in0}, i0{*in1}; + v4sf r1{*in++}; v4sf i1{*in++}; + v4sf r2{*in++}; v4sf i2{*in++}; + v4sf r3{*in++}; v4sf i3{*in++}; + vtranspose4(r0,r1,r2,r3); + vtranspose4(i0,i1,i2,i3); + + /* transformation for each column is: + * + * [1 1 1 1 0 0 0 0] [r0] + * [1 0 -1 0 0 -1 0 1] [r1] + * [1 0 -1 0 0 1 0 -1] [r2] + * [1 -1 1 -1 0 0 0 0] [r3] + * [0 0 0 0 1 1 1 1] * [i0] + * [0 -1 0 1 -1 0 1 0] [i1] + * [0 -1 0 1 1 0 -1 0] [i2] + * [0 0 0 0 -1 1 -1 1] [i3] + */ + + //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + vcplxmul(r1,i1,e[0],e[1]); + vcplxmul(r2,i2,e[2],e[3]); + vcplxmul(r3,i3,e[4],e[5]); + + //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0,r2)}; + v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r3,r1)}; + v4sf si0{VADD(i0,i2)}, di0{VSUB(i0,i2)}; + v4sf si1{VADD(i1,i3)}, di1{VSUB(i3,i1)}; + + r0 = VADD(sr0, sr1); + r3 = VSUB(sr0, sr1); + i0 = VADD(si0, si1); + i3 = VSUB(si1, si0); + r1 = VADD(dr0, di1); + r2 = VSUB(dr0, di1); + i1 = VSUB(dr1, di0); + i2 = VADD(dr1, di0); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +NOINLINE void pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, + const v4sf *e) +{ + static constexpr float 
s{al::numbers::sqrt2_v<float>/2.0f}; + + assert(in != out); + const size_t dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + const v4sf zero{VZERO()}; + const auto cr = al::bit_cast<std::array<float,SIMD_SZ>>(in[0]); + const auto ci = al::bit_cast<std::array<float,SIMD_SZ>>(in[Ncvec*2-1]); + pffft_real_finalize_4x4(&zero, &zero, in+1, e, out); + + /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] + * + * [Xr(1)] ] [1 1 1 1 0 0 0 0] + * [Xr(N/4) ] [0 0 0 0 1 s 0 -s] + * [Xr(N/2) ] [1 0 -1 0 0 0 0 0] + * [Xr(3N/4)] [0 0 0 0 1 -s 0 s] + * [Xi(1) ] [1 -1 1 -1 0 0 0 0] + * [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] + * [Xi(N/2) ] [0 -1 0 1 0 0 0 0] + * [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] + */ + + const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; out[0] = VINSERT0(out[0], xr0); + const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; out[1] = VINSERT0(out[1], xi0); + const float xr2{(cr[0]-cr[2])}; out[4] = VINSERT0(out[4], xr2); + const float xi2{(cr[3]-cr[1])}; out[5] = VINSERT0(out[5], xi2); + const float xr1{ ci[0] + s*(ci[1]-ci[3])}; out[2] = VINSERT0(out[2], xr1); + const float xi1{-ci[2] - s*(ci[1]+ci[3])}; out[3] = VINSERT0(out[3], xi1); + const float xr3{ ci[0] - s*(ci[1]-ci[3])}; out[6] = VINSERT0(out[6], xr3); + const float xi3{ ci[2] - s*(ci[1]+ci[3])}; out[7] = VINSERT0(out[7], xi3); + + for(size_t k{1};k < dk;++k) + pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); +} + +force_inline void pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out, + const bool first) +{ + v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]}; + v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]}; + + /* transformation for each column is: + * + * [1 1 1 1 0 0 0 0] [r0] + * [1 0 0 -1 0 -1 -1 0] [r1] + * [1 -1 -1 1 0 0 0 0] [r2] + * [1 0 0 -1 0 1 1 0] [r3] + * [0 0 0 0 1 -1 1 -1] * [i0] + * [0 -1 1 0 1 0 0 1] [i1] + * [0 0 0 0 1 1 -1 -1] [i2] + * [0 1 -1 0 1 0 0 1] [i3] + */ + + v4sf sr0{VADD(r0,r3)}, dr0{VSUB(r0,r3)}; + v4sf sr1{VADD(r1,r2)}, dr1{VSUB(r1,r2)}; + v4sf si0{VADD(i0,i3)}, di0{VSUB(i0,i3)}; + v4sf si1{VADD(i1,i2)}, di1{VSUB(i1,i2)}; + + r0 = VADD(sr0, sr1); + r2 = VSUB(sr0, sr1); + r1 = VSUB(dr0, si1); + r3 = VADD(dr0, si1); + i0 = VSUB(di0, di1); + i2 = VADD(di0, di1); + i1 = VSUB(si0, dr1); + i3 = VADD(si0, dr1); + + vcplxmulconj(r1,i1,e[0],e[1]); + vcplxmulconj(r2,i2,e[2],e[3]); + vcplxmulconj(r3,i3,e[4],e[5]); + + vtranspose4(r0,r1,r2,r3); + vtranspose4(i0,i1,i2,i3); + + if(!first) + { + *out++ = r0; + *out++ = i0; + } + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +NOINLINE void pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, + const v4sf *e) +{ + static constexpr float sqrt2{al::numbers::sqrt2_v<float>}; + + assert(in != out); + const size_t dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... 
f(n-1)r f(n-1)i f(n)r */ + + std::array<float,SIMD_SZ> Xr, Xi; + for(size_t k{0};k < SIMD_SZ;++k) + { + Xr[k] = VEXTRACT0(in[2*k]); + Xi[k] = VEXTRACT0(in[2*k + 1]); + } + + pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values + + /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] + * + * [cr0] [1 0 2 0 1 0 0 0] + * [cr1] [1 0 0 0 -1 0 -2 0] + * [cr2] [1 0 -2 0 1 0 0 0] + * [cr3] [1 0 0 0 -1 0 2 0] + * [ci0] [0 2 0 2 0 0 0 0] + * [ci1] [0 s 0 -s 0 -s 0 -s] + * [ci2] [0 0 0 0 0 -2 0 2] + * [ci3] [0 -s 0 s 0 -s 0 -s] + */ + for(size_t k{1};k < dk;++k) + pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false); + + const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; + const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]}; + const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]}; + const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]}; + out[0] = VSET4(cr0, cr1, cr2, cr3); + const float ci0{ 2*(Xr[1]+Xr[3])}; + const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])}; + const float ci2{ 2*(Xi[3]-Xi[1])}; + const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])}; + out[2*Ncvec-1] = VSET4(ci0, ci1, ci2, ci3); +} + + +void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput, + v4sf *scratch, const pffft_direction_t direction, const bool ordered) +{ + assert(scratch != nullptr); + assert(voutput != scratch); + + const size_t Ncvec{setup->Ncvec}; + const bool nf_odd{(setup->ifac[1]&1) != 0}; + + v4sf *buff[2]{voutput, scratch}; + bool ib{nf_odd != ordered}; + if(direction == PFFFT_FORWARD) + { + /* Swap the initial work buffer for forward FFTs, which helps avoid an + * extra copy for output. + */ + ib = !ib; + if(setup->transform == PFFFT_REAL) + { + ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]); + pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e); + } + else + { + v4sf *tmp{buff[ib]}; + for(size_t k=0; k < Ncvec; ++k) + uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); + + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]); + pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e); + } + if(ordered) + pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]), + reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD); + else + ib = !ib; + } + else + { + if(vinput == buff[ib]) + ib = !ib; // may happen when finput == foutput + + if(ordered) + { + pffft_zreorder(setup, reinterpret_cast<const float*>(vinput), + reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD); + vinput = buff[ib]; + ib = !ib; + } + if(setup->transform == PFFFT_REAL) + { + pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e); + ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]); + } + else + { + pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e); + ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]); + for(size_t k{0};k < Ncvec;++k) + interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); + } + } + + if(buff[ib] != voutput) + { + /* extra copy required -- this situation should only happen when finput == foutput */ + assert(vinput==voutput); + for(size_t k{0};k < Ncvec;++k) + { + v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]}; + voutput[2*k] = a; voutput[2*k+1] = b; + } + } +} + +} // namespace + +void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out, + pffft_direction_t direction) +{ + assert(in != out); + + const size_t N{setup->N}, Ncvec{setup->Ncvec}; + const v4sf 
*vin{reinterpret_cast<const v4sf*>(in)}; + v4sf *RESTRICT vout{reinterpret_cast<v4sf*>(out)}; + if(setup->transform == PFFFT_REAL) + { + const size_t dk{N/32}; + if(direction == PFFFT_FORWARD) + { + for(size_t k{0};k < dk;++k) + { + interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); + interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); + } + reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2); + reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ); + } + else + { + for(size_t k{0};k < dk;++k) + { + uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); + uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); + } + unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8); + unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8); + } + } + else + { + if(direction == PFFFT_FORWARD) + { + for(size_t k{0};k < Ncvec;++k) + { + size_t kk{(k/4) + (k%4)*(Ncvec/4)}; + interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); + } + } + else + { + for(size_t k{0};k < Ncvec;++k) + { + size_t kk{(k/4) + (k%4)*(Ncvec/4)}; + uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); + } + } + } +} + +void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) +{ + const size_t Ncvec{s->Ncvec}; + const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)}; + const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)}; + v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)}; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +#ifndef __clang__ +#define ZCONVOLVE_USING_INLINE_NEON_ASM +#endif +#endif + + const float ar1{VEXTRACT0(va[0])}; + const float ai1{VEXTRACT0(va[1])}; + const float br1{VEXTRACT0(vb[0])}; + const float bi1{VEXTRACT0(vb[1])}; + const float abr1{VEXTRACT0(vab[0])}; + const float abi1{VEXTRACT0(vab[1])}; + +#ifdef ZCONVOLVE_USING_INLINE_ASM + /* Inline asm version, unfortunately miscompiled by clang 3.2, at least on + * Ubuntu. So this will be restricted to GCC. + * + * Does it still miscompile with Clang? Is it even needed with today's + * optimizers? + */ + const float *a_{a}, *b_{b}; float *ab_{ab}; + size_t N{Ncvec}; + asm volatile("mov r8, %2 \n" + "vdup.f32 q15, %4 \n" + "1: \n" + "pld [%0,#64] \n" + "pld [%1,#64] \n" + "pld [%2,#64] \n" + "pld [%0,#96] \n" + "pld [%1,#96] \n" + "pld [%2,#96] \n" + "vld1.f32 {q0,q1}, [%0,:128]! \n" + "vld1.f32 {q4,q5}, [%1,:128]! \n" + "vld1.f32 {q2,q3}, [%0,:128]! \n" + "vld1.f32 {q6,q7}, [%1,:128]! \n" + "vld1.f32 {q8,q9}, [r8,:128]! \n" + + "vmul.f32 q10, q0, q4 \n" + "vmul.f32 q11, q0, q5 \n" + "vmul.f32 q12, q2, q6 \n" + "vmul.f32 q13, q2, q7 \n" + "vmls.f32 q10, q1, q5 \n" + "vmla.f32 q11, q1, q4 \n" + "vld1.f32 {q0,q1}, [r8,:128]! \n" + "vmls.f32 q12, q3, q7 \n" + "vmla.f32 q13, q3, q6 \n" + "vmla.f32 q8, q10, q15 \n" + "vmla.f32 q9, q11, q15 \n" + "vmla.f32 q0, q12, q15 \n" + "vmla.f32 q1, q13, q15 \n" + "vst1.f32 {q8,q9},[%2,:128]! \n" + "vst1.f32 {q0,q1},[%2,:128]! 
\n" + "subs %3, #2 \n" + "bne 1b \n" + : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); + +#else + + /* Default routine, works fine for non-arm cpus with current compilers. */ + const v4sf vscal{LD_PS1(scaling)}; + for(size_t i{0};i < Ncvec;i += 2) + { + v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; + v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]}; + vcplxmul(ar4, ai4, br4, bi4); + vab[2*i+0] = VMADD(ar4, vscal, vab[2*i+0]); + vab[2*i+1] = VMADD(ai4, vscal, vab[2*i+1]); + ar4 = va[2*i+2]; ai4 = va[2*i+3]; + br4 = vb[2*i+2]; bi4 = vb[2*i+3]; + vcplxmul(ar4, ai4, br4, bi4); + vab[2*i+2] = VMADD(ar4, vscal, vab[2*i+2]); + vab[2*i+3] = VMADD(ai4, vscal, vab[2*i+3]); + } +#endif + + if(s->transform == PFFFT_REAL) + { + vab[0] = VINSERT0(vab[0], abr1 + ar1*br1*scaling); + vab[1] = VINSERT0(vab[1], abi1 + ai1*bi1*scaling); + } +} + +void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab) +{ + const size_t Ncvec{s->Ncvec}; + const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)}; + const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)}; + v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)}; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +#endif + + const float ar1{VEXTRACT0(va[0])}; + const float ai1{VEXTRACT0(va[1])}; + const float br1{VEXTRACT0(vb[0])}; + const float bi1{VEXTRACT0(vb[1])}; + const float abr1{VEXTRACT0(vab[0])}; + const float abi1{VEXTRACT0(vab[1])}; + + /* No inline assembly for this version. I'm not familiar enough with NEON + * assembly, and I don't know that it's needed with today's optimizers. + */ + for(size_t i{0};i < Ncvec;i += 2) + { + v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; + v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]}; + vcplxmul(ar4, ai4, br4, bi4); + vab[2*i+0] = VADD(ar4, vab[2*i+0]); + vab[2*i+1] = VADD(ai4, vab[2*i+1]); + ar4 = va[2*i+2]; ai4 = va[2*i+3]; + br4 = vb[2*i+2]; bi4 = vb[2*i+3]; + vcplxmul(ar4, ai4, br4, bi4); + vab[2*i+2] = VADD(ar4, vab[2*i+2]); + vab[2*i+3] = VADD(ai4, vab[2*i+3]); + } + + if(s->transform == PFFFT_REAL) + { + vab[0] = VINSERT0(vab[0], abr1 + ar1*br1); + vab[1] = VINSERT0(vab[1], abi1 + ai1*bi1); + } +} + + +void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work, + pffft_direction_t direction) +{ + assert(valigned(input) && valigned(output) && valigned(work)); + pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)), + reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)), + reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false); +} + +void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output, + float *work, pffft_direction_t direction) +{ + assert(valigned(input) && valigned(output) && valigned(work)); + pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)), + reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)), + reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true); +} + +#else // defined(PFFFT_SIMD_DISABLE) + +// standard routine using scalar floats, without SIMD stuff. 
+ +namespace { + +#define pffft_transform_internal_nosimd pffft_transform_internal +void pffft_transform_internal_nosimd(const PFFFT_Setup *setup, const float *input, float *output, + float *scratch, const pffft_direction_t direction, bool ordered) +{ + const size_t Ncvec{setup->Ncvec}; + const bool nf_odd{(setup->ifac[1]&1) != 0}; + + assert(scratch != nullptr); + + /* z-domain data for complex transforms is already ordered without SIMD. */ + if(setup->transform == PFFFT_COMPLEX) + ordered = false; + + float *buff[2]{output, scratch}; + bool ib{nf_odd != ordered}; + if(direction == PFFFT_FORWARD) + { + if(setup->transform == PFFFT_REAL) + ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]); + else + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]); + if(ordered) + { + pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); + ib = !ib; + } + } + else + { + if (input == buff[ib]) + ib = !ib; // may happen when finput == foutput + + if(ordered) + { + pffft_zreorder(setup, input, buff[ib], PFFFT_BACKWARD); + input = buff[ib]; + ib = !ib; + } + if(setup->transform == PFFFT_REAL) + ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]); + else + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]); + } + if(buff[ib] != output) + { + // extra copy required -- this situation should happens only when finput == foutput + assert(input==output); + for(size_t k{0};k < Ncvec;++k) + { + float a{buff[ib][2*k]}, b{buff[ib][2*k+1]}; + output[2*k] = a; output[2*k+1] = b; + } + } +} + +} // namespace + +#define pffft_zreorder_nosimd pffft_zreorder +void pffft_zreorder_nosimd(const PFFFT_Setup *setup, const float *in, float *RESTRICT out, + pffft_direction_t direction) +{ + const size_t N{setup->N}; + if(setup->transform == PFFFT_COMPLEX) + { + for(size_t k{0};k < 2*N;++k) + out[k] = in[k]; + return; + } + else if(direction == PFFFT_FORWARD) + { + float x_N{in[N-1]}; + for(size_t k{N-1};k > 1;--k) + out[k] = in[k-1]; + out[0] = in[0]; + out[1] = x_N; + } + else + { + float x_N{in[1]}; + for(size_t k{1};k < N-1;++k) + out[k] = in[k+1]; + out[0] = in[0]; + out[N-1] = x_N; + } +} + +void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) +{ + size_t Ncvec{s->Ncvec}; + + if(s->transform == PFFFT_REAL) + { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]*scaling; + ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling; + ++ab; ++a; ++b; --Ncvec; + } + for(size_t i{0};i < Ncvec;++i) + { + float ar{a[2*i+0]}, ai{a[2*i+1]}; + const float br{b[2*i+0]}, bi{b[2*i+1]}; + vcplxmul(ar, ai, br, bi); + ab[2*i+0] += ar*scaling; + ab[2*i+1] += ai*scaling; + } +} + +void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab) +{ + size_t Ncvec{s->Ncvec}; + + if(s->transform == PFFFT_REAL) + { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]; + ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]; + ++ab; ++a; ++b; --Ncvec; + } + for(size_t i{0};i < Ncvec;++i) + { + float ar{a[2*i+0]}, ai{a[2*i+1]}; + const float br{b[2*i+0]}, bi{b[2*i+1]}; + vcplxmul(ar, ai, br, bi); + ab[2*i+0] += ar; + ab[2*i+1] += ai; + } +} + + +void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work, + pffft_direction_t direction) +{ + pffft_transform_internal(setup, input, output, work, direction, false); +} + +void 
pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output, + float *work, pffft_direction_t direction) +{ + pffft_transform_internal(setup, input, output, work, direction, true); +} + +#endif // defined(PFFFT_SIMD_DISABLE) diff --git a/common/pffft.h b/common/pffft.h new file mode 100644 index 00000000..9cff9e54 --- /dev/null +++ b/common/pffft.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2013 Julien Pommier ( [email protected] ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB, + authored by Dr Paul Swarztrauber of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +/* PFFFT : a Pretty Fast FFT. + * + * This is basically an adaptation of the single precision fftpack (v4) as + * found on netlib taking advantage of SIMD instructions found on CPUs such as + * Intel x86 (SSE1), PowerPC (Altivec), and Arm (NEON). + * + * For architectures where SIMD instructions aren't available, the code falls + * back to a scalar version. + * + * Restrictions: + * + * - 1D transforms only, with 32-bit single precision. + * + * - supports only transforms for inputs of length N of the form + * N=(2^a)*(3^b)*(5^c), given a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, 144, + * 160, etc are all acceptable lengths). Performance is best for 128<=N<=8192. + * + * - all (float*) pointers for the functions below are expected to have a + * "SIMD-compatible" alignment, that is 16 bytes. + * + * You can allocate such buffers with the pffft_aligned_malloc function, and + * deallocate them with pffft_aligned_free (or with stuff like posix_memalign, + * aligned_alloc, etc). 
+ * + * Note that for the z-domain data of real transforms, when in the canonical + * order (as interleaved complex numbers) both 0-frequency and half-frequency + * components, which are real, are assembled in the first entry as + * F(0)+i*F(n/2+1). The original fftpack placed F(n/2+1) at the end of the + * arrays instead. + */ + +#ifndef PFFFT_H +#define PFFFT_H + +#include <stddef.h> // for size_t +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* opaque struct holding internal stuff (precomputed twiddle factors) this + * struct can be shared by many threads as it contains only read-only data. + */ +typedef struct PFFFT_Setup PFFFT_Setup; + +#ifndef PFFFT_COMMON_ENUMS +#define PFFFT_COMMON_ENUMS + +/* direction of the transform */ +typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; + +/* type of transform */ +typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; + +#endif + +/** + * Prepare for performing transforms of size N -- the returned PFFFT_Setup + * structure is read-only so it can safely be shared by multiple concurrent + * threads. + */ +PFFFT_Setup *pffft_new_setup(unsigned int N, pffft_transform_t transform); +void pffft_destroy_setup(PFFFT_Setup *setup); + +/** + * Perform a Fourier transform. The z-domain data is stored in the most + * efficient order for transforming back or using for convolution, and as + * such, there's no guarantee to the order of the values. If you need to have + * its content sorted in the usual way, that is as an array of interleaved + * complex numbers, either use pffft_transform_ordered, or call pffft_zreorder + * after the forward fft and before the backward fft. + * + * Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. Typically + * you will want to scale the backward transform by 1/N. + * + * The 'work' pointer must point to an area of N (2*N for complex fft) floats, + * properly aligned. It cannot be NULL. + * + * The input and output parameters may alias. + */ +void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + +/** + * Similar to pffft_transform, but handles the complex values in the usual form + * (interleaved complex numbers). This is similar to calling + * pffft_transform(..., PFFFT_FORWARD) followed by + * pffft_zreorder(..., PFFFT_FORWARD), or + * pffft_zreorder(..., PFFFT_BACKWARD) followed by + * pffft_transform(..., PFFFT_BACKWARD), for the given direction. + * + * The input and output parameters may alias. + */ +void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + +/** + * Reorder the z-domain data. For PFFFT_FORWARD, it reorders from the internal + * representation to the "canonical" order (as interleaved complex numbers). + * For PFFFT_BACKWARD, it reorders from the canonical order to the internal + * order suitable for pffft_transform(..., PFFFT_BACKWARD) or + * pffft_zconvolve_accumulate. + * + * The input and output parameters should not alias. + */ +void pffft_zreorder(const PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); + +/** + * Perform a multiplication of the z-domain data in dft_a and dft_b, and scale + * and accumulate into dft_ab. 
The arrays should have been obtained with + * pffft_transform(..., PFFFT_FORWARD) or pffft_zreorder(..., PFFFT_BACKWARD) + * and should *not* be in the usual order (otherwise just perform the operation + * yourself as the dft coeffs are stored as interleaved complex numbers). + * + * The operation performed is: dft_ab += (dft_a * dft_b)*scaling + * + * The dft_a, dft_b, and dft_ab parameters may alias. + */ +void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + +/** + * Perform a multiplication of the z-domain data in dft_a and dft_b, and + * accumulate into dft_ab. + * + * The operation performed is: dft_ab += dft_a * dft_b + * + * The dft_a, dft_b, and dft_ab parameters may alias. + */ +void pffft_zconvolve_accumulate(const PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab); + +/** + * The float buffers must have the correct alignment (16-byte boundary on intel + * and powerpc). This function may be used to obtain such correctly aligned + * buffers. + */ +void *pffft_aligned_malloc(size_t nb_bytes); +void pffft_aligned_free(void *ptr); + +/* Return 4 or 1 depending if vectorization was enable when building pffft.cpp. */ +int pffft_simd_size(); + +#ifdef __cplusplus +} +#endif + +#endif // PFFFT_H diff --git a/common/phase_shifter.h b/common/phase_shifter.h index 0d4166bc..e1a83dab 100644 --- a/common/phase_shifter.h +++ b/common/phase_shifter.h @@ -9,11 +9,14 @@ #include <array> #include <stddef.h> +#include <type_traits> #include "alcomplex.h" #include "alspan.h" +struct NoInit { }; + /* Implements a wide-band +90 degree phase-shift. Note that this should be * given one sample less of a delay (FilterSize/2 - 1) compared to the direct * signal delay (FilterSize/2) to properly align. @@ -53,14 +56,16 @@ struct PhaseShifterT { std::fill_n(fftBuffer.get(), fft_size, complex_d{}); fftBuffer[half_size] = 1.0; - forward_fft(al::as_span(fftBuffer.get(), fft_size)); - for(size_t i{0};i < half_size+1;++i) + forward_fft(al::span{fftBuffer.get(), fft_size}); + fftBuffer[0] *= std::numeric_limits<double>::epsilon(); + for(size_t i{1};i < half_size;++i) fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()}; + fftBuffer[half_size] *= std::numeric_limits<double>::epsilon(); for(size_t i{half_size+1};i < fft_size;++i) fftBuffer[i] = std::conj(fftBuffer[fft_size - i]); - inverse_fft(al::as_span(fftBuffer.get(), fft_size)); + inverse_fft(al::span{fftBuffer.get(), fft_size}); - auto fftiter = fftBuffer.get() + half_size + (FilterSize/2 - 1); + auto fftiter = fftBuffer.get() + fft_size - 1; for(float &coeff : mCoeffs) { coeff = static_cast<float>(fftiter->real() / double{fft_size}); @@ -68,29 +73,12 @@ struct PhaseShifterT { } } + PhaseShifterT(NoInit) { } + void process(al::span<float> dst, const float *RESTRICT src) const; private: #if defined(HAVE_NEON) - /* There doesn't seem to be NEON intrinsics to do this kind of stipple - * shuffling, so there's two custom methods for it. 
- */ - static auto shuffle_2020(float32x4_t a, float32x4_t b) - { - float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))}; - ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3); - return ret; - } - static auto shuffle_3131(float32x4_t a, float32x4_t b) - { - float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))}; - ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3); - return ret; - } static auto unpacklo(float32x4_t a, float32x4_t b) { float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))}; @@ -171,9 +159,10 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])}; const float32x4_t s0{vld1q_f32(&src[j*2])}; const float32x4_t s1{vld1q_f32(&src[j*2 + 4])}; + const float32x4x2_t values{vuzpq_f32(s0, s1)}; - r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs); - r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs); + r04 = vmlaq_f32(r04, values.val[0], coeffs); + r14 = vmlaq_f32(r14, values.val[1], coeffs); } src += 2; diff --git a/common/polyphase_resampler.cpp b/common/polyphase_resampler.cpp index 14f7e40d..9c569b3a 100644 --- a/common/polyphase_resampler.cpp +++ b/common/polyphase_resampler.cpp @@ -3,6 +3,8 @@ #include <algorithm> #include <cmath> +#include <numeric> +#include <stdexcept> #include "alnumbers.h" #include "opthelpers.h" @@ -14,34 +16,36 @@ constexpr double Epsilon{1e-9}; using uint = unsigned int; -/* This is the normalized cardinal sine (sinc) function. - * - * sinc(x) = { 1, x = 0 - * { sin(pi x) / (pi x), otherwise. - */ -double Sinc(const double x) -{ - if(std::abs(x) < Epsilon) UNLIKELY - return 1.0; - return std::sin(al::numbers::pi*x) / (al::numbers::pi*x); -} +#if __cpp_lib_math_special_functions >= 201603L +using std::cyl_bessel_i; + +#else /* The zero-order modified Bessel function of the first kind, used for the * Kaiser window. * * I_0(x) = sum_{k=0}^inf (1 / k!)^2 (x / 2)^(2 k) * = sum_{k=0}^inf ((x / 2)^k / k!)^2 + * + * This implementation only handles nu = 0, and isn't the most precise (it + * starts with the largest value and accumulates successively smaller values, + * compounding the rounding and precision error), but it's good enough. */ -constexpr double BesselI_0(const double x) +template<typename T, typename U> +U cyl_bessel_i(T nu, U x) { - // Start at k=1 since k=0 is trivial. + if(nu != T{0}) + throw std::runtime_error{"cyl_bessel_i: nu != 0"}; + + /* Start at k=1 since k=0 is trivial. */ const double x2{x/2.0}; double term{1.0}; double sum{1.0}; int k{1}; - // Let the integration converge until the term of the sum is no longer - // significant. + /* Let the integration converge until the term of the sum is no longer + * significant. + */ double last_sum{}; do { const double y{x2 / k}; @@ -50,7 +54,20 @@ constexpr double BesselI_0(const double x) term *= y * y; sum += term; } while(sum != last_sum); - return sum; + return static_cast<U>(sum); +} +#endif + +/* This is the normalized cardinal sine (sinc) function. + * + * sinc(x) = { 1, x = 0 + * { sin(pi x) / (pi x), otherwise. 
+ */ +double Sinc(const double x) +{ + if(std::abs(x) < Epsilon) UNLIKELY + return 1.0; + return std::sin(al::numbers::pi*x) / (al::numbers::pi*x); } /* Calculate a Kaiser window from the given beta value and a normalized k @@ -67,23 +84,11 @@ constexpr double BesselI_0(const double x) * * k = 2 i / M - 1, where 0 <= i <= M. */ -double Kaiser(const double b, const double k) +double Kaiser(const double beta, const double k, const double besseli_0_beta) { if(!(k >= -1.0 && k <= 1.0)) return 0.0; - return BesselI_0(b * std::sqrt(1.0 - k*k)) / BesselI_0(b); -} - -// Calculates the greatest common divisor of a and b. -constexpr uint Gcd(uint x, uint y) -{ - while(y > 0) - { - const uint z{y}; - y = x % y; - x = z; - } - return x; + return cyl_bessel_i(0, beta * std::sqrt(1.0 - k*k)) / besseli_0_beta; } /* Calculates the size (order) of the Kaiser window. Rejection is in dB and @@ -124,11 +129,11 @@ constexpr double CalcKaiserBeta(const double rejection) * p -- gain compensation factor when sampling * f_t -- normalized center frequency (or cutoff; 0.5 is nyquist) */ -double SincFilter(const uint l, const double b, const double gain, const double cutoff, - const uint i) +double SincFilter(const uint l, const double beta, const double besseli_0_beta, const double gain, + const double cutoff, const uint i) { const double x{static_cast<double>(i) - l}; - return Kaiser(b, x / l) * 2.0 * gain * cutoff * Sinc(2.0 * cutoff * x); + return Kaiser(beta, x/l, besseli_0_beta) * 2.0 * gain * cutoff * Sinc(2.0 * cutoff * x); } } // namespace @@ -137,7 +142,7 @@ double SincFilter(const uint l, const double b, const double gain, const double // that's used to cut frequencies above the destination nyquist. void PPhaseResampler::init(const uint srcRate, const uint dstRate) { - const uint gcd{Gcd(srcRate, dstRate)}; + const uint gcd{std::gcd(srcRate, dstRate)}; mP = dstRate / gcd; mQ = srcRate / gcd; @@ -160,11 +165,12 @@ void PPhaseResampler::init(const uint srcRate, const uint dstRate) // calculating the left offset to avoid increasing the transition width. 
const uint l{(CalcKaiserOrder(180.0, width)+1) / 2}; const double beta{CalcKaiserBeta(180.0)}; + const double besseli_0_beta{cyl_bessel_i(0, beta)}; mM = l*2 + 1; mL = l; mF.resize(mM); for(uint i{0};i < mM;i++) - mF[i] = SincFilter(l, beta, mP, cutoff, i); + mF[i] = SincFilter(l, beta, besseli_0_beta, mP, cutoff, i); } // Perform the upsample-filter-downsample resampling operation using a diff --git a/common/ringbuffer.cpp b/common/ringbuffer.cpp index 0aec1d49..af1f3669 100644 --- a/common/ringbuffer.cpp +++ b/common/ringbuffer.cpp @@ -29,9 +29,9 @@ #include "almalloc.h" -RingBufferPtr RingBuffer::Create(size_t sz, size_t elem_sz, int limit_writes) +RingBufferPtr RingBuffer::Create(std::size_t sz, std::size_t elem_sz, int limit_writes) { - size_t power_of_two{0u}; + std::size_t power_of_two{0u}; if(sz > 0) { power_of_two = sz; @@ -40,15 +40,14 @@ RingBufferPtr RingBuffer::Create(size_t sz, size_t elem_sz, int limit_writes) power_of_two |= power_of_two>>4; power_of_two |= power_of_two>>8; power_of_two |= power_of_two>>16; -#if SIZE_MAX > UINT_MAX - power_of_two |= power_of_two>>32; -#endif + if constexpr(SIZE_MAX > UINT_MAX) + power_of_two |= power_of_two>>32; } ++power_of_two; - if(power_of_two <= sz || power_of_two > std::numeric_limits<size_t>::max()/elem_sz) + if(power_of_two <= sz || power_of_two > std::numeric_limits<std::size_t>::max()/elem_sz) throw std::overflow_error{"Ring buffer size overflow"}; - const size_t bufbytes{power_of_two * elem_sz}; + const std::size_t bufbytes{power_of_two * elem_sz}; RingBufferPtr rb{new(FamCount(bufbytes)) RingBuffer{bufbytes}}; rb->mWriteSize = limit_writes ? sz : (power_of_two-1); rb->mSizeMask = power_of_two - 1; @@ -61,20 +60,20 @@ void RingBuffer::reset() noexcept { mWritePtr.store(0, std::memory_order_relaxed); mReadPtr.store(0, std::memory_order_relaxed); - std::fill_n(mBuffer.begin(), (mSizeMask+1)*mElemSize, al::byte{}); + std::fill_n(mBuffer.begin(), (mSizeMask+1)*mElemSize, std::byte{}); } -size_t RingBuffer::read(void *dest, size_t cnt) noexcept +std::size_t RingBuffer::read(void *dest, std::size_t cnt) noexcept { - const size_t free_cnt{readSpace()}; + const std::size_t free_cnt{readSpace()}; if(free_cnt == 0) return 0; - const size_t to_read{std::min(cnt, free_cnt)}; - size_t read_ptr{mReadPtr.load(std::memory_order_relaxed) & mSizeMask}; + const std::size_t to_read{std::min(cnt, free_cnt)}; + std::size_t read_ptr{mReadPtr.load(std::memory_order_relaxed) & mSizeMask}; - size_t n1, n2; - const size_t cnt2{read_ptr + to_read}; + std::size_t n1, n2; + const std::size_t cnt2{read_ptr + to_read}; if(cnt2 > mSizeMask+1) { n1 = mSizeMask+1 - read_ptr; @@ -87,7 +86,7 @@ size_t RingBuffer::read(void *dest, size_t cnt) noexcept } auto outiter = std::copy_n(mBuffer.begin() + read_ptr*mElemSize, n1*mElemSize, - static_cast<al::byte*>(dest)); + static_cast<std::byte*>(dest)); read_ptr += n1; if(n2 > 0) { @@ -98,16 +97,16 @@ size_t RingBuffer::read(void *dest, size_t cnt) noexcept return to_read; } -size_t RingBuffer::peek(void *dest, size_t cnt) const noexcept +std::size_t RingBuffer::peek(void *dest, std::size_t cnt) const noexcept { - const size_t free_cnt{readSpace()}; + const std::size_t free_cnt{readSpace()}; if(free_cnt == 0) return 0; - const size_t to_read{std::min(cnt, free_cnt)}; - size_t read_ptr{mReadPtr.load(std::memory_order_relaxed) & mSizeMask}; + const std::size_t to_read{std::min(cnt, free_cnt)}; + std::size_t read_ptr{mReadPtr.load(std::memory_order_relaxed) & mSizeMask}; - size_t n1, n2; - const size_t cnt2{read_ptr + 
to_read}; + std::size_t n1, n2; + const std::size_t cnt2{read_ptr + to_read}; if(cnt2 > mSizeMask+1) { n1 = mSizeMask+1 - read_ptr; @@ -120,22 +119,22 @@ size_t RingBuffer::peek(void *dest, size_t cnt) const noexcept } auto outiter = std::copy_n(mBuffer.begin() + read_ptr*mElemSize, n1*mElemSize, - static_cast<al::byte*>(dest)); + static_cast<std::byte*>(dest)); if(n2 > 0) std::copy_n(mBuffer.begin(), n2*mElemSize, outiter); return to_read; } -size_t RingBuffer::write(const void *src, size_t cnt) noexcept +std::size_t RingBuffer::write(const void *src, std::size_t cnt) noexcept { - const size_t free_cnt{writeSpace()}; + const std::size_t free_cnt{writeSpace()}; if(free_cnt == 0) return 0; - const size_t to_write{std::min(cnt, free_cnt)}; - size_t write_ptr{mWritePtr.load(std::memory_order_relaxed) & mSizeMask}; + const std::size_t to_write{std::min(cnt, free_cnt)}; + std::size_t write_ptr{mWritePtr.load(std::memory_order_relaxed) & mSizeMask}; - size_t n1, n2; - const size_t cnt2{write_ptr + to_write}; + std::size_t n1, n2; + const std::size_t cnt2{write_ptr + to_write}; if(cnt2 > mSizeMask+1) { n1 = mSizeMask+1 - write_ptr; @@ -147,7 +146,7 @@ size_t RingBuffer::write(const void *src, size_t cnt) noexcept n2 = 0; } - auto srcbytes = static_cast<const al::byte*>(src); + auto srcbytes = static_cast<const std::byte*>(src); std::copy_n(srcbytes, n1*mElemSize, mBuffer.begin() + write_ptr*mElemSize); write_ptr += n1; if(n2 > 0) @@ -164,26 +163,26 @@ auto RingBuffer::getReadVector() const noexcept -> DataPair { DataPair ret; - size_t w{mWritePtr.load(std::memory_order_acquire)}; - size_t r{mReadPtr.load(std::memory_order_acquire)}; + std::size_t w{mWritePtr.load(std::memory_order_acquire)}; + std::size_t r{mReadPtr.load(std::memory_order_acquire)}; w &= mSizeMask; r &= mSizeMask; - const size_t free_cnt{(w-r) & mSizeMask}; + const std::size_t free_cnt{(w-r) & mSizeMask}; - const size_t cnt2{r + free_cnt}; + const std::size_t cnt2{r + free_cnt}; if(cnt2 > mSizeMask+1) { /* Two part vector: the rest of the buffer after the current read ptr, * plus some from the start of the buffer. */ - ret.first.buf = const_cast<al::byte*>(mBuffer.data() + r*mElemSize); + ret.first.buf = const_cast<std::byte*>(mBuffer.data() + r*mElemSize); ret.first.len = mSizeMask+1 - r; - ret.second.buf = const_cast<al::byte*>(mBuffer.data()); + ret.second.buf = const_cast<std::byte*>(mBuffer.data()); ret.second.len = cnt2 & mSizeMask; } else { /* Single part vector: just the rest of the buffer */ - ret.first.buf = const_cast<al::byte*>(mBuffer.data() + r*mElemSize); + ret.first.buf = const_cast<std::byte*>(mBuffer.data() + r*mElemSize); ret.first.len = free_cnt; ret.second.buf = nullptr; ret.second.len = 0; @@ -196,25 +195,25 @@ auto RingBuffer::getWriteVector() const noexcept -> DataPair { DataPair ret; - size_t w{mWritePtr.load(std::memory_order_acquire)}; - size_t r{mReadPtr.load(std::memory_order_acquire) + mWriteSize - mSizeMask}; + std::size_t w{mWritePtr.load(std::memory_order_acquire)}; + std::size_t r{mReadPtr.load(std::memory_order_acquire) + mWriteSize - mSizeMask}; w &= mSizeMask; r &= mSizeMask; - const size_t free_cnt{(r-w-1) & mSizeMask}; + const std::size_t free_cnt{(r-w-1) & mSizeMask}; - const size_t cnt2{w + free_cnt}; + const std::size_t cnt2{w + free_cnt}; if(cnt2 > mSizeMask+1) { /* Two part vector: the rest of the buffer after the current write ptr, * plus some from the start of the buffer. 
*/ - ret.first.buf = const_cast<al::byte*>(mBuffer.data() + w*mElemSize); + ret.first.buf = const_cast<std::byte*>(mBuffer.data() + w*mElemSize); ret.first.len = mSizeMask+1 - w; - ret.second.buf = const_cast<al::byte*>(mBuffer.data()); + ret.second.buf = const_cast<std::byte*>(mBuffer.data()); ret.second.len = cnt2 & mSizeMask; } else { - ret.first.buf = const_cast<al::byte*>(mBuffer.data() + w*mElemSize); + ret.first.buf = const_cast<std::byte*>(mBuffer.data() + w*mElemSize); ret.first.len = free_cnt; ret.second.buf = nullptr; ret.second.len = 0; diff --git a/common/ringbuffer.h b/common/ringbuffer.h index 2a3797b0..8c65c3af 100644 --- a/common/ringbuffer.h +++ b/common/ringbuffer.h @@ -2,11 +2,10 @@ #define RINGBUFFER_H #include <atomic> +#include <cstddef> #include <memory> -#include <stddef.h> #include <utility> -#include "albyte.h" #include "almalloc.h" @@ -18,23 +17,23 @@ struct RingBuffer { private: - std::atomic<size_t> mWritePtr{0u}; - std::atomic<size_t> mReadPtr{0u}; - size_t mWriteSize{0u}; - size_t mSizeMask{0u}; - size_t mElemSize{0u}; + std::atomic<std::size_t> mWritePtr{0u}; + std::atomic<std::size_t> mReadPtr{0u}; + std::size_t mWriteSize{0u}; + std::size_t mSizeMask{0u}; + std::size_t mElemSize{0u}; - al::FlexArray<al::byte, 16> mBuffer; + al::FlexArray<std::byte, 16> mBuffer; public: struct Data { - al::byte *buf; - size_t len; + std::byte *buf; + std::size_t len; }; using DataPair = std::pair<Data,Data>; - RingBuffer(const size_t count) : mBuffer{count} { } + RingBuffer(const std::size_t count) : mBuffer{count} { } /** Reset the read and write pointers to zero. This is not thread safe. */ void reset() noexcept; @@ -56,7 +55,7 @@ public: * Return the number of elements available for reading. This is the number * of elements in front of the read pointer and behind the write pointer. */ - size_t readSpace() const noexcept + std::size_t readSpace() const noexcept { const size_t w{mWritePtr.load(std::memory_order_acquire)}; const size_t r{mReadPtr.load(std::memory_order_acquire)}; @@ -67,14 +66,14 @@ public: * The copying data reader. Copy at most `cnt' elements into `dest'. * Returns the actual number of elements copied. */ - size_t read(void *dest, size_t cnt) noexcept; + std::size_t read(void *dest, std::size_t cnt) noexcept; /** * The copying data reader w/o read pointer advance. Copy at most `cnt' * elements into `dest'. Returns the actual number of elements copied. */ - size_t peek(void *dest, size_t cnt) const noexcept; + std::size_t peek(void *dest, std::size_t cnt) const noexcept; /** Advance the read pointer `cnt' places. */ - void readAdvance(size_t cnt) noexcept + void readAdvance(std::size_t cnt) noexcept { mReadPtr.fetch_add(cnt, std::memory_order_acq_rel); } @@ -82,7 +81,7 @@ public: * Return the number of elements available for writing. This is the number * of elements in front of the write pointer and behind the read pointer. */ - size_t writeSpace() const noexcept + std::size_t writeSpace() const noexcept { const size_t w{mWritePtr.load(std::memory_order_acquire)}; const size_t r{mReadPtr.load(std::memory_order_acquire) + mWriteSize - mSizeMask}; @@ -93,12 +92,12 @@ public: * The copying data writer. Copy at most `cnt' elements from `src'. Returns * the actual number of elements copied. */ - size_t write(const void *src, size_t cnt) noexcept; + std::size_t write(const void *src, std::size_t cnt) noexcept; /** Advance the write pointer `cnt' places. 
*/ - void writeAdvance(size_t cnt) noexcept + void writeAdvance(std::size_t cnt) noexcept { mWritePtr.fetch_add(cnt, std::memory_order_acq_rel); } - size_t getElemSize() const noexcept { return mElemSize; } + std::size_t getElemSize() const noexcept { return mElemSize; } /** * Create a new ringbuffer to hold at least `sz' elements of `elem_sz' @@ -106,7 +105,7 @@ public: * (even if it is already a power of two, to ensure the requested amount * can be written). */ - static std::unique_ptr<RingBuffer> Create(size_t sz, size_t elem_sz, int limit_writes); + static std::unique_ptr<RingBuffer> Create(std::size_t sz, std::size_t elem_sz, int limit_writes); DEF_FAM_NEWDEL(RingBuffer, mBuffer) }; diff --git a/common/strutils.cpp b/common/strutils.cpp index d0418eff..f7868e2e 100644 --- a/common/strutils.cpp +++ b/common/strutils.cpp @@ -10,31 +10,32 @@ #define WIN32_LEAN_AND_MEAN #include <windows.h> -std::string wstr_to_utf8(const WCHAR *wstr) +std::string wstr_to_utf8(std::wstring_view wstr) { std::string ret; - int len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, nullptr, 0, nullptr, nullptr); + int len{WideCharToMultiByte(CP_UTF8, 0, wstr.data(), static_cast<int>(wstr.length()), nullptr, + 0, nullptr, nullptr)}; if(len > 0) { ret.resize(len); - WideCharToMultiByte(CP_UTF8, 0, wstr, -1, &ret[0], len, nullptr, nullptr); - ret.pop_back(); + WideCharToMultiByte(CP_UTF8, 0, wstr.data(), static_cast<int>(wstr.length()), &ret[0], len, + nullptr, nullptr); } return ret; } -std::wstring utf8_to_wstr(const char *str) +std::wstring utf8_to_wstr(std::string_view str) { std::wstring ret; - int len = MultiByteToWideChar(CP_UTF8, 0, str, -1, nullptr, 0); + int len{MultiByteToWideChar(CP_UTF8, 0, str.data(), static_cast<int>(str.length()), nullptr, + 0)}; if(len > 0) { ret.resize(len); - MultiByteToWideChar(CP_UTF8, 0, str, -1, &ret[0], len); - ret.pop_back(); + MultiByteToWideChar(CP_UTF8, 0, str.data(), static_cast<int>(str.length()), &ret[0], len); } return ret; @@ -43,21 +44,25 @@ std::wstring utf8_to_wstr(const char *str) namespace al { -al::optional<std::string> getenv(const char *envname) +std::optional<std::string> getenv(const char *envname) { +#ifdef _GAMING_XBOX + const char *str{::getenv(envname)}; +#else const char *str{std::getenv(envname)}; +#endif if(str && str[0] != '\0') return str; - return al::nullopt; + return std::nullopt; } #ifdef _WIN32 -al::optional<std::wstring> getenv(const WCHAR *envname) +std::optional<std::wstring> getenv(const WCHAR *envname) { const WCHAR *str{_wgetenv(envname)}; if(str && str[0] != L'\0') return str; - return al::nullopt; + return std::nullopt; } #endif diff --git a/common/strutils.h b/common/strutils.h index 0c7a0e22..7eee0c1d 100644 --- a/common/strutils.h +++ b/common/strutils.h @@ -1,22 +1,22 @@ #ifndef AL_STRUTILS_H #define AL_STRUTILS_H +#include <optional> #include <string> -#include "aloptional.h" - #ifdef _WIN32 +#include <string_view> #include <wchar.h> -std::string wstr_to_utf8(const wchar_t *wstr); -std::wstring utf8_to_wstr(const char *str); +std::string wstr_to_utf8(std::wstring_view wstr); +std::wstring utf8_to_wstr(std::string_view str); #endif namespace al { -al::optional<std::string> getenv(const char *envname); +std::optional<std::string> getenv(const char *envname); #ifdef _WIN32 -al::optional<std::wstring> getenv(const wchar_t *envname); +std::optional<std::wstring> getenv(const wchar_t *envname); #endif } // namespace al diff --git a/common/win_main_utf8.h b/common/win_main_utf8.h index 077af533..81734242 100644 --- a/common/win_main_utf8.h 
+++ b/common/win_main_utf8.h @@ -20,14 +20,20 @@ #define STATIC_CAST(...) static_cast<__VA_ARGS__> #define REINTERPRET_CAST(...) reinterpret_cast<__VA_ARGS__> +#define MAYBE_UNUSED [[maybe_unused]] #else #define STATIC_CAST(...) (__VA_ARGS__) #define REINTERPRET_CAST(...) (__VA_ARGS__) +#ifdef __GNUC__ +#define MAYBE_UNUSED __attribute__((__unused__)) +#else +#define MAYBE_UNUSED +#endif #endif -static FILE *my_fopen(const char *fname, const char *mode) +MAYBE_UNUSED static FILE *my_fopen(const char *fname, const char *mode) { wchar_t *wname=NULL, *wmode=NULL; int namelen, modelen; @@ -44,10 +50,11 @@ static FILE *my_fopen(const char *fname, const char *mode) } #ifdef __cplusplus - auto strbuf = std::make_unique<wchar_t[]>(static_cast<size_t>(namelen)+modelen); + auto strbuf = std::make_unique<wchar_t[]>(static_cast<size_t>(namelen) + + static_cast<size_t>(modelen)); wname = strbuf.get(); #else - wname = (wchar_t*)calloc(sizeof(wchar_t), (size_t)namelen + modelen); + wname = (wchar_t*)calloc(sizeof(wchar_t), (size_t)namelen + (size_t)modelen); #endif wmode = wname + namelen; MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, namelen); |
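
To round off the strutils changes above, here is a small, hedged sketch -- not part of the commit -- showing how the std::optional and std::string_view based helpers fit together on Windows: a wide-character environment variable is fetched with al::getenv and converted to UTF-8 with wstr_to_utf8. The environment variable name is a made-up placeholder.

#ifdef _WIN32
#include <optional>
#include <string>

#include "strutils.h"

std::string example_path_utf8()
{
    /* The variable name is hypothetical; any wide environment variable works. */
    if(std::optional<std::wstring> path{al::getenv(L"OPENALSOFT_EXAMPLE_PATH")})
        return wstr_to_utf8(*path); /* std::wstring converts to std::wstring_view */
    return std::string{};
}
#endif

Because the conversion helpers now pass explicit lengths to WideCharToMultiByte and MultiByteToWideChar instead of -1, the returned strings contain exactly the converted characters and no longer need a trailing null popped off.
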