From 423888b8162f9d78ec4f1b00b8368f3ba22eedd6 Mon Sep 17 00:00:00 2001
From: Chris Robinson <chris.kcat@gmail.com>
Date: Thu, 27 Jan 2022 00:39:36 -0800
Subject: Use precalculated lookup tables to swap FFT elements

Previously the bit-reversal pass went through the whole array, calculating the
bit-reversed index of each element, and did nothing for more than half of them.
---
 common/alcomplex.cpp | 98 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 91 insertions(+), 7 deletions(-)

(limited to 'common/alcomplex.cpp')

diff --git a/common/alcomplex.cpp b/common/alcomplex.cpp
index de10ede2..5cb35f38 100644
--- a/common/alcomplex.cpp
+++ b/common/alcomplex.cpp
@@ -4,6 +4,7 @@
 #include "alcomplex.h"
 
 #include <algorithm>
+#include <cassert>
 #include <cmath>
 #include <cstddef>
 #include <utility>
@@ -11,29 +12,112 @@
 #include "albit.h"
 #include "alnumeric.h"
 #include "math_defs.h"
+#include "opthelpers.h"
 
 
-void complex_fft(const al::span<std::complex<double>> buffer, const double sign)
+namespace {
+
+using ushort = unsigned short;
+using ushort2 = std::pair<ushort,ushort>;
+
+/* Because std::array doesn't have constexpr non-const accessors in C++14. */
+template<typename T, size_t N>
+struct our_array {
+    T mData[N];
+};
+
+constexpr size_t BitReverseCounter(size_t log2_size) noexcept
 {
-    const size_t fftsize{buffer.size()};
-    /* Get the number of bits used for indexing. Simplifies bit-reversal and
-     * the main loop count.
+    /* With n = log2_size, the number of swaps (index < reversed_index) is
+     * 2^(n-1) - 2^((n-1)/2), since bit-palindromic indices map to themselves.
      */
-    const size_t log2_size{static_cast<size_t>(al::countr_zero(fftsize))};
+    return (1u<<(log2_size-1)) - (1u<<((log2_size-1u)/2u));
+}
+
+template<size_t N>
+constexpr auto GetBitReverser() noexcept
+{
+    static_assert(N <= sizeof(ushort)*8, "Too many bits for the bit-reversal table.");
+
+    our_array<ushort2, BitReverseCounter(N)> ret{};
+    const size_t fftsize{1u << N};
+    size_t ret_i{0};
 
     /* Bit-reversal permutation applied to a sequence of fftsize items. */
     for(size_t idx{1u};idx < fftsize-1;++idx)
     {
         size_t revidx{0u}, imask{idx};
-        for(size_t i{0};i < log2_size;++i)
+        for(size_t i{0};i < N;++i)
         {
             revidx = (revidx<<1) | (imask&1);
             imask >>= 1;
         }
 
         if(idx < revidx)
-            std::swap(buffer[idx], buffer[revidx]);
+        {
+            ret.mData[ret_i].first  = static_cast<ushort>(idx);
+            ret.mData[ret_i].second = static_cast<ushort>(revidx);
+            ++ret_i;
+        }
+    }
+    assert(ret_i == al::size(ret.mData));
+    return ret;
+}
+
+/* These bit-reversal swap tables support up to 10-bit indices (1024 elements),
+ * which is the largest used by OpenAL Soft's filters and effects. Larger FFT
+ * requests, used by some utilities where performance is less important, will
+ * use a slower table-less path.
+ */
+constexpr auto BitReverser2 = GetBitReverser<2>();
+constexpr auto BitReverser3 = GetBitReverser<3>();
+constexpr auto BitReverser4 = GetBitReverser<4>();
+constexpr auto BitReverser5 = GetBitReverser<5>();
+constexpr auto BitReverser6 = GetBitReverser<6>();
+constexpr auto BitReverser7 = GetBitReverser<7>();
+constexpr auto BitReverser8 = GetBitReverser<8>();
+constexpr auto BitReverser9 = GetBitReverser<9>();
+constexpr auto BitReverser10 = GetBitReverser<10>();
+constexpr al::span<const ushort2> gBitReverses[11]{
+    {}, {},
+    BitReverser2.mData,
+    BitReverser3.mData,
+    BitReverser4.mData,
+    BitReverser5.mData,
+    BitReverser6.mData,
+    BitReverser7.mData,
+    BitReverser8.mData,
+    BitReverser9.mData,
+    BitReverser10.mData
+};
+
+} // namespace
+
+void complex_fft(const al::span<std::complex<double>> buffer, const double sign)
+{
+    const size_t fftsize{buffer.size()};
+    /* Get the number of bits used for indexing. Simplifies bit-reversal and
+     * the main loop count.
+     */
+    const size_t log2_size{static_cast<size_t>(al::countr_zero(fftsize))};
+
+    if(unlikely(log2_size >= al::size(gBitReverses)))
+    {
+        for(size_t idx{1u};idx < fftsize-1;++idx)
+        {
+            size_t revidx{0u}, imask{idx};
+            for(size_t i{0};i < log2_size;++i)
+            {
+                revidx = (revidx<<1) | (imask&1);
+                imask >>= 1;
+            }
+
+            if(idx < revidx)
+                std::swap(buffer[idx], buffer[revidx]);
+        }
     }
+    else for(auto &rev : gBitReverses[log2_size])
+        std::swap(buffer[rev.first], buffer[rev.second]);
 
     /* Iterative form of Danielson-Lanczos lemma */
     size_t step2{1u};
-- 
cgit v1.2.3